#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>

// Total number of threads started by main: NUM_THREADS-1 prefetching threads
// plus one probe thread.
#define NUM_THREADS 4

// Number of barriers that main initialises and destroys. Only barrier[0] and
// barrier[1] are actually waited on by the code in this file.
#define NUM_BARRIERS 5
pthread_barrier_t barrier[NUM_BARRIERS];

// Pin the calling thread to a single CPU.
//
// cpu_no: index of the CPU the calling thread should be restricted to.
//
// A failure to set the affinity is reported on stderr but is not fatal: the
// thread keeps running with its previous affinity mask.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	// A pid of 0 means "the calling thread", so no gettid syscall is needed.
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

// Argument passed to each thread start routine through pthread_create.
struct thread_arg_t {
	int tid;	// thread index assigned by main, in [0, NUM_THREADS)
};

// Number of unsigned int elements in one per-thread slice of A.
#define ARRAY_SIZE (2*1024*1024)
// Shared test array holding NUM_THREADS slices of ARRAY_SIZE elements each.
// Prefetching thread tid works on slice tid+1; the probe thread reads A[100],
// which lies in slice 0.
volatile unsigned int A[ARRAY_SIZE*NUM_THREADS];
// Timestamp-counter delta recorded by each thread in its most recent round,
// indexed by thread id. main prints only the last entry.
long long elapsed_time[NUM_THREADS];
// Destination of the timed load performed by test_prefetch_queue_last.
volatile unsigned int temp_read;

// Run one timed round of software prefetches over this thread's slice of A.
//
// Steps, in order:
//   1. call _mm_clevict on every 16th element of the slice,
//   2. wait on barrier[0] so all threads start the timed section together,
//   3. read the timestamp counter,
//   4. issue 64 _mm_prefetch calls for every 1024-element step of the slice,
//   5. read the timestamp counter again and wait on barrier[1],
//   6. store the counter delta in elapsed_time[tid].
//
// tid: index of the calling thread; its slice starts at A + (tid+1)*ARRAY_SIZE.
//
// NOTE(review): _mm_clevict and _mm_prefetch are not declared by any header
// included in this file — presumably provided by the compiler or a missing
// intrinsics header; confirm.
void test_prefetch_queue_ti(int tid)
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;

	// Slice tid+1 of A; slice 0 is left to the probe thread.
	volatile unsigned int * A_ti = A + (tid+1) * ARRAY_SIZE;
	int i = 0;
	// Step of 16 elements (64 bytes if unsigned int is 4 bytes) — presumably
	// one cache line per step. The hint values 1 and 2 presumably select the
	// L1 and L2 caches; TODO confirm against the intrinsic's documentation.
	for(i = 0; i < ARRAY_SIZE; i=i+16){
		_mm_clevict((void *)&A_ti[i], 1);		
		_mm_clevict((void *)&A_ti[i], 2);		
	}

	// Line up with the other threads before starting the timed section.
	pthread_barrier_wait(&barrier[0]);
	//read begin timestamp
	//__asm mov rbx,0
	// cpuid is presumably used here as a serializing instruction before rdtsc.
	// The begin timestamp is parked in esi (low half) and edi (high half) and
	// only copied to locals after the timed loop.
	// NOTE(review): this relies on esi/edi surviving the loop below — confirm
	// the compiler guarantees that for this inline-asm dialect.
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Each iteration covers 1024 elements with 64 prefetches spaced 16
	// elements apart.
	for(i = 0; i < ARRAY_SIZE; i=i+1024)
	{
        //each thread starts from A+(tid+1)*ARRAY_SIZE
        A_ti = A + (tid+1)*ARRAY_SIZE + i;

	//unroll a single-page to minimize control overhead
	// The hint argument 1 is passed unchanged to every prefetch; its cache-level
	// meaning depends on the intrinsic header in use — TODO confirm.
	_mm_prefetch((void *)A_ti, 1);
	_mm_prefetch((void *)(A_ti+16), 1);
	_mm_prefetch((void *)(A_ti+16*2), 1);
	_mm_prefetch((void *)(A_ti+16*3), 1);
	_mm_prefetch((void *)(A_ti+16*4), 1);
	_mm_prefetch((void *)(A_ti+16*5), 1);
	_mm_prefetch((void *)(A_ti+16*6), 1);
	_mm_prefetch((void *)(A_ti+16*7), 1);

	_mm_prefetch((void *)(A_ti+16*8), 1);
	_mm_prefetch((void *)(A_ti+16*9), 1);
	_mm_prefetch((void *)(A_ti+16*10), 1);
	_mm_prefetch((void *)(A_ti+16*11), 1);
	_mm_prefetch((void *)(A_ti+16*12), 1);
	_mm_prefetch((void *)(A_ti+16*13), 1);
	_mm_prefetch((void *)(A_ti+16*14), 1);
	_mm_prefetch((void *)(A_ti+16*15), 1);

	_mm_prefetch((void *)(A_ti+16*16), 1);
	_mm_prefetch((void *)(A_ti+16*17), 1);
	_mm_prefetch((void *)(A_ti+16*18), 1);
	_mm_prefetch((void *)(A_ti+16*19), 1);
	_mm_prefetch((void *)(A_ti+16*20), 1);
	_mm_prefetch((void *)(A_ti+16*21), 1);
	_mm_prefetch((void *)(A_ti+16*22), 1);
	_mm_prefetch((void *)(A_ti+16*23), 1);

	_mm_prefetch((void *)(A_ti+16*24), 1);
	_mm_prefetch((void *)(A_ti+16*25), 1);
	_mm_prefetch((void *)(A_ti+16*26), 1);
	_mm_prefetch((void *)(A_ti+16*27), 1);
	_mm_prefetch((void *)(A_ti+16*28), 1);
	_mm_prefetch((void *)(A_ti+16*29), 1);
	_mm_prefetch((void *)(A_ti+16*30), 1);
	_mm_prefetch((void *)(A_ti+16*31), 1);

	_mm_prefetch((void *)(A_ti+16*32), 1);
	_mm_prefetch((void *)(A_ti+16*33), 1);
	_mm_prefetch((void *)(A_ti+16*34), 1);
	_mm_prefetch((void *)(A_ti+16*35), 1);
	_mm_prefetch((void *)(A_ti+16*36), 1);
	_mm_prefetch((void *)(A_ti+16*37), 1);
	_mm_prefetch((void *)(A_ti+16*38), 1);
	_mm_prefetch((void *)(A_ti+16*39), 1);

	_mm_prefetch((void *)(A_ti+16*40), 1);
	_mm_prefetch((void *)(A_ti+16*41), 1);
	_mm_prefetch((void *)(A_ti+16*42), 1);
	_mm_prefetch((void *)(A_ti+16*43), 1);
	_mm_prefetch((void *)(A_ti+16*44), 1);
	_mm_prefetch((void *)(A_ti+16*45), 1);
	_mm_prefetch((void *)(A_ti+16*46), 1);
	_mm_prefetch((void *)(A_ti+16*47), 1);

	_mm_prefetch((void *)(A_ti+16*48), 1);
	_mm_prefetch((void *)(A_ti+16*49), 1);
	_mm_prefetch((void *)(A_ti+16*50), 1);
	_mm_prefetch((void *)(A_ti+16*51), 1);
	_mm_prefetch((void *)(A_ti+16*52), 1);
	_mm_prefetch((void *)(A_ti+16*53), 1);
	_mm_prefetch((void *)(A_ti+16*54), 1);
	_mm_prefetch((void *)(A_ti+16*55), 1);

	_mm_prefetch((void *)(A_ti+16*56), 1);
	_mm_prefetch((void *)(A_ti+16*57), 1);
	_mm_prefetch((void *)(A_ti+16*58), 1);
	_mm_prefetch((void *)(A_ti+16*59), 1);
	_mm_prefetch((void *)(A_ti+16*60), 1);
	_mm_prefetch((void *)(A_ti+16*61), 1);
	_mm_prefetch((void *)(A_ti+16*62), 1);
	_mm_prefetch((void *)(A_ti+16*63), 1);
	}

	//read end timestamp
	//__asm cpuid
	// The end timestamp is in eax/edx straight after rdtsc; the begin timestamp
	// is recovered from esi/edi where it was parked before the loop.
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx
	// Wait until every thread has left its timed section.
	pthread_barrier_wait(&barrier[1]);

	// Rebuild both 64-bit counter values from their 32-bit halves and store the
	// difference for this thread.
	elapsed_time[tid] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	//printf("thread %d time_begin: %llu\t", tid, ((unsigned long long)time_begin_high<<32 | time_begin_low));
	//printf("thread %d time_end: %llu\t", tid, ((unsigned long long)time_end_high<< 32 | time_end_low));
	//printf("thread %d: %llu\n", tid, elapsed_time[tid]);
}

// Run one timed round of the probe thread: time a single load of A[100].
//
// The function waits on barrier[0] together with the prefetching threads,
// reads the timestamp counter, loads A[100] into temp_read, reads the counter
// again, waits on barrier[1], and stores the counter delta in
// elapsed_time[NUM_THREADS-1].
void test_prefetch_queue_last()
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;

	// Start the timed section at the same point as the prefetching threads.
	pthread_barrier_wait(&barrier[0]);
	//read begin timestamp
	//__asm mov rbx,0
	// cpuid is presumably used here as a serializing instruction before rdtsc.
	// The begin timestamp is parked in esi (low half) and edi (high half).
	// NOTE(review): this relies on esi/edi surviving the load below — confirm
	// the compiler guarantees that for this inline-asm dialect.
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// The timed operation: one load from slice 0 of A. Both A and temp_read
	// are volatile, so the access is not optimised away.
	temp_read = A[100];

	//read end timestamp
	//__asm cpuid
	// The end timestamp is in eax/edx straight after rdtsc; the begin timestamp
	// is recovered from esi/edi.
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx
	// Wait until every thread has left its timed section.
	pthread_barrier_wait(&barrier[1]);

	// Rebuild both 64-bit counter values from their 32-bit halves and store the
	// difference in the last slot, which is the one main prints.
	elapsed_time[NUM_THREADS-1] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	//printf("thread %d time_begin: %llu\t", NUM_THREADS-1, ((unsigned long long)time_begin_high<<32 | time_begin_low));
	//printf("thread %d time_end: %llu\t", NUM_THREADS-1, ((unsigned long long)time_end_high<< 32 | time_end_low));
	//printf("thread %d: %llu\n", NUM_THREADS-1, elapsed_time[NUM_THREADS-1]);
}

void* prefetch_thread_ti(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	set_cpu(tid+1);

	test_prefetch_queue_ti(tid);
	test_prefetch_queue_ti(tid);
	test_prefetch_queue_ti(tid);
	test_prefetch_queue_ti(tid);
	test_prefetch_queue_ti(tid);
}

void* prefetch_thread_last(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	set_cpu(tid+1);

	test_prefetch_queue_last();
	test_prefetch_queue_last();
	test_prefetch_queue_last();
	test_prefetch_queue_last();
	test_prefetch_queue_last();
}

// Benchmark driver.
//
// Touches A, starts NUM_THREADS-1 prefetching threads and one probe thread,
// waits for all of them, and prints the probe thread's last measured
// timestamp-counter delta followed by a tab (no newline).
//
// Returns 0 on success, 1 if a barrier or a thread could not be created.
int main()
{
	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];
	int i;
	int ti;
	int rc;

	// Write one element per 1024-element step so that the array's pages are
	// physically allocated before the timed sections, avoiding page faults
	// there.
	for(i = 0; i < ARRAY_SIZE*NUM_THREADS; i=i+1024)
		A[i] = i;

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		thread_arg[ti].tid = ti;
	}

	// Every barrier is sized for all NUM_THREADS worker threads.
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		rc = pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
		if (rc != 0)
		{
			fprintf(stderr, "pthread_barrier_init(%d) failed: error %d\n", ti, rc);
			return 1;
		}
	}

	// Threads 0 .. NUM_THREADS-2 prefetch; the last thread is the probe.
	// If any creation fails, the threads already started would wait on the
	// barrier forever, so leave main (which terminates the process) instead of
	// joining them.
	for(ti = 0; ti < NUM_THREADS-1; ++ti)
	{
		rc = pthread_create(&thread_id[ti], NULL, prefetch_thread_ti, (void *)&thread_arg[ti]);
		if (rc != 0)
		{
			fprintf(stderr, "pthread_create(%d) failed: error %d\n", ti, rc);
			return 1;
		}
	}
	rc = pthread_create(&thread_id[NUM_THREADS-1], NULL, prefetch_thread_last, (void *)&thread_arg[NUM_THREADS-1]);
	if (rc != 0)
	{
		fprintf(stderr, "pthread_create(%d) failed: error %d\n", NUM_THREADS-1, rc);
		return 1;
	}

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		pthread_join(thread_id[ti], NULL);
	}
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		pthread_barrier_destroy(&barrier[ti]);
	}

	// Only the probe thread's result is reported; lower the start index to
	// print the prefetching threads as well.
	for(ti = NUM_THREADS-1; ti < NUM_THREADS; ++ti)
	{
		// elapsed_time is long long; cast so the argument matches %llu.
		printf("%llu\t", (unsigned long long)elapsed_time[ti]);
	}

	return 0;
}
