#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>

/* Number of threads main() creates; also the participant count passed to
 * pthread_barrier_init() for every barrier. */
#define NUM_THREADS 4

/* Number of barrier objects initialised and destroyed by main(). Only
 * barrier[0] and barrier[1] are waited on in the visible code. */
#define NUM_BARRIERS 5
pthread_barrier_t barrier[NUM_BARRIERS];

/*
 * Pin the calling thread to a single logical CPU.
 *
 * cpu_no: index of the CPU the calling thread should be restricted to.
 *
 * The affinity mask is applied to the kernel thread id of the caller
 * (obtained with the gettid system call), so only this thread is affected.
 * A failure is reported on stderr instead of being silently ignored: the
 * thread keeps running with its previous affinity, which would otherwise
 * go unnoticed and skew the timing results.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;
	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);
	if (sched_setaffinity((pid_t)syscall(SYS_gettid), sizeof(cpu_set_t), &mask) != 0) {
		/* Best effort: warn and continue unpinned rather than abort. */
		perror("sched_setaffinity");
	}
}

/* Argument block passed (as void *) to the pthread entry points. */
struct thread_arg_t {
	int tid;	/* thread index; main() assigns 0 .. NUM_THREADS-1 */
};

/* Number of unsigned int elements in one per-thread region of A. */
#define ARRAY_SIZE (1*1024)
/* Shared buffer made of NUM_THREADS consecutive regions of ARRAY_SIZE
 * elements. test_prefetch_queue_last() reads the start of region 0;
 * test_prefetch_queue_ti(tid) works on region tid+1. */
volatile unsigned int A[ARRAY_SIZE*NUM_THREADS];
/* rdtsc tick delta of each thread's most recent timed run, indexed by
 * thread index (the "last" thread uses slot NUM_THREADS-1). */
long long elapsed_time[NUM_THREADS];
/* Not referenced anywhere in the visible code. */
volatile unsigned int temp_read;

/*
 * One timed run for a thread that streams over its own region of A.
 *
 * tid: thread index; the region used is A[(tid+1)*ARRAY_SIZE ..]. The caller
 *      must ensure tid+1 < NUM_THREADS, otherwise the accesses fall outside A.
 *
 * Sequence: evict the region, issue prefetches for it, wait on barrier[0],
 * take a begin timestamp, perform 1024 passes of 64 loads spaced 16 elements
 * apart, take an end timestamp, wait on barrier[1], and store the tick delta
 * in elapsed_time[tid].
 *
 * NOTE(review): the code mixes MS-style `__asm` statements with GNU-style
 * `asm volatile` and uses _mm_clevict/_mm_prefetch without a visible intrinsic
 * header — presumably this targets Intel's compiler; confirm the toolchain.
 */
void test_prefetch_queue_ti(int tid)
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;

	volatile unsigned int * A_ti = A + (tid+1) * ARRAY_SIZE;
	int i = 0, j = 0;
	//Evict every 16th element of the region (16 unsigned ints; presumably one
	//64-byte cache line) using hint values 1 and 2. The cache level each hint
	//selects depends on the compiler's intrinsic definitions — TODO confirm.
	for(i = 0; i < ARRAY_SIZE; i=i+16){
		_mm_clevict((void *)&A_ti[i], 1);		
		_mm_clevict((void *)&A_ti[i], 2);		
	}
	//presumably used as a serializing instruction so the evictions complete
	//before the prefetches are issued
	__asm cpuid

	//Issue a prefetch (hint value 1) for every 16th element of the region.
	for(i = 0; i < ARRAY_SIZE; i=i+16){
		_mm_prefetch((void *)&A_ti[i], 1);
	}

	//All NUM_THREADS threads start their timed sections together.
	pthread_barrier_wait(&barrier[0]);
	//read begin timestamp
	//__asm mov rbx,0
	//The begin timestamp is parked in esi (low half) and edi (high half) and is
	//only copied into the C variables after the timed section.
	//NOTE(review): this relies on the compiler not using esi/edi in between.
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	//Timed section: 1024 passes. With ARRAY_SIZE == 1024 the inner loop body
	//runs exactly once per pass (i == 0).
	for(j = 0; j < 1024; j++){
	  for(i = 0; i < ARRAY_SIZE; i=i+1024)
	  {
	    //each thread starts from A+(tid+1)*ARRAY_SIZE; region 0 is the one read
	    //by test_prefetch_queue_last()
	    A_ti = A + (tid+1)*ARRAY_SIZE + i;

	    //unroll a single-page to minimize control overhead
	    //64 loads into r8d at offsets 0, 16, ..., 16*63 elements; the loaded
	    //value is discarded, only the memory access matters.
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[0]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*2]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*3]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*4]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*5]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*6]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*7]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*8]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*9]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*10]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*11]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*12]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*13]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*14]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*15]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*16]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*17]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*18]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*19]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*20]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*21]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*22]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*23]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*24]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*25]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*26]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*27]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*28]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*29]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*30]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*31]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*32]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*33]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*34]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*35]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*36]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*37]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*38]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*39]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*40]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*41]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*42]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*43]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*44]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*45]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*46]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*47]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*48]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*49]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*50]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*51]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*52]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*53]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*54]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*55]) : "r8d");

	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*56]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*57]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*58]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*59]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*60]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*61]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*62]) : "r8d");
	    asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*63]) : "r8d");
	  }
	}

	//read end timestamp	
	//__asm cpuid
	//No cpuid precedes this rdtsc (it is commented out above). The begin
	//timestamp saved in esi/edi and the fresh end timestamp in eax/edx are
	//copied into the C variables here.
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx
	//Wait until every thread has finished its timed section.
	pthread_barrier_wait(&barrier[1]);

	//Combine the 32-bit halves into 64-bit timestamps and store end - begin.
	elapsed_time[tid] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	//printf("thread %d time_begin: %llu\t", tid, ((unsigned long long)time_begin_high<<32 | time_begin_low));
	//printf("thread %d time_end: %llu\t", tid, ((unsigned long long)time_end_high<< 32 | time_end_low));
	//printf("thread %d: %llu\n", tid, elapsed_time[tid]);
}

/*
 * One timed run for the thread whose result main() prints.
 *
 * Works on the first 8 strided elements of A (offsets 0, 16, ..., 16*7):
 * evicts them, issues prefetches for them, waits on barrier[0], takes a begin
 * timestamp, performs 8 loads, takes an end timestamp, waits on barrier[1],
 * and stores the tick delta in elapsed_time[NUM_THREADS-1].
 *
 * NOTE(review): as in test_prefetch_queue_ti(), the begin timestamp lives in
 * esi/edi across the timed loads, and the meaning of the hint values 1 and 2
 * depends on the compiler's intrinsic definitions — TODO confirm both.
 */
void test_prefetch_queue_last()
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;
	int i = 0;
	//Evict the 8 lines that will be loaded below (stride of 16 unsigned ints).
	for(i = 0; i < 16*8; i=i+16){
		_mm_clevict((void *)&A[i], 1);		
		_mm_clevict((void *)&A[i], 2);		
	}
	//presumably used as a serializing instruction between eviction and prefetch
	__asm cpuid

	//Issue a prefetch (hint value 1) for each of the 8 lines.
	for(i = 0; i < 16*8; i=i+16){
		_mm_prefetch((void *)&A[i], 1);
	}

	//All NUM_THREADS threads start their timed sections together.
	pthread_barrier_wait(&barrier[0]);
	//read begin timestamp
	//__asm mov rbx,0
	//Begin timestamp is kept in esi (low) / edi (high) until after the loads.
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	//Timed section: 8 loads into r8d; the loaded values are discarded.
	asm volatile("movl %0, %%r8d" : : "m"(A[0]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16*2]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16*3]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16*4]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16*5]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16*6]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A[16*7]) : "r8d");

	//read end timestamp	
	//__asm cpuid
	//No cpuid precedes this rdtsc (it is commented out above).
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx
	//Wait until every thread has finished its timed section.
	pthread_barrier_wait(&barrier[1]);

	//Combine the 32-bit halves into 64-bit timestamps and store end - begin.
	elapsed_time[NUM_THREADS-1] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	//printf("thread %d time_begin: %llu\t", NUM_THREADS-1, ((unsigned long long)time_begin_high<<32 | time_begin_low));
	//printf("thread %d time_end: %llu\t", NUM_THREADS-1, ((unsigned long long)time_end_high<< 32 | time_end_low));
	//printf("thread %d: %llu\n", NUM_THREADS-1, elapsed_time[NUM_THREADS-1]);
}

/*
 * pthread entry point for a thread that runs test_prefetch_queue_ti().
 *
 * thread_arg: pointer to a struct thread_arg_t; its tid selects both the CPU
 *             the thread is pinned to (tid+1) and the region of A it uses.
 *
 * Returns NULL. The function previously fell off the end without a return
 * statement even though the pthread runtime collects the start routine's
 * return value.
 *
 * The run is repeated five times. Every run waits on barrier[0] and
 * barrier[1], so all threads taking part in those barriers must perform the
 * same number of runs.
 */
void* prefetch_thread_ti(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	int run;

	set_cpu(tid+1);

	for (run = 0; run < 5; ++run)
		test_prefetch_queue_ti(tid);

	return NULL;
}

/*
 * pthread entry point for the thread that runs test_prefetch_queue_last().
 *
 * thread_arg: pointer to a struct thread_arg_t; its tid selects the CPU the
 *             thread is pinned to (tid+1).
 *
 * Returns NULL. The function previously fell off the end without a return
 * statement even though the pthread runtime collects the start routine's
 * return value.
 *
 * The run is repeated five times. Every run waits on barrier[0] and
 * barrier[1], so all threads taking part in those barriers must perform the
 * same number of runs.
 */
void* prefetch_thread_last(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	int run;

	set_cpu(tid+1);

	for (run = 0; run < 5; ++run)
		test_prefetch_queue_last();

	return NULL;
}

/*
 * Benchmark driver.
 *
 * Touches the shared array, initialises the barriers, starts NUM_THREADS-1
 * threads running prefetch_thread_ti plus one thread running
 * prefetch_thread_last, joins them all, and prints the tick count recorded by
 * the last thread followed by a tab (no newline).
 *
 * Returns 0 on success, 1 if a barrier or a thread could not be created.
 * Returning from main ends the process, so threads that were already started
 * and are blocked on a barrier do not keep it alive.
 */
int main(void)
{
        //initialize the array first to 1) allocate physical page;
        //2) to avoid page fault
	int i;
	for(i = 0; i < ARRAY_SIZE*NUM_THREADS; i=i+1024)
		A[i] = i;

	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];
	int ti;
	int rc;

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		thread_arg[ti].tid = ti;
	}

	/* Every barrier is shared by all NUM_THREADS worker threads. */
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		rc = pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
		if(rc != 0)
		{
			fprintf(stderr, "pthread_barrier_init(%d) failed: error %d\n", ti, rc);
			return 1;
		}
	}

	/* The last slot runs the measured routine, all others the streaming one.
	 * A failed create must stop the run: joining a thread id that was never
	 * filled in is undefined, and the remaining threads would block forever
	 * on barriers that expect NUM_THREADS participants. */
	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		void *(*entry)(void *) =
			(ti == NUM_THREADS-1) ? prefetch_thread_last : prefetch_thread_ti;

		rc = pthread_create(&thread_id[ti], NULL, entry, (void *)&thread_arg[ti]);
		if(rc != 0)
		{
			fprintf(stderr, "pthread_create(%d) failed: error %d\n", ti, rc);
			return 1;
		}
	}

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		pthread_join(thread_id[ti], NULL);
	}
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		pthread_barrier_destroy(&barrier[ti]);
	}

	/* Only the last thread's result is reported. elapsed_time is long long,
	 * so convert explicitly to match the %llu conversion. */
	for(ti = NUM_THREADS-1; ti < NUM_THREADS; ++ti)
	{
		//printf("Thread %d: %llu\n", ti, elapsed_time[ti]);
		printf("%llu\t", (unsigned long long)elapsed_time[ti]);
	}
	//printf("\n");

	return 0;
}
