#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>

// Number of cores used by the experiment. main() starts one thread per core
// (NUM_THREADS) and each thread pins itself to its own logical CPU.
#define NUM_CORES 9
#define NUM_THREADS NUM_CORES 

// Size of the barrier array below. The test functions in this file wait only
// on barrier[0]..barrier[3]; barrier[4] is initialized and destroyed by main()
// but is not waited on anywhere in the code visible here.
#define NUM_BARRIERS 5
// Barriers shared by all threads; main() initializes each with a count of
// NUM_THREADS, so every thread must reach a barrier before any may pass it.
pthread_barrier_t barrier[NUM_BARRIERS];

/*
 * Pin the calling thread to a single logical CPU.
 *
 * cpu_no: index of the logical CPU to run on; must be in [0, CPU_SETSIZE).
 *
 * The function has no return value, so failures (an out-of-range index, or
 * the kernel rejecting the mask, e.g. because that CPU does not exist or is
 * not permitted) are reported on stderr and the thread's affinity is left
 * unchanged.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	if (cpu_no < 0 || cpu_no >= CPU_SETSIZE)
	{
		fprintf(stderr, "set_cpu: cpu %d is outside [0, %d)\n", cpu_no, (int)CPU_SETSIZE);
		return;
	}

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 makes sched_setaffinity() act on the calling thread. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
	{
		perror("set_cpu: sched_setaffinity");
	}
}

// Argument passed to each thread's start routine through pthread_create().
struct thread_arg_t {
	// Zero-based thread index assigned by main(); the thread functions use it
	// to pick the logical CPU they pin themselves to (tid*4+1).
	int tid;
};

// Shared test buffer. A[100] is the element every thread accesses during the
// measurement; A[0] and A[1023] are written by each thread at start-up.
volatile unsigned int A[1024];
// rdtsc deltas written by test_prefetch_queue_last():
//   [0] timed access to A[100] with no prefetch inside the timed region,
//   [1] timed access to A[100] preceded by _mm_prefetch inside the timed region.
// Both entries are overwritten on every call, so main() prints the values
// from the final call only.
long long elapsed_time[2];

// Companion-thread side of one measurement round, run by every thread except
// the timing thread. It reads A[100] twice; each read is followed by two
// barrier waits that pair up with the barrier waits in
// test_prefetch_queue_last(). The tid parameter is not used.
// NOTE(review): `__asm cpuid` is MS-style inline-assembly syntax, so this
// presumably targets a compiler that accepts it (e.g. Intel's) -- confirm.
void test_prefetch_queue_ti(int tid)
{
	// Read A[100] into eax, presumably so this core holds the cache line
	// before the timing thread accesses it; cpuid follows, presumably as a
	// serializing instruction so the load has completed.
	// NOTE(review): eax is written without a clobber list -- verify the
	// compiler keeps nothing live in eax across this statement.
	asm("movl %0, %%eax" : : "m"(A[100]));
	__asm cpuid

	// barrier[0]: all threads have finished their setup for the first
	// timed access.
	pthread_barrier_wait(&barrier[0]);

	// barrier[1]: reached after the timing thread has stored elapsed_time[0].
	pthread_barrier_wait(&barrier[1]);

	// Read A[100] again ahead of the second timed access.
	asm("movl %0, %%eax" : : "m"(A[100]));
	__asm cpuid

	// barrier[2]: all threads have finished their setup for the second
	// timed access.
	pthread_barrier_wait(&barrier[2]);

	// barrier[3]: reached after the timing thread has stored elapsed_time[1].
	pthread_barrier_wait(&barrier[3]);
}

// Timing-thread side of one measurement round. Performs two timed
// load/increment/store sequences on A[100] and stores the rdtsc deltas in
// elapsed_time[0] (no prefetch inside the timed region) and elapsed_time[1]
// (with _mm_prefetch(&A[100], 5) issued inside the timed region, right before
// the access). Before each timed access the line is evicted with _mm_clevict
// and the thread meets the companion threads at a barrier.
// NOTE(review): the begin timestamp is parked in esi/edi and the access goes
// through ecx across separate asm statements with no clobber lists; this
// relies on the compiler leaving those registers alone in between (including
// around the _mm_prefetch call) -- verify the generated code.
// NOTE(review): no header declaring _mm_clevict/_mm_prefetch is included in
// the part of the file visible here -- confirm how they are declared.
void test_prefetch_queue_last()
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;

	// Evict the line holding A[100] with hints 1 and 2 (presumably the L1 and
	// L2 cache levels -- TODO confirm against the intrinsic's documentation),
	// then execute cpuid, presumably to serialize before the barrier.
	_mm_clevict((void *)&A[100], 1);
	_mm_clevict((void *)&A[100], 2);
	__asm cpuid	

	// barrier[0]: wait until every companion thread has read A[100].
	pthread_barrier_wait(&barrier[0]);

	// (disabled) prefetch ahead of the timed region
	//_mm_prefetch((void *)&A[100], 1);

	//read begin timestamp: cpuid, then rdtsc; low/high halves are kept in
	//esi/edi until after the end timestamp has been taken
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Timed access #1: load A[100] into ecx, add 1, store it back.
	asm("movl %0, %%ecx" : : "m"(A[100]));
	asm("addl $1, %ecx");
	asm("movl %%ecx, %0" : "=m"(A[100]));

	//read end timestamp (the cpuid before this rdtsc is disabled)
	//__asm cpuid
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx

	// Combine the 32-bit halves into 64-bit timestamps and store end - begin.
	elapsed_time[0] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	// barrier[1]: first timed access is finished.
	pthread_barrier_wait(&barrier[1]);

	// Evict the line again before the second timed access.
	_mm_clevict((void *)&A[100], 1);
	_mm_clevict((void *)&A[100], 2);
	__asm cpuid	

	// barrier[2]: wait until every companion thread has read A[100] again.
	pthread_barrier_wait(&barrier[2]);

	// (disabled) prefetch ahead of the timed region
	//_mm_prefetch((void *)&A[100], 5);

	//read begin timestamp
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Prefetch issued inside the timed region, immediately before the access.
	// NOTE(review): the meaning of hint value 5 is target-specific -- confirm
	// which prefetch variant it selects.
	_mm_prefetch((void *)&A[100], 5);

	// Timed access #2: same load/increment/store sequence as above.
	asm("movl %0, %%ecx" : : "m"(A[100]));
	asm("addl $1, %ecx");
	asm("movl %%ecx, %0" : "=m"(A[100]));

	//read end timestamp (the cpuid before this rdtsc is disabled)
	//__asm cpuid
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx

	// Store end - begin for the prefetched access.
	elapsed_time[1] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	// barrier[3]: second timed access is finished; the round is complete.
	pthread_barrier_wait(&barrier[3]);
}

/*
 * Start routine for the companion threads (every thread except the timing
 * thread).
 *
 * thread_arg: pointer to this thread's struct thread_arg_t; only tid is read.
 *
 * Pins the thread to logical CPU tid*4+1 (presumably one hardware thread on
 * each core of a 4-way multithreaded part -- confirm against the target's CPU
 * numbering), touches the shared buffer, and then runs the companion side of
 * the measurement five times. The number of rounds must stay equal to the
 * number of rounds in prefetch_thread_last(), because every round meets the
 * other threads at the shared barriers.
 *
 * Returns NULL; the thread produces no result of its own.
 */
void* prefetch_thread_ti(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	int round;

	set_cpu(tid*4+1);

	//initialize the page to avoid page fault and ensure physical page allocation
	A[0] = 0;
	A[1023] = 1023;

	for(round = 0; round < 5; ++round)
	{
		test_prefetch_queue_ti(tid);
	}

	// A start routine declared to return void* must return a value; pthreads
	// records it as the thread's exit status.
	return NULL;
}

/*
 * Start routine for the timing thread (the last thread created by main()).
 *
 * thread_arg: pointer to this thread's struct thread_arg_t; only tid is read.
 *
 * Pins the thread to logical CPU tid*4+1 (presumably one hardware thread on
 * each core of a 4-way multithreaded part -- confirm against the target's CPU
 * numbering), touches the shared buffer, and then runs the timed side of the
 * measurement five times. elapsed_time[] is overwritten by every round, so
 * only the final round's values survive. The number of rounds must stay equal
 * to the number of rounds in prefetch_thread_ti(), because every round meets
 * the other threads at the shared barriers.
 *
 * Returns NULL; results are delivered through the global elapsed_time[].
 */
void* prefetch_thread_last(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	int round;

	set_cpu(tid*4+1);

	//initialize the page to avoid page fault and ensure physical page allocation
	A[0] = 0;
	A[1023] = 1023;

	for(round = 0; round < 5; ++round)
	{
		test_prefetch_queue_last();
	}

	// A start routine declared to return void* must return a value; pthreads
	// records it as the thread's exit status.
	return NULL;
}

/*
 * Runs the experiment: initializes the shared barriers, starts
 * NUM_THREADS-1 companion threads plus one timing thread, waits for all of
 * them, and prints the two rdtsc deltas from elapsed_time[] separated by tabs
 * and followed by a newline.
 *
 * Returns 0 on success, 1 if a barrier or thread could not be created.
 */
int main(void)
{
	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];
	int ti;
	int rc;

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		thread_arg[ti].tid = ti;
	}

	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		rc = pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
		if(rc != 0)
		{
			fprintf(stderr, "pthread_barrier_init failed for barrier %d (error %d)\n", ti, rc);
			return 1;
		}
	}

	// The first NUM_THREADS-1 threads are companions; the last one does the timing.
	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		void *(*entry)(void *) = (ti == NUM_THREADS-1) ? prefetch_thread_last : prefetch_thread_ti;

		rc = pthread_create(&thread_id[ti], NULL, entry, (void *)&thread_arg[ti]);
		if(rc != 0)
		{
			// Every barrier expects NUM_THREADS participants, so the threads
			// already started could never get past their first barrier.
			// Returning from main terminates the whole process, them included.
			fprintf(stderr, "pthread_create failed for thread %d (error %d)\n", ti, rc);
			return 1;
		}
	}

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		rc = pthread_join(thread_id[ti], NULL);
		if(rc != 0)
		{
			fprintf(stderr, "pthread_join failed for thread %d (error %d)\n", ti, rc);
		}
	}

	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		pthread_barrier_destroy(&barrier[ti]);
	}

	for(ti = 0; ti < 2; ++ti)
	{
		// elapsed_time is long long; cast so the argument matches %llu.
		printf("%llu\t", (unsigned long long)elapsed_time[ti]);
	}
	printf("\n");

	return 0;
}
