#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>

// Number of worker threads; also used as the participant count of every barrier.
#define NUM_THREADS 2 
// Factor used to derive the logical CPU for thread 1 (CORE_ID*4+1).
#define CORE_ID 2

// Barriers shared by the worker threads. All NUM_BARRIERS entries are
// initialised and destroyed in main(); the code in this file only waits on
// barrier[0], barrier[1] and barrier[2].
#define NUM_BARRIERS 5
pthread_barrier_t barrier[NUM_BARRIERS];

// Pin the calling thread to a single logical CPU.
//
// cpu_no: index of the logical CPU the calling thread should run on.
//
// The thread is identified by its kernel thread id (SYS_gettid), so only the
// caller is affected, not the whole process. Pinning is best effort: if the
// kernel rejects the request (e.g. the CPU does not exist or is not allowed),
// the error is reported on stderr and the thread keeps its previous affinity.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	if (sched_setaffinity((pid_t)syscall(SYS_gettid), sizeof(mask), &mask) != 0) {
		// A silently unpinned thread would invalidate the measurement,
		// so make the failure visible.
		perror("sched_setaffinity");
	}
}

// Argument block handed to each worker thread through pthread_create().
struct thread_arg_t {
	int tid;	// thread index, assigned in main() (0 .. NUM_THREADS-1)
};

// Buffer shared by both worker threads: they evict, load and store selected
// elements of it. Declared volatile so accesses made from C are not optimised away.
volatile unsigned int A[1024*2];
//unsigned int time_begin_high[NUM_THREADS], time_begin_low[NUM_THREADS], time_end_high[NUM_THREADS], time_end_low[NUM_THREADS];
// Measured timestamp-counter difference per thread. Only index 1 is written
// (by test_prefetch_queue1); main() prints the entries starting at index 1.
long long elapsed_time[NUM_THREADS];
// Not referenced anywhere in the code visible in this file.
volatile unsigned int temp_read;

void test_prefetch_queue0()
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;
		
	_mm_clevict((void *)&A[160], 1);
	_mm_clevict((void *)&A[176], 1);
	_mm_clevict((void *)&A[192], 1);
	_mm_clevict((void *)&A[160], 2);
	_mm_clevict((void *)&A[176], 2);	
	_mm_clevict((void *)&A[192], 2);
	pthread_barrier_wait(&barrier[0]);

	//waiting fore core 1 to first load the data
	pthread_barrier_wait(&barrier[1]);

	asm("movl %%eax, %0" : "=m"(A[160]));
	asm("movl %%eax, %0" : "=m"(A[176]));
	asm("movl %%eax, %0" : "=m"(A[192]));

	pthread_barrier_wait(&barrier[2]);
}

// Measuring side of one experiment round; called by prefetch_thread_1 and
// paired with test_prefetch_queue0() through barrier[0..2].
//
// Sequence:
//   1. Evict the line holding A[208] with hint values 1 and 2 (presumably the
//      L1 and L2 levels -- confirm against the _mm_clevict reference).
//   2. After barrier[0], load A[208] once, then signal barrier[1].
//   3. Wait on barrier[2] while the peer thread stores to A[160], A[176], A[192].
//   4. Time a second load of A[208] with rdtsc and store the counter
//      difference in elapsed_time[1].
//
// NOTE(review): the two GNU-style asm loads overwrite eax without listing it
// as a clobber, and the begin timestamp is kept in esi/edi across a separate
// asm statement; nothing visible here tells the compiler to preserve those
// registers. Confirm that the toolchain in use handles this as intended.
void test_prefetch_queue1()
{
        unsigned int time_begin_high, time_begin_low, time_end_high, time_end_low;

	_mm_clevict((void *)&A[208], 1);
	_mm_clevict((void *)&A[208], 2);
	pthread_barrier_wait(&barrier[0]);

	// first (untimed) load of A[208]
	asm("movl %0, %%eax" : : "m"(A[208]));

	pthread_barrier_wait(&barrier[1]);

	// wait until the peer thread has issued its stores, which are meant to
	// trigger the hardware prefetcher on its core
	pthread_barrier_wait(&barrier[2]);

	// read begin timestamp: cpuid presumably acts as a serialising
	// instruction before rdtsc; the low/high halves are parked in esi/edi
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// the load being timed
	asm("movl %0, %%eax" : : "m"(A[208]));

	// read end timestamp, then copy both timestamps into the locals
	//__asm cpuid
	__asm rdtsc
	__asm mov time_begin_low,esi
	__asm mov time_begin_high,edi
	__asm mov time_end_low,eax
	__asm mov time_end_high,edx

	// combine the 32-bit halves into 64-bit counts and store end - begin
	elapsed_time[1] = ((unsigned long long)time_end_high<< 32 | time_end_low) - ((unsigned long long)time_begin_high<<32 | time_begin_low);

	//printf("thread 1 time_begin: %llu\t",((unsigned long long)time_begin_high<<32 | time_begin_low));
	//printf("thread 1 time_end: %llu\t",((unsigned long long)time_end_high<< 32 | time_end_low));
	//printf("thread 1: %llu\n",elapsed_time[1]);
}

// Entry point of worker thread 0 (the storing side of the experiment).
//
// thread_arg: pointer to this thread's struct thread_arg_t; not needed here.
// Returns NULL.
//
// Pins itself to logical CPU 1 (0*4+1), writes one element in each half of A,
// then runs test_prefetch_queue0() five times. The round count must match the
// number of rounds the peer thread runs, otherwise one side blocks forever on
// a barrier.
void* prefetch_thread_0(void* thread_arg)
{
	enum { NUM_ROUNDS = 5 };
	int round;

	(void)thread_arg;
	set_cpu(0*4+1);

	// Touch both halves of the buffer before measuring (presumably to have
	// the pages mapped in advance -- confirm intent).
	A[0] = 0;
	A[1024] = 1024;

	for (round = 0; round < NUM_ROUNDS; ++round) {
		test_prefetch_queue0();
	}

	return NULL;
}

// Entry point of worker thread 1 (the measuring side of the experiment).
//
// thread_arg: pointer to this thread's struct thread_arg_t; not needed here.
// Returns NULL.
//
// Pins itself to logical CPU CORE_ID*4+1, writes one element in each half of
// A, then runs test_prefetch_queue1() five times. Each round overwrites
// elapsed_time[1], so only the last round's measurement survives. The round
// count must match the number of rounds the peer thread runs, otherwise one
// side blocks forever on a barrier.
void* prefetch_thread_1(void* thread_arg)
{
	enum { NUM_ROUNDS = 5 };
	int round;

	(void)thread_arg;
	set_cpu(CORE_ID*4+1);

	// Touch both halves of the buffer before measuring (presumably to have
	// the pages mapped in advance -- confirm intent).
	A[0] = 0;
	A[1024] = 1024;

	for (round = 0; round < NUM_ROUNDS; ++round) {
		test_prefetch_queue1();
	}

	return NULL;
}

// Program entry point: sets up the barriers, runs the two worker threads to
// completion and prints the measured counter difference(s).
//
// Output: elapsed_time[1..NUM_THREADS-1], each followed by a tab and without
// a trailing newline (thread 0 does not record a measurement).
// Returns 0 on success, 1 if a barrier or a thread could not be created.
int main(void)
{
	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];
	int ti;
	int rc;

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		thread_arg[ti].tid = ti;
	}
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		rc = pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
		if (rc != 0)
		{
			fprintf(stderr, "pthread_barrier_init(%d) failed: error %d\n", ti, rc);
			return 1;
		}
	}

	// Both workers rendezvous on the barriers, so a missing thread would
	// leave the other one blocked forever; bail out instead of joining.
	rc = pthread_create(&thread_id[0], NULL, prefetch_thread_0, (void *)&thread_arg[0]);
	if (rc != 0)
	{
		fprintf(stderr, "pthread_create(0) failed: error %d\n", rc);
		return 1;
	}
	rc = pthread_create(&thread_id[1], NULL, prefetch_thread_1, (void *)&thread_arg[1]);
	if (rc != 0)
	{
		fprintf(stderr, "pthread_create(1) failed: error %d\n", rc);
		return 1;
	}

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		rc = pthread_join(thread_id[ti], NULL);
		if (rc != 0)
		{
			fprintf(stderr, "pthread_join(%d) failed: error %d\n", ti, rc);
		}
	}
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		pthread_barrier_destroy(&barrier[ti]);
	}

	for(ti = 1; ti < NUM_THREADS; ++ti)
	{
		// elapsed_time is long long; cast so the argument matches %llu.
		printf("%llu\t", (unsigned long long)elapsed_time[ti]);
	}

	return 0;
}
