#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

// Number of unsigned int elements in the probe buffer A.
// Parenthesized so the macro behaves as a single value inside any expression
// (e.g. `x % ARRAY_SIZE`).
#define ARRAY_SIZE (1024*16)
// Multiplier for the far-away accesses: they are placed 256*DISTANCE elements
// past their near counterpart.
#define DISTANCE 16

// Pin the calling thread to a single logical CPU.
//
// cpu_no: zero-based index of the CPU the thread should run on.
//
// Pinning is best effort: if the kernel rejects the request (for example the
// CPU does not exist or is not available to this process), the reason is
// reported on stderr and the thread keeps its previous affinity.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	// A pid of 0 addresses the calling thread, so no gettid syscall is needed.
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

// Base of the probe buffer that the measurement code evicts and reloads.
// volatile so that the compiler does not drop or merge accesses to it.
volatile unsigned int *A;
// Low/high 32-bit halves of the start (..1) and end (..2) time-stamp readings.
unsigned int time_l1,time_h1,time_l2,time_h2;
// sum and temp are not referenced in the visible part of this file.
long long sum;
unsigned int temp;
// Tick deltas of the two timed load sequences: elapsed[0] and elapsed[1].
// Both slots are written and read, so the array must hold two entries
// (it was previously declared with zero length).
long long elapsed[2];

// Runs one round of the measurement and stores two tick counts:
//   elapsed[0] - time for eight loads from A at element indices
//                16*{5,6,8,11,15,20,26,33} (the gap grows by one 16-element
//                step each time);
//   elapsed[1] - time for eight loads that alternate between a "near" index
//                16*k and a "far" index 16*k + 256*DISTANCE.
// Before timing, every address that will be loaded is passed to _mm_clevict
// twice, once with second argument 1 and once with 2.
// NOTE(review): presumably these select eviction from L1 and then L2 --
// confirm against the intrinsic's documentation. _mm_clevict does not appear
// to be declared by any header included in the visible part of this file.
// NOTE(review): with 4-byte unsigned int, 16 elements span 64 bytes, so each
// 16*k index would start a new 64-byte cache line -- confirm for the target.
// NOTE(review): writes elapsed[0] and elapsed[1], so `elapsed` needs room for
// two entries, and A must already point at a buffer covering every index used.
// NOTE(review): the start time stamp is parked in esi/edi and the loads
// overwrite ebx; the GNU-style asm statements declare no clobbers, so this
// relies on the compiler not using those registers in between.
void test_prefetch_queue()
{
	// Evict the eight addresses of the first sequence (second argument 1).
        _mm_clevict((void *)&A[16*5], 1);
        _mm_clevict((void *)&A[16*6], 1);
        _mm_clevict((void *)&A[16*8], 1);
        _mm_clevict((void *)&A[16*11], 1);
        _mm_clevict((void *)&A[16*15], 1);
        _mm_clevict((void *)&A[16*20], 1);
        _mm_clevict((void *)&A[16*26], 1);
        _mm_clevict((void *)&A[16*33], 1);

	// Same eight addresses again, second argument 2.
        _mm_clevict((void *)&A[16*5], 2);
        _mm_clevict((void *)&A[16*6], 2);
        _mm_clevict((void *)&A[16*8], 2);
        _mm_clevict((void *)&A[16*11], 2);
        _mm_clevict((void *)&A[16*15], 2);
        _mm_clevict((void *)&A[16*20], 2);
        _mm_clevict((void *)&A[16*26], 2);
        _mm_clevict((void *)&A[16*33], 2);

	// Evict the eight addresses of the second sequence (second argument 1).
	// NOTE(review): the first "far" entry uses 16*50 while the others follow
	// the pattern 16*k + 256*DISTANCE with k matching the preceding near
	// index (76, 78, 81). Possibly meant to be 16*75; it is used consistently
	// in both eviction passes and in the timed load -- confirm the intent.
	_mm_clevict((void *)&A[16*75], 1);
	_mm_clevict((void *)&A[16*50+256*DISTANCE], 1);
	_mm_clevict((void *)&A[16*76], 1);
	_mm_clevict((void *)&A[16*76+256*DISTANCE], 1);
	_mm_clevict((void *)&A[16*78], 1);
	_mm_clevict((void *)&A[16*78+256*DISTANCE], 1);
	_mm_clevict((void *)&A[16*81], 1);
	_mm_clevict((void *)&A[16*81+256*DISTANCE], 1);

	// Same eight addresses again, second argument 2.
	_mm_clevict((void *)&A[16*75], 2);
	_mm_clevict((void *)&A[16*50+256*DISTANCE], 2);
	_mm_clevict((void *)&A[16*76], 2);
	_mm_clevict((void *)&A[16*76+256*DISTANCE], 2);
	_mm_clevict((void *)&A[16*78], 2);
	_mm_clevict((void *)&A[16*78+256*DISTANCE], 2);
	_mm_clevict((void *)&A[16*81], 2);
	_mm_clevict((void *)&A[16*81+256*DISTANCE], 2);

	// Read the begin time stamp for the first sequence.
	// cpuid is issued first as a serializing instruction; rdtsc then returns
	// the counter in edx:eax, which is saved in edi:esi for later.
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Timed region 1: eight back-to-back loads; the loaded value lands in ebx
	// and is discarded.
	asm("movl %0, %%ebx" : : "m"(A[16*5]));
	asm("movl %0, %%ebx" : : "m"(A[16*6]));
	asm("movl %0, %%ebx" : : "m"(A[16*8]));
	asm("movl %0, %%ebx" : : "m"(A[16*11]));
	asm("movl %0, %%ebx" : : "m"(A[16*15]));
	asm("movl %0, %%ebx" : : "m"(A[16*20]));
	asm("movl %0, %%ebx" : : "m"(A[16*26]));
	asm("movl %0, %%ebx" : : "m"(A[16*33]));

	// Read the end time stamp, then spill both readings to the globals:
	// start (esi/edi) -> time_l1/time_h1, end (eax/edx) -> time_l2/time_h2.
	// NOTE(review): the serializing cpuid is commented out here, so the
	// closing rdtsc is not fenced against the preceding loads.
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	// Reassemble the two 64-bit time stamps and store end - start.
	elapsed[0] = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);

	// Read the begin time stamp for the second sequence (same scheme as above).
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Timed region 2: loads alternating between a near index and an index
	// offset by 256*DISTANCE elements (see the 16*50 note above).
	asm("movl %0, %%ebx" : : "m"(A[16*75]));
	asm("movl %0, %%ebx" : : "m"(A[16*50+256*DISTANCE]));
	asm("movl %0, %%ebx" : : "m"(A[16*76]));
	asm("movl %0, %%ebx" : : "m"(A[16*76+256*DISTANCE]));
	asm("movl %0, %%ebx" : : "m"(A[16*78]));
	asm("movl %0, %%ebx" : : "m"(A[16*78+256*DISTANCE]));
	asm("movl %0, %%ebx" : : "m"(A[16*81]));
	asm("movl %0, %%ebx" : : "m"(A[16*81+256*DISTANCE]));

	// Read the end time stamp and spill both readings, as for region 1.
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	// Tick count for the second sequence.
	elapsed[1] = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);

}

// Entry point of the benchmark.
//
// Pins the thread to CPU 1, allocates and zeroes the page-aligned probe
// buffer A, runs the measurement five times and prints the two tick counts
// left by the final run, tab-separated and followed by a newline.
//
// Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
int main(void)
{
	// Only the last run is reported; the earlier ones act as warm-up.
	enum { RUNS = 5 };
	const size_t bytes = ARRAY_SIZE * sizeof(*A);
	void *buf;

	set_cpu(1);

	// valloc returns page-aligned memory (or NULL when out of memory).
	buf = valloc(bytes);
	if (buf == NULL) {
		perror("valloc");
		return EXIT_FAILURE;
	}
	bzero(buf, bytes);
	A = buf;

	// Write to the buffer before timing so that the measured loads do not
	// take page faults.
	A[0] = 0;
	A[1024] = 1024;
	A[256*DISTANCE] = 1024;

	for (int run = 0; run < RUNS; run++) {
		test_prefetch_queue();
	}

	// elapsed[] is long long; convert explicitly to match the %llu format.
	printf("%llu\t", (unsigned long long)elapsed[0]);
	printf("%llu\t", (unsigned long long)elapsed[1]);
	printf("\n");

	free((void *)A);
	A = NULL;

	return 0;
}
