#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>

// Number of unsigned int elements in A. The code in this file walks A in
// strides of 1024 elements, so this covers 32 such strides (one 4 KiB page
// each if unsigned int is 4 bytes -- assumption, confirm for the target).
// Parenthesized so the macro behaves as a single value in any expression.
#define ARRAY_SIZE (1024*32)

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the CPU the thread should run on.
 *
 * A failure to set the affinity is reported on stderr through perror() but is
 * not fatal: the caller keeps running, just without the requested pinning.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 addresses the calling thread, so no gettid syscall is needed. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

// Buffer whose elements test_prefetch_queue() evicts and prefetches; main()
// writes one element every 1024 entries before the first timed run.
// volatile prevents the compiler from optimizing away accesses to it.
volatile unsigned int A[ARRAY_SIZE];

/*
 * Raw timestamp samples taken around one timed run, plus their difference.
 * Each 64-bit sample is kept as two 32-bit halves.
 */
struct my_time_struct {
	unsigned int time_l1;	/* start sample, low 32 bits */
	unsigned int time_h1;	/* start sample, high 32 bits */
	unsigned int time_l2;	/* end sample, low 32 bits */
	unsigned int time_h2;	/* end sample, high 32 bits */
	long long elapsed;	/* end sample minus start sample */
};

/* Single global instance: test_prefetch_queue() fills it, main() prints elapsed. */
struct my_time_struct my_time;

/*
 * Times a burst of 64 software prefetches into A and records the result in
 * the global my_time.
 *
 * Steps:
 *   1. For each of the 32 strides of 1024 elements in A, evict the two
 *      elements that are prefetched later (offsets 16*5 and 16*45 within the
 *      stride), once with level argument 1 and once with level argument 2.
 *   2. Execute cpuid, then rdtsc, and keep the start timestamp in esi/edi.
 *   3. Issue 32 prefetches at offset 16*5 followed by 32 prefetches at
 *      offset 16*45, one per stride, all with hint argument 2. The calls are
 *      written out one by one, presumably so the timed region contains no
 *      loop overhead.
 *   4. Execute rdtsc again, store both samples in my_time and compute
 *      my_time.elapsed = end - start.
 *
 * Takes no arguments and returns nothing; the only outputs are the fields of
 * my_time.
 *
 * NOTE(review): the __asm statements use Intel operand order (destination
 * first) and need a compiler that accepts that inline-assembly form.
 * NOTE(review): _mm_clevict and _mm_prefetch are not declared by any header
 * included at the top of this file; presumably <immintrin.h> is required --
 * confirm with the target compiler.
 * NOTE(review): the meaning of the level/hint values 1 and 2 (presumably the
 * L1 and L2 cache levels) depends on the compiler's hint encoding -- confirm.
 * NOTE(review): the start timestamp lives in esi/edi across all 64 prefetch
 * calls; this relies on the compiler not reusing those registers between the
 * separate __asm statements -- confirm in the generated code.
 */
void test_prefetch_queue()
{
    int i;  
    // Evict every element that the timed section below prefetches, so each
    // run starts with those lines removed at both level arguments (1 and 2).
    for(i = 0; i < ARRAY_SIZE; i=i+1024){
      _mm_clevict((void *)&A[16*5+i], 1);
      _mm_clevict((void *)&A[16*45+i], 1);
      _mm_clevict((void *)&A[16*5+i], 2);
      _mm_clevict((void *)&A[16*45+i], 2);
    }

	// Read the begin timestamp: cpuid first, then rdtsc (eax = low half,
	// edx = high half), saved into esi/edi until the end of the run.
	// NOTE(review): eax is not set before cpuid (the setup line below is
	// commented out), so the queried leaf is whatever eax happens to hold.
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// First group: offset 16*5 within each of the 32 strides.
	_mm_prefetch((void *)&A[16*5], 2);
	_mm_prefetch((void *)&A[16*5+1024*1], 2);
	_mm_prefetch((void *)&A[16*5+1024*2], 2);
	_mm_prefetch((void *)&A[16*5+1024*3], 2);
	_mm_prefetch((void *)&A[16*5+1024*4], 2);
	_mm_prefetch((void *)&A[16*5+1024*5], 2);
	_mm_prefetch((void *)&A[16*5+1024*6], 2);
	_mm_prefetch((void *)&A[16*5+1024*7], 2);

	_mm_prefetch((void *)&A[16*5+1024*8], 2);
	_mm_prefetch((void *)&A[16*5+1024*9], 2);
	_mm_prefetch((void *)&A[16*5+1024*10], 2);
	_mm_prefetch((void *)&A[16*5+1024*11], 2);
	_mm_prefetch((void *)&A[16*5+1024*12], 2);
	_mm_prefetch((void *)&A[16*5+1024*13], 2);
	_mm_prefetch((void *)&A[16*5+1024*14], 2);
	_mm_prefetch((void *)&A[16*5+1024*15], 2);

	_mm_prefetch((void *)&A[16*5+1024*16], 2);
	_mm_prefetch((void *)&A[16*5+1024*17], 2);
	_mm_prefetch((void *)&A[16*5+1024*18], 2);
	_mm_prefetch((void *)&A[16*5+1024*19], 2);
	_mm_prefetch((void *)&A[16*5+1024*20], 2);
	_mm_prefetch((void *)&A[16*5+1024*21], 2);
	_mm_prefetch((void *)&A[16*5+1024*22], 2);
	_mm_prefetch((void *)&A[16*5+1024*23], 2);

	_mm_prefetch((void *)&A[16*5+1024*24], 2);
	_mm_prefetch((void *)&A[16*5+1024*25], 2);
	_mm_prefetch((void *)&A[16*5+1024*26], 2);
	_mm_prefetch((void *)&A[16*5+1024*27], 2);
	_mm_prefetch((void *)&A[16*5+1024*28], 2);
	_mm_prefetch((void *)&A[16*5+1024*29], 2);
	_mm_prefetch((void *)&A[16*5+1024*30], 2);
	_mm_prefetch((void *)&A[16*5+1024*31], 2);

	// Disabled: strides 32..47. These indices are >= 1024*32 and therefore
	// lie beyond A as sized by ARRAY_SIZE; the array must be enlarged before
	// re-enabling them.
	//_mm_prefetch((void *)&A[16*5+1024*32], 2);
	//_mm_prefetch((void *)&A[16*5+1024*33], 2);
	//_mm_prefetch((void *)&A[16*5+1024*34], 2);
	//_mm_prefetch((void *)&A[16*5+1024*35], 2);
	//_mm_prefetch((void *)&A[16*5+1024*36], 2);
	//_mm_prefetch((void *)&A[16*5+1024*37], 2);
	//_mm_prefetch((void *)&A[16*5+1024*38], 2);
	//_mm_prefetch((void *)&A[16*5+1024*39], 2);

	//_mm_prefetch((void *)&A[16*5+1024*40], 2);
	//_mm_prefetch((void *)&A[16*5+1024*41], 2);
	//_mm_prefetch((void *)&A[16*5+1024*42], 2);
	//_mm_prefetch((void *)&A[16*5+1024*43], 2);
	//_mm_prefetch((void *)&A[16*5+1024*44], 2);
	//_mm_prefetch((void *)&A[16*5+1024*45], 2);
	//_mm_prefetch((void *)&A[16*5+1024*46], 2);
	//_mm_prefetch((void *)&A[16*5+1024*47], 2);

	// Second group: offset 16*45 within each of the 32 strides.
	_mm_prefetch((void *)&A[16*45], 2);
	_mm_prefetch((void *)&A[16*45+1024*1], 2);
	_mm_prefetch((void *)&A[16*45+1024*2], 2);
	_mm_prefetch((void *)&A[16*45+1024*3], 2);
	_mm_prefetch((void *)&A[16*45+1024*4], 2);
	_mm_prefetch((void *)&A[16*45+1024*5], 2);
	_mm_prefetch((void *)&A[16*45+1024*6], 2);
	_mm_prefetch((void *)&A[16*45+1024*7], 2);

	_mm_prefetch((void *)&A[16*45+1024*8], 2);
	_mm_prefetch((void *)&A[16*45+1024*9], 2);
	_mm_prefetch((void *)&A[16*45+1024*10], 2);
	_mm_prefetch((void *)&A[16*45+1024*11], 2);
	_mm_prefetch((void *)&A[16*45+1024*12], 2);
	_mm_prefetch((void *)&A[16*45+1024*13], 2);
	_mm_prefetch((void *)&A[16*45+1024*14], 2);
	_mm_prefetch((void *)&A[16*45+1024*15], 2);

	_mm_prefetch((void *)&A[16*45+1024*16], 2);
	_mm_prefetch((void *)&A[16*45+1024*17], 2);
	_mm_prefetch((void *)&A[16*45+1024*18], 2);
	_mm_prefetch((void *)&A[16*45+1024*19], 2);
	_mm_prefetch((void *)&A[16*45+1024*20], 2);
	_mm_prefetch((void *)&A[16*45+1024*21], 2);
	_mm_prefetch((void *)&A[16*45+1024*22], 2);
	_mm_prefetch((void *)&A[16*45+1024*23], 2);

	_mm_prefetch((void *)&A[16*45+1024*24], 2);
	_mm_prefetch((void *)&A[16*45+1024*25], 2);
	_mm_prefetch((void *)&A[16*45+1024*26], 2);
	_mm_prefetch((void *)&A[16*45+1024*27], 2);
	_mm_prefetch((void *)&A[16*45+1024*28], 2);
	_mm_prefetch((void *)&A[16*45+1024*29], 2);
	_mm_prefetch((void *)&A[16*45+1024*30], 2);
	_mm_prefetch((void *)&A[16*45+1024*31], 2);

	// Disabled: strides 32..47 of the second group; same out-of-range caveat
	// as for the disabled lines of the first group.
	//_mm_prefetch((void *)&A[16*45+1024*32], 2);
	//_mm_prefetch((void *)&A[16*45+1024*33], 2);
	//_mm_prefetch((void *)&A[16*45+1024*34], 2);
	//_mm_prefetch((void *)&A[16*45+1024*35], 2);
	//_mm_prefetch((void *)&A[16*45+1024*36], 2);
	//_mm_prefetch((void *)&A[16*45+1024*37], 2);
	//_mm_prefetch((void *)&A[16*45+1024*38], 2);
	//_mm_prefetch((void *)&A[16*45+1024*39], 2);

	//_mm_prefetch((void *)&A[16*45+1024*40], 2);
	//_mm_prefetch((void *)&A[16*45+1024*41], 2);
	//_mm_prefetch((void *)&A[16*45+1024*42], 2);
	//_mm_prefetch((void *)&A[16*45+1024*43], 2);
	//_mm_prefetch((void *)&A[16*45+1024*44], 2);
	//_mm_prefetch((void *)&A[16*45+1024*45], 2);
	//_mm_prefetch((void *)&A[16*45+1024*46], 2);
	//_mm_prefetch((void *)&A[16*45+1024*47], 2);

	// Read the end timestamp (no cpuid here; it is commented out), then copy
	// the saved start sample (esi/edi) and the end sample (eax/edx) into
	// my_time.
	//__asm cpuid
	__asm rdtsc
	__asm mov my_time.time_l1,esi
	__asm mov my_time.time_h1,edi
	__asm mov my_time.time_l2,eax
	__asm mov my_time.time_h2,edx

	// Rebuild both 64-bit samples from their halves and store end - start.
	my_time.elapsed = ((unsigned long long)my_time.time_h2<< 32 | my_time.time_l2) - ((unsigned long long)my_time.time_h1<<32 | my_time.time_l1);
}

/*
 * Benchmark driver: pins the thread to CPU 2, touches A so its pages are
 * backed by memory, runs the timed prefetch test five times and prints the
 * timestamp delta of the last run followed by a tab character.
 *
 * Returns 0.
 */
int main(void)
{
	enum { NUM_RUNS = 5, TOUCH_STRIDE = 1024 };
	int i;

	set_cpu(2);

	// To avoid page faults during the timed runs and to guarantee the pages
	// are actually allocated, write one element in every 1024-element stride.
	for (i = 0; i < ARRAY_SIZE; i += TOUCH_STRIDE) {
		A[i] = i;
	}

	// Each run overwrites my_time, so only the final run is reported
	// (the earlier runs presumably serve as warm-up).
	for (i = 0; i < NUM_RUNS; i++) {
		test_prefetch_queue();
	}

	// elapsed is a signed long long; cast so the argument type matches %llu.
	printf("%llu\t", (unsigned long long)my_time.elapsed);

	return 0;
}
