#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>

// Number of unsigned int elements in the global test array A.
// With 4-byte unsigned int and 4 KiB pages this would span four pages --
// TODO confirm for the target platform.
// The value is parenthesized so the macro expands correctly inside larger
// expressions such as x / ARRAY_SIZE or x % ARRAY_SIZE.
#define ARRAY_SIZE (1024*4)

/*
 * Pin the calling thread to a single logical CPU.
 *
 * cpu_no: zero-based index of the CPU the thread should run on.
 *
 * Returns nothing. If cpu_no lies outside [0, CPU_SETSIZE) or the kernel
 * rejects the request (for example the CPU does not exist or is not
 * permitted for this process), a diagnostic is written to stderr and the
 * current affinity is left unchanged, so the caller keeps running unpinned
 * instead of failing silently.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	if (cpu_no < 0 || cpu_no >= CPU_SETSIZE) {
		fprintf(stderr, "set_cpu: CPU index %d out of range\n", cpu_no);
		return;
	}

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 addresses the calling thread, so no gettid syscall is needed. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("set_cpu: sched_setaffinity");
	}
}

// Global test buffer of ARRAY_SIZE unsigned ints. main() writes one element
// every 1024 entries before the measurement, and test_prefetch_queue()
// evicts and prefetches addresses inside it.
// NOTE(review): declared volatile, presumably so accesses to the buffer are
// not optimized away -- confirm the intent.
volatile unsigned int A[ARRAY_SIZE];

/*
 * Timestamp record filled in by test_prefetch_queue().
 *
 * The *_l / *_h members hold the low and high 32-bit halves of two
 * timestamp-counter readings (1 = taken before the timed code, 2 = taken
 * after it); elapsed holds the difference "reading 2 - reading 1".
 * Member names are referenced directly from inline assembly, so they must
 * not be renamed.
 */
struct my_time_struct {
	unsigned int time_l1;	/* start reading, low 32 bits */
	unsigned int time_h1;	/* start reading, high 32 bits */
	unsigned int time_l2;	/* end reading, low 32 bits */
	unsigned int time_h2;	/* end reading, high 32 bits */
	long long elapsed;	/* end reading minus start reading */
} my_time;	/* single global instance shared by the test routine and main() */

// Times a burst of 128 back-to-back software prefetches over the global
// array A and records the result in the global my_time.
//
// Steps:
//   1. Call _mm_clevict with hints 1 and 2 on every 16th element of
//      A[0..2047] (128 addresses) -- the same addresses prefetched below.
//   2. Execute cpuid, then rdtsc, and keep the start timestamp in esi/edi.
//   3. Issue 128 _mm_prefetch calls (hint 2) on A[0], A[16], ..., A[16*127].
//   4. Execute rdtsc again, store both readings in my_time and compute
//      my_time.elapsed = end - start (in timestamp-counter ticks).
//
// Takes no arguments and returns nothing; each call overwrites my_time.
//
// NOTE(review): a stride of 16 unsigned ints is presumably one 64-byte cache
// line -- confirm for the target. The hint values 1 and 2 are passed as raw
// integers; presumably they select cache levels -- confirm against the
// compiler's intrinsic header.
void test_prefetch_queue()
{
    int i;  
    // Evict every address that will be prefetched in the timed region.
    for(i = 0; i < 1024*2; i=i+16){
      _mm_clevict((void *)&A[i], 1);
      _mm_clevict((void *)&A[i], 2);
    }

	//read begin timestamp
	//__asm mov rbx,0
	// cpuid runs before rdtsc, presumably as a serializing instruction so
	// that earlier work does not overlap the timed region -- confirm.
	// The start reading (eax = low half, edx = high half) is parked in
	// esi/edi until the end of the function.
	// NOTE(review): nothing visible here tells the compiler that esi/edi
	// must stay untouched across the prefetch statements below -- verify
	// the generated code.
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Timed region: 128 prefetches, written out one by one rather than as a
	// loop (presumably to keep loop overhead out of the measurement).
	_mm_prefetch((void *)&A[0], 2);
	_mm_prefetch((void *)&A[16], 2);
	_mm_prefetch((void *)&A[16*2], 2);
	_mm_prefetch((void *)&A[16*3], 2);
	_mm_prefetch((void *)&A[16*4], 2);
	_mm_prefetch((void *)&A[16*5], 2);
	_mm_prefetch((void *)&A[16*6], 2);
	_mm_prefetch((void *)&A[16*7], 2);

	_mm_prefetch((void *)&A[16*8], 2);
	_mm_prefetch((void *)&A[16*9], 2);
	_mm_prefetch((void *)&A[16*10], 2);
	_mm_prefetch((void *)&A[16*11], 2);
	_mm_prefetch((void *)&A[16*12], 2);
	_mm_prefetch((void *)&A[16*13], 2);
	_mm_prefetch((void *)&A[16*14], 2);
	_mm_prefetch((void *)&A[16*15], 2);

	_mm_prefetch((void *)&A[16*16], 2);
	_mm_prefetch((void *)&A[16*17], 2);
	_mm_prefetch((void *)&A[16*18], 2);
	_mm_prefetch((void *)&A[16*19], 2);
	_mm_prefetch((void *)&A[16*20], 2);
	_mm_prefetch((void *)&A[16*21], 2);
	_mm_prefetch((void *)&A[16*22], 2);
	_mm_prefetch((void *)&A[16*23], 2);

	_mm_prefetch((void *)&A[16*24], 2);
	_mm_prefetch((void *)&A[16*25], 2);
	_mm_prefetch((void *)&A[16*26], 2);
	_mm_prefetch((void *)&A[16*27], 2);
	_mm_prefetch((void *)&A[16*28], 2);
	_mm_prefetch((void *)&A[16*29], 2);
	_mm_prefetch((void *)&A[16*30], 2);
	_mm_prefetch((void *)&A[16*31], 2);

	_mm_prefetch((void *)&A[16*32], 2);
	_mm_prefetch((void *)&A[16*33], 2);
	_mm_prefetch((void *)&A[16*34], 2);
	_mm_prefetch((void *)&A[16*35], 2);
	_mm_prefetch((void *)&A[16*36], 2);
	_mm_prefetch((void *)&A[16*37], 2);
	_mm_prefetch((void *)&A[16*38], 2);
	_mm_prefetch((void *)&A[16*39], 2);

	_mm_prefetch((void *)&A[16*40], 2);
	_mm_prefetch((void *)&A[16*41], 2);
	_mm_prefetch((void *)&A[16*42], 2);
	_mm_prefetch((void *)&A[16*43], 2);
	_mm_prefetch((void *)&A[16*44], 2);
	_mm_prefetch((void *)&A[16*45], 2);
	_mm_prefetch((void *)&A[16*46], 2);
	_mm_prefetch((void *)&A[16*47], 2);

	_mm_prefetch((void *)&A[16*48], 2);
	_mm_prefetch((void *)&A[16*49], 2);
	_mm_prefetch((void *)&A[16*50], 2);
	_mm_prefetch((void *)&A[16*51], 2);
	_mm_prefetch((void *)&A[16*52], 2);
	_mm_prefetch((void *)&A[16*53], 2);
	_mm_prefetch((void *)&A[16*54], 2);
	_mm_prefetch((void *)&A[16*55], 2);

	_mm_prefetch((void *)&A[16*56], 2);
	_mm_prefetch((void *)&A[16*57], 2);
	_mm_prefetch((void *)&A[16*58], 2);
	_mm_prefetch((void *)&A[16*59], 2);
	_mm_prefetch((void *)&A[16*60], 2);
	_mm_prefetch((void *)&A[16*61], 2);
	_mm_prefetch((void *)&A[16*62], 2);
	_mm_prefetch((void *)&A[16*63], 2);

	_mm_prefetch((void *)&A[16*64], 2);
	_mm_prefetch((void *)&A[16*65], 2);
	_mm_prefetch((void *)&A[16*66], 2);
	_mm_prefetch((void *)&A[16*67], 2);
	_mm_prefetch((void *)&A[16*68], 2);
	_mm_prefetch((void *)&A[16*69], 2);
	_mm_prefetch((void *)&A[16*70], 2);
	_mm_prefetch((void *)&A[16*71], 2);

	_mm_prefetch((void *)&A[16*72], 2);
	_mm_prefetch((void *)&A[16*73], 2);
	_mm_prefetch((void *)&A[16*74], 2);
	_mm_prefetch((void *)&A[16*75], 2);
	_mm_prefetch((void *)&A[16*76], 2);
	_mm_prefetch((void *)&A[16*77], 2);
	_mm_prefetch((void *)&A[16*78], 2);
	_mm_prefetch((void *)&A[16*79], 2);

	_mm_prefetch((void *)&A[16*80], 2);
	_mm_prefetch((void *)&A[16*81], 2);
	_mm_prefetch((void *)&A[16*82], 2);
	_mm_prefetch((void *)&A[16*83], 2);
	_mm_prefetch((void *)&A[16*84], 2);
	_mm_prefetch((void *)&A[16*85], 2);
	_mm_prefetch((void *)&A[16*86], 2);
	_mm_prefetch((void *)&A[16*87], 2);

	_mm_prefetch((void *)&A[16*88], 2);
	_mm_prefetch((void *)&A[16*89], 2);
	_mm_prefetch((void *)&A[16*90], 2);
	_mm_prefetch((void *)&A[16*91], 2);
	_mm_prefetch((void *)&A[16*92], 2);
	_mm_prefetch((void *)&A[16*93], 2);
	_mm_prefetch((void *)&A[16*94], 2);
	_mm_prefetch((void *)&A[16*95], 2);

	_mm_prefetch((void *)&A[16*96], 2);
	_mm_prefetch((void *)&A[16*97], 2);
	_mm_prefetch((void *)&A[16*98], 2);
	_mm_prefetch((void *)&A[16*99], 2);
	_mm_prefetch((void *)&A[16*100], 2);
	_mm_prefetch((void *)&A[16*101], 2);
	_mm_prefetch((void *)&A[16*102], 2);
	_mm_prefetch((void *)&A[16*103], 2);

	_mm_prefetch((void *)&A[16*104], 2);
	_mm_prefetch((void *)&A[16*105], 2);
	_mm_prefetch((void *)&A[16*106], 2);
	_mm_prefetch((void *)&A[16*107], 2);
	_mm_prefetch((void *)&A[16*108], 2);
	_mm_prefetch((void *)&A[16*109], 2);
	_mm_prefetch((void *)&A[16*110], 2);
	_mm_prefetch((void *)&A[16*111], 2);

	_mm_prefetch((void *)&A[16*112], 2);
	_mm_prefetch((void *)&A[16*113], 2);
	_mm_prefetch((void *)&A[16*114], 2);
	_mm_prefetch((void *)&A[16*115], 2);
	_mm_prefetch((void *)&A[16*116], 2);
	_mm_prefetch((void *)&A[16*117], 2);
	_mm_prefetch((void *)&A[16*118], 2);
	_mm_prefetch((void *)&A[16*119], 2);

	_mm_prefetch((void *)&A[16*120], 2);
	_mm_prefetch((void *)&A[16*121], 2);
	_mm_prefetch((void *)&A[16*122], 2);
	_mm_prefetch((void *)&A[16*123], 2);
	_mm_prefetch((void *)&A[16*124], 2);
	_mm_prefetch((void *)&A[16*125], 2);
	_mm_prefetch((void *)&A[16*126], 2);
	_mm_prefetch((void *)&A[16*127], 2);

	//read end timestamp
	// The cpuid before this second rdtsc is commented out, so the end
	// reading is taken without a preceding cpuid.
	//__asm cpuid
	__asm rdtsc
	// esi/edi still carry the start reading; eax/edx carry the end reading.
	__asm mov my_time.time_l1,esi
	__asm mov my_time.time_h1,edi
	__asm mov my_time.time_l2,eax
	__asm mov my_time.time_h2,edx

	// Rebuild both 64-bit readings from their 32-bit halves and subtract.
	my_time.elapsed = ((unsigned long long)my_time.time_h2<< 32 | my_time.time_l2) - ((unsigned long long)my_time.time_h1<<32 | my_time.time_l1);
}

/*
 * Benchmark driver.
 *
 * Pins the thread to CPU 1, touches the test array, runs the prefetch
 * measurement five times and prints the elapsed tick count of the last run
 * to stdout, followed by a tab (no newline).
 *
 * Returns 0.
 */
int main()
{
	int i;
	int run;

	set_cpu(1);

	// To avoid page faults and to guarantee the pages are actually
	// allocated, write one element every 1024 entries before measuring
	// (1024 unsigned ints is presumably one 4 KiB page -- TODO confirm).
	for(i = 0; i < ARRAY_SIZE; i=i+1024)
	  A[i] = i;

	// Every call overwrites my_time, so only the fifth measurement is
	// reported; the first four presumably serve as warm-up runs.
	for(run = 0; run < 5; run++)
		test_prefetch_queue();

	// elapsed is a signed long long; convert it explicitly so the argument
	// type matches the %llu conversion.
	printf("%llu\t",(unsigned long long)my_time.elapsed);

	return 0;
}
