#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>
#include <sys/mman.h>

/*
 * Number of unsigned int elements in the probe array A.
 * 1024*1024 elements; at 4 bytes per unsigned int that is 4 MiB, i.e. two
 * 2 MiB huge pages. The expansion is parenthesized so the macro stays
 * correct inside larger expressions such as x / ARRAY_SIZE.
 */
#define ARRAY_SIZE (1024*1024*1)

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the CPU the thread should be restricted to.
 *
 * Returns nothing. If the kernel rejects the affinity mask, a diagnostic is
 * written to stderr and the thread keeps its previous affinity.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 addresses the calling thread. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

/* Probe array; main() points it at the mmap'ed region. */
volatile unsigned int *A;

/* Start timestamp captured by test_prefetch_queue(): low and high 32 bits. */
unsigned int time_l1;
unsigned int time_h1;

/* End timestamp captured by test_prefetch_queue(): low and high 32 bits. */
unsigned int time_l2;
unsigned int time_h2;

/* End timestamp minus start timestamp from the most recent measurement. */
long long elapsed;

/*void test_prefetch_queue()
{
	//read begin timestamp
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	asm("movl %0, %%r8d" : : "m"(A[100]));

	//read end timestamp	
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);
}//*/

/*
 * Time 16 loads spaced 1024 elements apart and store the timestamp-counter
 * difference in the global `elapsed`.
 *
 * Steps, in order:
 *   1. For each of 16 strides (i = 0, 1024, ..., 15*1024) call _mm_clevict
 *      on A[16*5+i], A[16*6+i], A[16*7+i] and A[16*8+i], first with second
 *      argument 1 and then with second argument 2.
 *   2. For every stride, load A[16*5+i]; then, in a second pass, A[16*6+i];
 *      then, in a third pass, A[16*7+i].
 *   3. Read the timestamp counter, load A[16*8 + 1024*k] for k = 0..15 with
 *      fully unrolled statements, and read the counter again.
 *
 * Inputs:  global A; the highest index touched is 16*8 + 1024*15.
 * Outputs: globals time_l1/time_h1 (start), time_l2/time_h2 (end), elapsed.
 *
 * Indices that differ by 16 elements are presumably one cache line apart
 * (64-byte lines, 4-byte unsigned int) -- TODO confirm for the target CPU.
 *
 * NOTE(review): _mm_clevict is not declared by any header this file
 * includes; presumably the compiler supplies it as an intrinsic and the
 * second argument selects the cache level to evict from -- confirm.
 * NOTE(review): the movl statements write r9d but declare no clobber.
 * NOTE(review): the start timestamp is parked in esi/edi across the timed
 * loads; nothing tells the compiler to leave those registers alone.
 * NOTE(review): the cpuid before the second rdtsc is commented out, so the
 * end timestamp is read without that preceding instruction.
 */
void test_prefetch_queue()
{
        int i;
	// Step 1: evict the four probed elements of every stride (hint 1, then 2).
	for(i = 0; i < 1024*16; i=i+1024){
	  _mm_clevict((void *)&A[16*5+i], 1);
	  _mm_clevict((void *)&A[16*6+i], 1);
	  _mm_clevict((void *)&A[16*7+i], 1);
	  _mm_clevict((void *)&A[16*8+i], 1);
	  _mm_clevict((void *)&A[16*5+i], 2);
	  _mm_clevict((void *)&A[16*6+i], 2);
	  _mm_clevict((void *)&A[16*7+i], 2);
	  _mm_clevict((void *)&A[16*8+i], 2);

	  // Disabled alternative variant: same evictions at offsets 16*45 .. 16*42.
	  /*_mm_clevict((void *)&A[16*45+i], 1);
	  _mm_clevict((void *)&A[16*44+i], 1);
	  _mm_clevict((void *)&A[16*43+i], 1);
	  _mm_clevict((void *)&A[16*42+i], 1);
	  _mm_clevict((void *)&A[16*45+i], 2);
	  _mm_clevict((void *)&A[16*44+i], 2);
	  _mm_clevict((void *)&A[16*43+i], 2);
	  _mm_clevict((void *)&A[16*42+i], 2);//*/
	}
	// Presumably used as a serializing instruction between the evictions
	// and the loads that follow -- confirm.
	__asm cpuid

	// Step 2a: load A[16*5+i] for every stride; the value is discarded in r9d.
	for(i = 0; i < 1024*16; i=i+1024){
	  asm("movl %0, %%r9d" : : "m"(A[16*5+i]));
	  //asm("movl %0, %%r9d" : : "m"(A[16*43+i]));
	}

	// Step 2b: load A[16*6+i] for every stride.
	for(i = 0; i < 1024*16; i=i+1024){
	  asm("movl %0, %%r9d" : : "m"(A[16*6+i]));
	  //asm("movl %0, %%r9d" : : "m"(A[16*44+i]));
	}

	// Step 2c: load A[16*7+i] for every stride.
	for(i = 0; i < 1024*16; i=i+1024){
	  asm("movl %0, %%r9d" : : "m"(A[16*7+i]));
	  //asm("movl %0, %%r9d" : : "m"(A[16*45+i]));
	}

	// Step 3: read the begin timestamp. rdtsc leaves the low half in eax and
	// the high half in edx; both are copied to esi/edi so the second rdtsc
	// does not overwrite them.
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// Timed region: 16 loads of A[16*8 + 1024*k], k = 0..15, written out one
	// statement per load (no loop) between the two timestamp reads.
	asm("movl %0, %%r9d" : : "m"(A[16*8]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*1]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*2]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*3]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*4]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*5]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*6]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*7]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*8]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*9]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*10]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*11]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*12]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*13]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*14]));
	asm("movl %0, %%r9d" : : "m"(A[16*8+1024*15]));

	// Disabled alternative variant of the timed region at offset 16*42.
	//asm("movl %0, %%r9d" : : "m"(A[16*42]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*1]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*2]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*3]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*4]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*5]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*6]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*7]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*8]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*9]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*10]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*11]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*12]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*13]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*14]));
	//asm("movl %0, %%r9d" : : "m"(A[16*42+1024*15]));

	// Read the end timestamp, then spill both timestamps to the globals:
	// esi/edi (begin) -> time_l1/time_h1, eax/edx (end) -> time_l2/time_h2.
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	// Rebuild the two 64-bit timestamps from their halves and subtract.
	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);

	// Debug output, disabled.
	//printf("%u\t%u\n",time_h1,time_l1);
	//printf("%u\t%u\n",time_h2,time_l2);
	//printf("%llu\t",elapsed);
	//unsigned long A_addr = (unsigned long)(&A[0]);
	//printf("\nA_addr = %llu\n", A_addr);
}


/*
 * Benchmark driver.
 *
 * Pins the thread to CPU 1, maps ARRAY_SIZE unsigned ints of anonymous
 * huge-page memory into the global A, touches it, runs
 * test_prefetch_queue() five times and prints the `elapsed` value left by
 * the final run (the earlier runs are presumably warm-up -- confirm).
 *
 * Returns 0 on success, EXIT_FAILURE if the huge-page mapping cannot be
 * created (for example when no huge pages are reserved).
 */
int main(void)
{
	const size_t bytes = ARRAY_SIZE * sizeof(unsigned int);
	void *region;
	int i;

	set_cpu(1);

	/* Back the array with huge pages (2MB page size). */
	region = mmap(0, bytes, PROT_READ|PROT_WRITE,
	              MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB, -1, 0);
	if (region == MAP_FAILED) {
		/* Without this check bzero() below would write through MAP_FAILED. */
		perror("mmap");
		return EXIT_FAILURE;
	}
	A = (volatile unsigned int *)region;
	bzero(region, bytes);

	/* Write one element per 1024 so no page fault lands in the measurement. */
	for (i = 0; i < ARRAY_SIZE; i = i + 1024)
		A[i] = i;

	test_prefetch_queue();
	test_prefetch_queue();
	test_prefetch_queue();
	test_prefetch_queue();
	test_prefetch_queue();

	/* elapsed is a signed long long; cast so the argument matches %llu. */
	printf("%llu\t", (unsigned long long)elapsed);

	munmap(region, bytes);
	return 0;
}
