#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/*
 * Number of unsigned int elements in the probe buffer A.
 *
 * The buffer is written once every 1024 elements, so this value yields 128
 * touched locations. The original author's note says this is "double L1 TLB
 * size" -- presumably twice the number of L1 TLB entries, assuming 4-byte
 * unsigned int and 4 KiB pages (one touch per page). TODO confirm for the
 * target CPU.
 *
 * The expansion is parenthesized so the macro behaves as a single value in
 * any expression (e.g. x / ARRAY_SIZE).
 */
#define ARRAY_SIZE (1024*128)

/* Alternative from the original author, noted as "double L2 TLB size": */
//#define ARRAY_SIZE (1024*1024*64)

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the CPU the thread should run on.
 *
 * If the affinity cannot be set, the error is reported on stderr and the
 * function returns normally; the thread then keeps whatever affinity it had.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	/* Build a mask that contains only the requested CPU. */
	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* An unpinned run would skew the timing, so make a failure visible. */
	if (sched_setaffinity(syscall(SYS_gettid), sizeof(cpu_set_t), &mask) != 0)
		perror("sched_setaffinity");
}

/* Probe buffer: allocated in main() and accessed by test_prefetch_queue(). */
volatile unsigned int *A;

/* Low and high 32-bit halves of the time-stamp counter read before the timed load. */
unsigned int time_l1;
unsigned int time_h1;

/* Low and high 32-bit halves of the time-stamp counter read after the timed load. */
unsigned int time_l2;
unsigned int time_h2;

/* Difference between the two counter readings of the most recent measurement. */
long long elapsed;

void test_prefetch_queue()
{
        //to replace the entry in TLB, making it L1, L2 TLB miss        
	int i;
	for(i = 0; i < ARRAY_SIZE; i=i+1024)
	  A[i] = i;//*/

	//read begin timestamp
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	asm("movl %0, %%r8d" : : "m"(A[100]));

	//read end timestamp	
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);
}

/*
 * Benchmark driver.
 *
 * Pins the thread to CPU 1, allocates and zeroes the page-aligned probe
 * buffer A, writes one element in every 1024 ("to avoid page fault", per the
 * original author), runs test_prefetch_queue() five times and prints the
 * `elapsed` value left by the last run, followed by a tab.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
	enum { RUNS = 5, TOUCH_STRIDE = 1024, FALLBACK_PAGE_SIZE = 4096 };
	const size_t bytes = (size_t)ARRAY_SIZE * sizeof(*A);
	long page_size;
	void *buf = NULL;
	int rc;
	int i;
	int run;

	set_cpu(1);

	/* Page-aligned allocation; replaces the obsolete valloc(). */
	page_size = sysconf(_SC_PAGESIZE);
	if (page_size <= 0)
		page_size = FALLBACK_PAGE_SIZE;

	rc = posix_memalign(&buf, (size_t)page_size, bytes);
	if (rc != 0) {
		fprintf(stderr, "posix_memalign failed (error %d)\n", rc);
		return EXIT_FAILURE;
	}

	bzero(buf, bytes);
	A = buf;

	//to avoid page fault
	for (i = 0; i < ARRAY_SIZE; i += TOUCH_STRIDE)
		A[i] = i;

	/* Each run overwrites `elapsed`; only the last run's value is printed. */
	for (run = 0; run < RUNS; run++)
		test_prefetch_queue();

	/* elapsed is signed; cast so the argument matches the %llu conversion. */
	printf("%llu\t", (unsigned long long)elapsed);

	A = NULL;
	free(buf);

	return 0;
}
