#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>

/*
 * Number of unsigned int elements in the test array A.
 *
 * Sizing rationale: L1 can hold 8 pages of data; beyond that, cache
 * replacement happens.  Addresses one page apart map to the same cache set.
 * The array is sized to span 16 pages, i.e. double the L1 capacity
 * (assuming 4-byte unsigned int and 4 KiB pages).
 *
 * The expansion is parenthesized so the macro behaves as a single value when
 * it appears inside a larger expression such as `x / ARRAY_SIZE`.
 */
#define ARRAY_SIZE (1024 * 16)

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the CPU the thread should run on.
 *
 * Pinning is best-effort: if the kernel rejects the request (for example
 * because cpu_no does not name a CPU this thread may use), a diagnostic is
 * written to stderr and the thread keeps its previous affinity.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 addresses the calling thread, so no separate gettid
	 * system call is needed. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

/* Test buffer. test_prefetch_queue() evicts, prefetches and then times a load
 * of element A[16]; main() writes one element every 1024 entries beforehand.
 * Declared volatile, presumably so the compiler keeps every access. */
volatile unsigned int A[ARRAY_SIZE];

/* 32-bit halves of the two timestamps taken by test_prefetch_queue():
 * time_l1/time_h1 are the low/high words of the start reading and
 * time_l2/time_h2 the low/high words of the end reading. They are written by
 * name from inline assembly, so the identifiers must stay as they are. */
unsigned int time_l1,time_h1,time_l2,time_h2;
/* End timestamp minus start timestamp from the most recent
 * test_prefetch_queue() call. */
long long elapsed;

/*
 * Time a single load of A[16] issued shortly after a software prefetch of
 * the same address.
 *
 * Sequence: evict the line, prefetch it, execute a block of integer
 * multiplies, then bracket one load of A[16] with two rdtsc reads. The
 * difference of the two 64-bit timestamps is stored in the global `elapsed`;
 * the raw 32-bit halves are left in time_l1/time_h1 (start) and
 * time_l2/time_h2 (end).
 *
 * NOTE(review): _mm_clevict/_mm_prefetch and the `__asm` statement form are
 * compiler-specific, and no intrinsic header is included in the visible part
 * of the file -- confirm the intended toolchain and build flags.
 */
void test_prefetch_queue()
{
        //Evict the cache line holding A[16], then request it again with a prefetch.
        //NOTE(review): the second argument of each call is a hint/level selector;
        //presumably 1 and 2 name two different cache levels -- confirm against the
        //intrinsic reference for the target compiler.
	_mm_clevict((void *)&A[16], 1);
	_mm_clevict((void *)&A[16], 2);
	_mm_prefetch((void *)&A[16], 1);

        //Computation meant to overlap with the prefetch: 100 iterations of eight
        //32-bit multiplies, each squaring one of r8d..r15d in place.
        //NOTE(review): these asm statements modify r8d..r15d but declare no outputs
        //or clobbers, so the compiler is not told the registers change; r12-r15 are
        //normally callee-saved on x86-64 -- confirm this is safe with the build used.
        int i;
	for(i = 0; i < 100; i++){
	  asm("imull %r8d, %r8d");
	  asm("imull %r9d, %r9d");
	  asm("imull %r10d, %r10d");
	  asm("imull %r11d, %r11d");
	  asm("imull %r12d, %r12d");
	  asm("imull %r13d, %r13d");
	  asm("imull %r14d, %r14d");
	  asm("imull %r15d, %r15d");
	}

	//Read the begin timestamp. cpuid runs first and its results are not used,
	//so it is presumably there only as a serializing fence. rdtsc leaves the
	//counter in edx:eax; the halves are parked in edi:esi across the timed load.
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	//The timed operation: load A[16] from memory into r8d.
	//NOTE(review): r8d is written here without being declared as a clobber.
	asm("movl %0, %%r8d" : : "m"(A[16]));

	//Read the end timestamp, then copy both readings out to the globals
	//(start from edi:esi, end from edx:eax).
	//NOTE(review): the cpuid before this rdtsc is disabled, so nothing forces the
	//load above to finish before the counter is read -- confirm this is intended.
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	//Rebuild each 64-bit timestamp from its high/low halves and store end - start.
	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);
}

/*
 * Entry point: pin the thread to CPU 1, touch the test array, run the
 * prefetch timing probe five times and print the value left in `elapsed` by
 * the final run, followed by a tab.
 */
int main(void)
{
	/* Distance, in elements, between the writes used to touch A. */
	enum { TOUCH_STRIDE = 1024 };
	int i;

	set_cpu(1);

	/* Initialize the array with one write every TOUCH_STRIDE elements; the
	 * intent is that all of its physical pages are allocated before any
	 * timing takes place. */
	for (i = 0; i < ARRAY_SIZE; i += TOUCH_STRIDE) {
		A[i] = (unsigned int)i;
	}

	/* Each call overwrites `elapsed`, so only the fifth measurement is
	 * reported. */
	test_prefetch_queue();
	test_prefetch_queue();
	test_prefetch_queue();
	test_prefetch_queue();
	test_prefetch_queue();

	/* `elapsed` is a signed long long; convert it explicitly so the
	 * argument type matches the %llu conversion. */
	printf("%llu\t", (unsigned long long)elapsed);

	return 0;
}
