#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/* Number of unsigned int elements in the test array A. The expansion is
 * parenthesized so the macro stays a single value inside larger expressions
 * (e.g. x / ARRAY_SIZE). */
#define ARRAY_SIZE (1024*2)

/*
 * Pin the calling thread to a single core.
 *
 * cpu_no: index of the CPU the thread should be restricted to.
 *
 * The request is best-effort: if the kernel rejects it (for example because
 * cpu_no is not a CPU this process may run on), a diagnostic is written to
 * stderr and execution continues with the previous affinity.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* The raw gettid syscall supplies the kernel thread id of the caller. */
	if (sched_setaffinity(syscall(SYS_gettid), sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

/* Test buffer: allocated and zero-filled in main(), then evicted and loaded
 * element-wise by test_prefetch_queue(). Accessed through a volatile pointer. */
volatile unsigned int *A;
/* Low/high 32-bit halves of the two rdtsc readings stored by the inline
 * assembly in test_prefetch_queue(): *1 is taken before the timed load,
 * *2 after it. */
unsigned int time_l1,time_h1,time_l2,time_h2;
/* Not referenced anywhere in the visible source. */
long long sum;
/* Not referenced anywhere in the visible source. */
unsigned int temp;
/* Difference between the two 64-bit timestamps; written by
 * test_prefetch_queue() and printed by main(). */
long long elapsed;

/*
 * Time a single load of A[16*8] with rdtsc after a fixed sequence of cache
 * evictions and loads.
 *
 * Steps, in order:
 *   1. _mm_clevict is issued with hint 1 and hint 2 for A[1024..ARRAY_SIZE)
 *      in steps of 16 elements, and for A[16*5], A[16*6], A[16*7], A[16*8].
 *   2. A[16*5] and A[16*6] are loaded into r9d, 1,000,000 "imull r9d, r9d"
 *      instructions are executed, then A[16*7] is loaded.
 *   3. cpuid + rdtsc capture the start timestamp, A[16*8] is loaded, and a
 *      second rdtsc captures the end timestamp.
 *   4. The raw halves are stored in time_l1/time_h1/time_l2/time_h2 and the
 *      difference in the global `elapsed`.
 *
 * Takes no arguments and returns nothing; the result is reported only through
 * those globals.
 *
 * NOTE(review): the bare "__asm <insn>" lines and _mm_clevict look like
 * Intel-compiler / Xeon Phi specifics — confirm the intended toolchain.
 * NOTE(review): the GNU-style asm statements write r9d without listing it as
 * a clobber, and the start timestamp is kept in esi/edi across such a
 * statement; this relies on the compiler not using those registers in
 * between — verify in the generated code.
 */
void test_prefetch_queue()
{
        int i, j;
	// Evict the upper part of the array, one element out of every 16.
	// Presumably 16 unsigned ints == one 64-byte cache line and the second
	// argument selects the cache level (1 = L1, 2 = L2) — TODO confirm
	// against the _mm_clevict documentation.
	for(i = 1024; i < ARRAY_SIZE; i=i+16){
	  _mm_clevict((void *)&A[i], 1);
	  _mm_clevict((void *)&A[i], 2);
	}//*/

	// Evict the four elements that are loaded below, first with hint 1 ...
        _mm_clevict((void *)&A[16*5], 1);
        _mm_clevict((void *)&A[16*6], 1);
        _mm_clevict((void *)&A[16*7], 1);
        _mm_clevict((void *)&A[16*8], 1);

	// ... then with hint 2.
        _mm_clevict((void *)&A[16*5], 2);
        _mm_clevict((void *)&A[16*6], 2);
        _mm_clevict((void *)&A[16*7], 2);
        _mm_clevict((void *)&A[16*8], 2);

	// cpuid is executed here presumably as a serializing barrier, so the
	// evictions above are complete before the loads start — TODO confirm.
	__asm cpuid

	// Two back-to-back loads of the 5th and 6th 16-element groups.
	asm("movl %0, %%r9d" : : "m"(A[16*5]));
	asm("movl %0, %%r9d" : : "m"(A[16*6]));
        //for(i = 1024; i < ARRAY_SIZE; i=i+16){
	//asm("movl %0, %%r9d" : : "m"(A[i]));
	// Delay loop: one million multiplies on r9d between the second and
	// third load. The surrounding outer loop is disabled (commented out).
	  for(j = 0; j < 1000000; j++){
	    asm("imull %r9d, %r9d");
	  }
	//}//*/
	// Third load, issued after the delay.
	asm("movl %0, %%r9d" : : "m"(A[16*7]));
        
	// Read the begin timestamp: cpuid first, then rdtsc; the eax/edx result
	// is parked in esi/edi until it is stored to the globals below.
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	// The load being timed.
	asm("movl %0, %%r9d" : : "m"(A[16*8]));

	// Read the end timestamp and store both readings into the globals.
	// NOTE(review): the cpuid before this rdtsc is commented out, so nothing
	// here serializes the timed load against the second rdtsc — confirm this
	// is intentional.
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	// Reassemble the two 64-bit timestamps and keep their difference.
	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);

	// Disabled debug output.
	//printf("%u\t%u\n",time_h1,time_l1);
	//printf("%u\t%u\n",time_h2,time_l2);
	//printf("%llu\t",elapsed);
	//unsigned long A_addr = (unsigned long)(&A[0]);
	//printf("\nA_addr = %llu\n", A_addr);
}

/*
 * Program entry point: pins the thread to CPU 1, allocates and touches the
 * test array, runs the measurement five times and prints the elapsed value
 * left behind by the final run, followed by a tab.
 *
 * Returns 0 on success, EXIT_FAILURE if the array cannot be allocated.
 */
int main(void)
{
	enum { RUNS = 5 };
	const size_t bytes = ARRAY_SIZE * sizeof *A;
	int i;

	set_cpu(1);

	A = (volatile unsigned int *)valloc(bytes);
	if (A == NULL) {
		/* Without the buffer there is nothing to measure. */
		perror("valloc");
		return EXIT_FAILURE;
	}

	/* Touch the whole buffer up front to avoid page faults during the
	 * measurement and to ensure the physical pages are allocated. */
	bzero((void *)A, bytes);
	for (i = 0; i < ARRAY_SIZE; i = i + 1024)
		A[i] = i;

	/* Each run overwrites `elapsed`; only the last result is printed, so
	 * the earlier runs presumably act as warm-up. */
	for (i = 0; i < RUNS; i++)
		test_prefetch_queue();

	/* elapsed is a signed long long; cast so the argument matches %llu. */
	printf("%llu\t", (unsigned long long)elapsed);

	return 0;
}
