#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>

/* The L1 cache can hold 8 pages of data; beyond that, cache replacement
** happens. Addresses that are one page apart fall into the same cache set.
** ARRAY_SIZE is the number of elements in the test array, chosen to span
** 16 pages, i.e. double the L1 size (assuming 4-byte elements and 4 KiB
** pages -- confirm for the target).
** The expansion is parenthesized so the macro stays correct inside larger
** expressions such as x / ARRAY_SIZE. */
#define ARRAY_SIZE (1024*16)

/* Pin the calling thread to a single CPU.
**
** cpu_no: index of the CPU the thread should run on.
**
** Pinning is best-effort: if the kernel rejects the request, a diagnostic
** is written to stderr and the thread keeps its previous affinity. */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* a pid of 0 selects the calling thread */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

/* Test buffer: allocated and zeroed in main(), probed by
** test_prefetch_queue(). */
volatile unsigned int *A;

/* Low (l) and high (h) 32-bit halves of the start (1) and end (2)
** timestamps; written by the inline assembly in test_prefetch_queue(). */
unsigned int time_l1,time_h1,time_l2,time_h2;
/* Not referenced in the visible part of this file. */
long long sum;
/* Not referenced in the visible part of this file. */
unsigned int temp;
/* End timestamp minus start timestamp of the most recent
** test_prefetch_queue() call; printed by main(). */
long long elapsed;

/* Time 16 loads that are meant to fall into the same cache set.
**
** Author's notes: we access 16 lines in the same cache set (the way is 8),
** thus cache replacement happens. There are two types of access pattern,
** streaming and random, and two types of prefetching, with and without
** locality. This function loads the lines in ascending address order.
** In the first round we prefetch these 16 lines; in the second round we
** load these lines and observe the latency. This makes sense since we tend
** to prefetch each cache line only once.
**
** Steps:
**   1. evict A[160 + 1024*k], k = 0..15, with _mm_clevict;
**   2. prefetch the same 16 elements with _mm_prefetch;
**   3. read the timestamp counter, load the 16 elements, read it again.
**
** Takes no arguments and returns nothing. The result is left in the global
** `elapsed` (end - start); the raw 32-bit halves of both timestamps are
** left in time_l1/time_h1 (start) and time_l2/time_h2 (end).
**
** NOTE(review): the visible #include list contains no intrinsics header;
** confirm how _mm_clevict/_mm_prefetch are declared for the target compiler.
*/
void test_prefetch_queue()
{
        //flush these cache lines first: each element is evicted twice,
        //with hint values 1 and 2 (presumably the L1 and L2 levels --
        //confirm against the intrinsic's documentation)
        int i;
	for(i = 160; i < ARRAY_SIZE; i=i+1024){
	  _mm_clevict((void *)&A[i], 1);
	  _mm_clevict((void *)&A[i], 2);
	}
	//cpuid is presumably used as a serializing barrier so the evictions
	//are finished before the prefetches are issued -- confirm
	__asm cpuid

	//prefetch the cache line into L1 cache (also to L2)
	//the hint value 5 is toolchain-specific -- TODO confirm its meaning
	for(i = 160; i < ARRAY_SIZE; i=i+1024){
	  _mm_prefetch((void *)&A[i], 5);
	}
	//second barrier, between the prefetches and the timed section
	__asm cpuid
  
	//read begin timestamp
	//rdtsc returns the counter in edx:eax; the start value is parked in
	//esi (low half) and edi (high half) until the end of the function
	//NOTE(review): nothing visible reserves esi/edi across the loads
	//below -- confirm the generated code does not reuse rsi/rdi for the
	//address operands
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	//timed section: one 32-bit load per element, in ascending address
	//order, over the same 16 elements that were evicted and prefetched
	//above; the loaded value in r15d is discarded
	//NOTE(review): r15 is written but these asm statements list no
	//clobbers, so the compiler is not told that r15 changes
	asm("movl %0, %%r15d" : : "m"(A[160]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*2]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*3]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*4]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*5]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*6]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*7]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*8]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*9]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*10]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*11]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*12]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*13]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*14]));
	asm("movl %0, %%r15d" : : "m"(A[160+1024*15]));

	//read end timestamp
	//the serializing cpuid before this rdtsc is commented out, so the
	//end timestamp is taken without a barrier after the loads
	//__asm cpuid
	__asm rdtsc
	//store the parked start halves and the fresh end halves to globals
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	//rebuild both 64-bit timestamps from their halves; elapsed = end - start
	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);
}

/* Entry point of the benchmark.
**
** Pins the thread to CPU 1, allocates a zeroed, page-aligned array of
** ARRAY_SIZE unsigned ints, runs test_prefetch_queue() five times and
** prints the timestamp-counter delta of the last run followed by a tab
** (every run overwrites `elapsed`, so only the final value is reported).
**
** Returns 0 on success, EXIT_FAILURE if the page size cannot be queried
** or the array cannot be allocated. */
int main(void)
{
	const size_t bytes = (size_t)ARRAY_SIZE * sizeof *A;
	void *buf = NULL;
	long page;
	int rc;
	int i;

	set_cpu(1);

	page = sysconf(_SC_PAGESIZE);
	if (page <= 0) {
		perror("sysconf(_SC_PAGESIZE)");
		return EXIT_FAILURE;
	}

	/* page-aligned so that a stride of one page keeps a fixed offset
	** within each page */
	rc = posix_memalign(&buf, (size_t)page, bytes);
	if (rc != 0) {
		/* posix_memalign reports the error as its return value, not errno */
		fprintf(stderr, "posix_memalign: %s\n", strerror(rc));
		return EXIT_FAILURE;
	}
	memset(buf, 0, bytes);
	A = buf;

	//to avoid page fault
	for (i = 0; i < ARRAY_SIZE; i += 1024) {
		A[i] = i;
	}

	/* repeat the measurement; only the last result is printed */
	for (i = 0; i < 5; i++) {
		test_prefetch_queue();
	}

	/* elapsed is a signed long long, so print it with %lld */
	printf("%lld\t", elapsed);

	free((void *)A);
	A = NULL;

	return 0;
}
