#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/* L1 can hold 8 pages of data; beyond that, cache replacement happens.
** The stride needed to land in the same cache set is one page.
** We set the array size to 16 pages: double the L1 size. */
#define ARRAY_SIZE (1024*16)	/* element count, not bytes; parenthesised so it expands safely inside larger expressions */

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the CPU the thread should be restricted to.
 *
 * A failure to set the affinity is reported on stderr but is not fatal:
 * the function returns normally and the thread keeps its previous affinity.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 designates the calling thread. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

/* Buffer read by test_prefetch_queue(); allocated in main(). */
volatile unsigned int *A;

/* Low/high 32-bit halves of the begin (…1) and end (…2) timestamps
** recorded by the most recent test_prefetch_queue() call. */
unsigned int time_l1,time_h1,time_l2,time_h2;

/* End timestamp minus begin timestamp of the most recent run.
** Unsigned: it is computed with unsigned arithmetic and printed with %llu. */
unsigned long long elapsed;

/* at least half pages should be L1 cache miss
** and L2 cache hit if correct */
void test_prefetch_queue()
{
        volatile unsigned int *A_ti;
	int i;

	//read begin timestamp
	//__asm mov rbx,0
	__asm cpuid
	__asm rdtsc
	__asm mov esi,eax
	__asm mov edi,edx

	//unroll one page
	for(i = 0; i < ARRAY_SIZE; i=i+1024){

	A_ti = &A[i];

	asm("movl %0, %%r8d" : : "m"(A_ti[0]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*2]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*3]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*4]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*5]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*6]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*7]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*8]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*9]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*10]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*11]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*12]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*13]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*14]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*15]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*16]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*17]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*18]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*19]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*20]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*21]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*22]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*23]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*24]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*25]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*26]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*27]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*28]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*29]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*30]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*31]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*32]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*33]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*34]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*35]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*36]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*37]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*38]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*39]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*40]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*41]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*42]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*43]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*44]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*45]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*46]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*47]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*48]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*49]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*50]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*51]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*52]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*53]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*54]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*55]));

	asm("movl %0, %%r8d" : : "m"(A_ti[16*56]));
	asm("movl %0, %%r9d" : : "m"(A_ti[16*57]));
	asm("movl %0, %%r10d" : : "m"(A_ti[16*58]));
	asm("movl %0, %%r11d" : : "m"(A_ti[16*59]));
	asm("movl %0, %%r12d" : : "m"(A_ti[16*60]));
	asm("movl %0, %%r13d" : : "m"(A_ti[16*61]));
	asm("movl %0, %%r14d" : : "m"(A_ti[16*62]));
	asm("movl %0, %%r15d" : : "m"(A_ti[16*63]));

	}

	//read end timestamp	
	//__asm cpuid
	__asm rdtsc
	__asm mov time_l1,esi
	__asm mov time_h1,edi
	__asm mov time_l2,eax
	__asm mov time_h2,edx

	elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);
}

/*
 * Pin the thread to CPU 1, allocate the page-aligned test array, run the
 * timed pass five times and print the elapsed count of the last run
 * followed by a tab.
 *
 * Returns 0 on success, 1 if the page size cannot be determined or the
 * array cannot be allocated.
 */
int main(void)
{
	size_t bytes = sizeof *A * (ARRAY_SIZE);
	size_t page_size;
	long page;
	void *buf;
	int run;

	set_cpu(1);

	page = sysconf(_SC_PAGESIZE);
	if (page <= 0) {
		fprintf(stderr, "cannot determine page size\n");
		return 1;
	}
	page_size = (size_t)page;

	/* aligned_alloc requires the size to be a multiple of the alignment. */
	bytes = (bytes + page_size - 1) / page_size * page_size;

	/* The buffer is deliberately left unwritten before timing.  Author's
	 * note: allocated-but-untouched pages may not yet be backed by real
	 * physical pages (think copy-on-write).  To measure against populated
	 * pages, zero the buffer or write one element per page here. */
	buf = aligned_alloc(page_size, bytes);
	if (buf == NULL) {
		perror("aligned_alloc");
		return 1;
	}
	A = buf;

	/* Only the last run's result is printed; presumably the earlier runs
	 * serve as warm-up. */
	for (run = 0; run < 5; run++)
		test_prefetch_queue();

	printf("%llu\t", (unsigned long long)elapsed);

	free(buf);
	return 0;
}
