#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>

//Number of unsigned int elements in A, sized so the array spans several
//pages (16 * 1024 elements; 64 KiB if unsigned int is 4 bytes).
//The expansion is parenthesized so the macro stays a single value when it
//is used inside a larger expression such as x / ARRAY_SIZE.
#define ARRAY_SIZE (1024*16)

//Pin the calling thread to a single CPU core.
//
//cpu_no: index of the core the thread should be restricted to.
//
//If the kernel rejects the request, a diagnostic is written to stderr and
//execution continues; the caller is then NOT pinned, so any timing taken
//afterwards should be treated with suspicion.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	//build an affinity mask that contains only the requested core
	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	//address the calling thread by its kernel thread id
	if (sched_setaffinity(syscall(SYS_gettid), sizeof(cpu_set_t), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

//buffer that the timed loads in test_prefetch_queue() read from
volatile unsigned int A[ARRAY_SIZE];

//32-bit halves of the two timestamps taken by test_prefetch_queue():
//suffix 1 = begin, suffix 2 = end; _l = low word, _h = high word
unsigned int time_l1;
unsigned int time_h1;
unsigned int time_l2;
unsigned int time_h2;

//not referenced anywhere in the visible code
long long sum;
unsigned int temp;

//end timestamp minus begin timestamp of the most recent
//test_prefetch_queue() call
long long elapsed;

//Distance, in elements of A, between two consecutive timed loads.
//NOTE(review): 16 unsigned ints are 64 bytes when unsigned int is 4 bytes,
//presumably one cache line on the target machine -- confirm.
#define TPQ_STRIDE 16

//Emit exactly one 32-bit load of A[TPQ_STRIDE * slot] into register `reg`
//(a string literal such as "r8d"). The same string is used as the clobber,
//so the destination and the declared clobber can never disagree.
#define TPQ_LOAD(reg, slot) \
	asm volatile("movl %0, %%" reg : : "m"(A[TPQ_STRIDE * (slot)]) : reg)

//Emit eight back-to-back loads for slots first .. first+7 into r8d .. r15d.
//Deliberately a plain statement sequence with no do/while wrapper, so no
//control-flow construct is placed between the timed instructions. Use it
//only as a complete statement.
#define TPQ_LOAD8(first) \
	TPQ_LOAD("r8d",  (first) + 0); \
	TPQ_LOAD("r9d",  (first) + 1); \
	TPQ_LOAD("r10d", (first) + 2); \
	TPQ_LOAD("r11d", (first) + 3); \
	TPQ_LOAD("r12d", (first) + 4); \
	TPQ_LOAD("r13d", (first) + 5); \
	TPQ_LOAD("r14d", (first) + 6); \
	TPQ_LOAD("r15d", (first) + 7)

//Time 128 loads from A, one every TPQ_STRIDE elements (A[0], A[16], ...,
//A[16*127]), and publish the result in the file-scope variables:
//  time_h1:time_l1  timestamp counter read before the first load
//  time_h2:time_l2  timestamp counter read after the last load
//  elapsed          end timestamp minus begin timestamp
//Takes no arguments and returns nothing. Requires x86-64 (uses cpuid,
//rdtsc, rdtscp and r8-r15).
void test_prefetch_queue()
{
	unsigned int start_hi, start_lo;
	unsigned int end_hi, end_lo;

	//read begin timestamp; cpuid is executed first as a serializing
	//instruction so earlier work cannot drift into the timed region.
	//The two halves are returned through output operands so the compiler
	//knows which registers hold them; parking them in esi/edi behind the
	//compiler's back lets it reuse those registers before they are read.
	//(Without optimization the compiler may spill the two values to the
	//stack right here, which adds two stores to the timed region.)
	asm volatile ("cpuid\n\t"
		"rdtsc\n\t"
		"mov %%edx, %0\n\t"
		"mov %%eax, %1\n\t"
		: "=r" (start_hi), "=r" (start_lo)
		:
		: "rax", "rbx", "rcx", "rdx");

	//a software prefetch (e.g. _mm_prefetch on an element of A) can be
	//inserted here when experimenting

	//the timed region: 16 groups of 8 loads = 128 loads
	TPQ_LOAD8(0);
	TPQ_LOAD8(8);
	TPQ_LOAD8(16);
	TPQ_LOAD8(24);
	TPQ_LOAD8(32);
	TPQ_LOAD8(40);
	TPQ_LOAD8(48);
	TPQ_LOAD8(56);
	TPQ_LOAD8(64);
	TPQ_LOAD8(72);
	TPQ_LOAD8(80);
	TPQ_LOAD8(88);
	TPQ_LOAD8(96);
	TPQ_LOAD8(104);
	TPQ_LOAD8(112);
	TPQ_LOAD8(120);

	//read end timestamp; the values are copied out of edx:eax before the
	//trailing cpuid overwrites them
	asm volatile ("rdtscp\n\t"
		"mov %%edx, %0\n\t"
		"mov %%eax, %1\n\t"
		"cpuid\n\t"
		: "=r" (end_hi), "=r" (end_lo)
		:
		: "rax", "rbx", "rcx", "rdx");

	//publish the raw halves only after timing has stopped
	time_h1 = start_hi;
	time_l1 = start_lo;
	time_h2 = end_hi;
	time_l2 = end_lo;

	//join the halves into 64-bit counter values and take the difference
	unsigned long long begin_tsc = ((unsigned long long)start_hi << 32) | start_lo;
	unsigned long long end_tsc = ((unsigned long long)end_hi << 32) | end_lo;
	elapsed = (long long)(end_tsc - begin_tsc);
}

#undef TPQ_LOAD8
#undef TPQ_LOAD
#undef TPQ_STRIDE

//Pin the thread to core 1, touch A at regular intervals, run the timed
//load sequence five times and print the tick count of the last run,
//followed by a tab. Always returns 0.
int main()
{
	set_cpu(1);

	//to avoid page faults and to guarantee pages are actually allocated,
	//write one element every 1024 elements
	//NOTE(review): 1024 unsigned ints are 4096 bytes when unsigned int is
	//4 bytes, presumably one page -- confirm for the target system
	for (int i = 0; i < ARRAY_SIZE; i += 1024) {
		A[i] = (unsigned int)i;
	}

	//elapsed is overwritten by every call, so only the measurement of the
	//fifth run reaches the printf below
	for (int run = 0; run < 5; run++) {
		test_prefetch_queue();
	}

	//elapsed is a long long; convert explicitly so the argument type
	//matches the %llu conversion
	printf("%llu\t", (unsigned long long)elapsed);

	return 0;
}
