#define _GNU_SOURCE
#include <emmintrin.h>
#include <sched.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <xmmintrin.h>

//number of unsigned long long elements in the test array A
#define ARRAY_SIZE 1024

// Pin the calling thread to a single CPU.
//
// cpu_no: index of the CPU the thread should be restricted to.
//
// If the affinity cannot be set, the error is reported on stderr and the
// function returns normally, so the caller keeps running unpinned.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	// A pid of 0 selects the calling thread.
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

// Array walked by the timed load chain in test_prefetch_queue(); main()
// stores in every 8th element the address of the element 8 slots further on.
volatile unsigned long long A[ARRAY_SIZE];
// High (h) and low (l) 32-bit halves of the start (1) and end (2)
// timestamps, written by test_prefetch_queue().
unsigned int time_l1,time_h1,time_l2,time_h2;
// Not referenced anywhere in the visible code.
long long sum;
// Not referenced anywhere in the visible code.
unsigned int temp;
// End timestamp minus start timestamp of the most recent
// test_prefetch_queue() call; printed by main().
long long elapsed;

// Time a short dependent-load chain through A after the array has been
// flushed from the caches and then software-prefetched.
//
// Sequence:
//   1. serialize, then clflush every 8th element of A;
//   2. serialize, then issue a prefetch for every 8th element of A;
//   3. read the TSC, load A[0], follow the stored pointers through 16
//      dependent loads, execute 34 dependent adds, read the TSC again.
//
// Results are stored in the globals time_h1/time_l1 (start timestamp halves),
// time_h2/time_l2 (end timestamp halves) and elapsed (end - start).
void test_prefetch_queue()
{
	int i;
	unsigned int start_hi, start_lo, end_hi, end_lo;
	unsigned long long node;

	asm volatile ("mfence\n\t"
		      "cpuid\n\t"
		      :
		      :
		      : "rax", "rbx", "rcx", "rdx", "memory");

	//flush caches
	for (i = 0; i < ARRAY_SIZE; i += 8) {
		_mm_clflush((void *)&A[i]);
	}

	asm volatile ("mfence\n\t"
		      "cpuid\n\t"
		      :
		      :
		      : "rax", "rbx", "rcx", "rdx", "memory");

	//prefetch caches
	for (i = 0; i < ARRAY_SIZE; i += 8) {
		_mm_prefetch((void *)&A[i], _MM_HINT_T2);
	}

	// The whole timed region is a single asm statement: values that must
	// survive from the first timestamp to the last (start timestamp, chain
	// pointer) live in compiler-allocated registers, so no compiler-generated
	// code can run inside the region or reuse those registers.
	// The outputs are early-clobber because they are written before the
	// memory input is read. The chain uses a full 64-bit register because
	// the elements of A hold 64-bit addresses.
	asm volatile ("mfence\n\t"
		      "cpuid\n\t"
		      "rdtsc\n\t"
		      "mov %%edx, %[shi]\n\t"
		      "mov %%eax, %[slo]\n\t"
		      "mov %[head], %[node]\n\t"
		      ".rept 16\n\t"
		      "mov (%[node]), %[node]\n\t"
		      ".endr\n\t"
		      ".rept 34\n\t"
		      "add $1, %[node]\n\t"
		      ".endr\n\t"
		      "rdtscp\n\t"
		      "mov %%edx, %[ehi]\n\t"
		      "mov %%eax, %[elo]\n\t"
		      "cpuid\n\t"
		      : [shi] "=&r" (start_hi), [slo] "=&r" (start_lo),
			[ehi] "=&r" (end_hi), [elo] "=&r" (end_lo),
			[node] "=&r" (node)
		      : [head] "m" (A[0])
		      : "rax", "rbx", "rcx", "rdx", "cc", "memory");

	// The final chain value is irrelevant; only the time it took matters.
	(void)node;

	time_h1 = start_hi;
	time_l1 = start_lo;
	time_h2 = end_hi;
	time_l2 = end_lo;

	elapsed = ((unsigned long long)time_h2 << 32 | time_l2)
		- ((unsigned long long)time_h1 << 32 | time_l1);
}

// Pin the process to CPU 1, build the pointer chain in A, run the
// measurement five times and print the elapsed value of the last run
// (each call to test_prefetch_queue() overwrites elapsed).
int main(void)
{
	int i;
	int run;

	set_cpu(1);

	//to avoid page faults and to guarantee pages are actually allocated,
	//write to the array before measuring
	//initialize pointer chasing: every 8th element holds the address of the
	//element 8 slots further on; the last link holds ARRAY_SIZE instead of
	//an address
	A[ARRAY_SIZE-8] = ARRAY_SIZE;
	for (i = ARRAY_SIZE-16; i >= 0; i -= 8) {
		A[i] = (unsigned long long)&A[i+8];
	}

	for (run = 0; run < 5; run++) {
		test_prefetch_queue();
	}

	// elapsed is a signed long long; cast so the argument matches %llu.
	printf("%llu\t", (unsigned long long)elapsed);

	return 0;
}
