#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

#define ARRAY_SIZE 4096

// Pin the calling thread to a single logical CPU.
//
// cpu_no: zero-based CPU index to run on. An index outside the range that
//         cpu_set_t can represent leaves the mask empty, which the kernel
//         rejects; that case is reported like any other failure.
//
// Failure is reported on stderr and execution continues unpinned, so the
// caller's behaviour is unchanged apart from the diagnostic.
void set_cpu(int cpu_no)
{
  cpu_set_t mask;

  CPU_ZERO(&mask);
  CPU_SET(cpu_no, &mask);

  // A pid of 0 addresses the calling thread, so no gettid syscall is needed.
  if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
    perror("sched_setaffinity");
  }
}

// Buffer whose elements are loaded by the timed probes; allocated and zeroed
// in main(). Accessed through a volatile-qualified pointer.
volatile unsigned int *A;
// Low/high 32-bit halves of two timestamps. Not written or read by any active
// code shown here.
unsigned int time_l1,time_h1,time_l2,time_h2;
// start/stop hold the _rdtsc() values taken around each timed load.
// elapsed is not written or read by any active code shown here.
long long elapsed, start, stop;
// latency[k] receives stop - start for the load of A[16*k]. File-scope, so
// entries that are never written stay 0.
long long latency[128];

void test_prefetch_queue()
{
  int i;  
  for(i = 0; i < ARRAY_SIZE; i=i+16){
    _mm_clevict((void *)&A[i], 1);
    _mm_clevict((void *)&A[i], 2);
  }

  __asm cpuid

  //read begin timestamp
  /*__asm cpuid
  __asm rdtsc
  __asm mov esi,eax
  __asm mov edi,edx //*/

  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*0]) : "r8d");
  stop = _rdtsc();
  latency[0] = stop - start;
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*1]) : "r8d");
  stop = _rdtsc();
  latency[1] = stop - start;//*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*2]) : "r8d");
  stop = _rdtsc();
  latency[2] = stop - start;//*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*3]) : "r8d");
  stop = _rdtsc();
  latency[3] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*4]) : "r8d");
  stop = _rdtsc();
  latency[4] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*5]) : "r8d");
  stop = _rdtsc();
  latency[5] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*6]) : "r8d");
  stop = _rdtsc();
  latency[6] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*7]) : "r8d");
  stop = _rdtsc();
  latency[7] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*8]) : "r8d");
  stop = _rdtsc();
  latency[8] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*9]) : "r8d");
  stop = _rdtsc();
  latency[9] = stop - start; //*/

  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*10]) : "r8d");
  stop = _rdtsc();
  latency[10] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*11]) : "r8d");
  stop = _rdtsc();
  latency[11] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*12]) : "r8d");
  stop = _rdtsc();
  latency[12] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*13]) : "r8d");
  stop = _rdtsc();
  latency[13] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*14]) : "r8d");
  stop = _rdtsc();
  latency[14] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*15]) : "r8d");
  stop = _rdtsc();
  latency[15] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*16]) : "r8d");
  stop = _rdtsc();
  latency[16] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*17]) : "r8d");
  stop = _rdtsc();
  latency[17] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*18]) : "r8d");
  stop = _rdtsc();
  latency[18] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*19]) : "r8d");
  stop = _rdtsc();
  latency[19] = stop - start; //*/

  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*20]) : "r8d");
  stop = _rdtsc();
  latency[20] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*21]) : "r8d");
  stop = _rdtsc();
  latency[21] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*22]) : "r8d");
  stop = _rdtsc();
  latency[22] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*23]) : "r8d");
  stop = _rdtsc();
  latency[23] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*24]) : "r8d");
  stop = _rdtsc();
  latency[24] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*25]) : "r8d");
  stop = _rdtsc();
  latency[25] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*26]) : "r8d");
  stop = _rdtsc();
  latency[26] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*27]) : "r8d");
  stop = _rdtsc();
  latency[27] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*28]) : "r8d");
  stop = _rdtsc();
  latency[28] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*29]) : "r8d");
  stop = _rdtsc();
  latency[29] = stop - start; //*/

  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*30]) : "r8d");
  stop = _rdtsc();
  latency[30] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*31]) : "r8d");
  stop = _rdtsc();
  latency[31] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*32]) : "r8d");
  stop = _rdtsc();
  latency[32] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*33]) : "r8d");
  stop = _rdtsc();
  latency[33] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*34]) : "r8d");
  stop = _rdtsc();
  latency[34] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*35]) : "r8d");
  stop = _rdtsc();
  latency[35] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*36]) : "r8d");
  stop = _rdtsc();
  latency[36] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*37]) : "r8d");
  stop = _rdtsc();
  latency[37] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*38]) : "r8d");
  stop = _rdtsc();
  latency[38] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*39]) : "r8d");
  stop = _rdtsc();
  latency[39] = stop - start; //*/

  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*40]) : "r8d");
  stop = _rdtsc();
  latency[40] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*41]) : "r8d");
  stop = _rdtsc();
  latency[41] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*42]) : "r8d");
  stop = _rdtsc();
  latency[42] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*43]) : "r8d");
  stop = _rdtsc();
  latency[43] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*44]) : "r8d");
  stop = _rdtsc();
  latency[44] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*45]) : "r8d");
  stop = _rdtsc();
  latency[45] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*46]) : "r8d");
  stop = _rdtsc();
  latency[46] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*47]) : "r8d");
  stop = _rdtsc();
  latency[47] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*48]) : "r8d");
  stop = _rdtsc();
  latency[48] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*49]) : "r8d");
  stop = _rdtsc();
  latency[49] = stop - start; //*/

  __asm cpuid

  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*50]) : "r8d");
  stop = _rdtsc();
  latency[50] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*51]) : "r8d");
  stop = _rdtsc();
  latency[51] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*52]) : "r8d");
  stop = _rdtsc();
  latency[52] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*53]) : "r8d");
  stop = _rdtsc();
  latency[53] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*54]) : "r8d");
  stop = _rdtsc();
  latency[54] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*55]) : "r8d");
  stop = _rdtsc();
  latency[55] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*56]) : "r8d");
  stop = _rdtsc();
  latency[56] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*57]) : "r8d");
  stop = _rdtsc();
  latency[57] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*58]) : "r8d");
  stop = _rdtsc();
  latency[58] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*59]) : "r8d");
  stop = _rdtsc();
  latency[59] = stop - start; //*/

  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*60]) : "r8d");
  stop = _rdtsc();
  latency[60] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*61]) : "r8d");
  stop = _rdtsc();
  latency[61] = stop - start; //*/
  start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*62]) : "r8d");
  stop = _rdtsc();
  latency[62] = stop - start; //*/
  /*start = _rdtsc();
  asm volatile("movl %0, %%r8d" : : "m"(A[16*63]) : "r8d");
  stop = _rdtsc();
  latency[63] = stop - start; //*/

  //read end timestamp	
  /*__asm rdtsc
  __asm mov time_l1,esi
  __asm mov time_h1,edi
  __asm mov time_l2,eax
  __asm mov time_h2,edx

  elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1); //*/
}

// Entry point: pin to CPU 1, build a page-aligned, zeroed buffer of
// ARRAY_SIZE unsigned ints, run the probe routine several times and print
// the per-index latencies.
//
// Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
int main(void)
{
  enum { RUNS = 5, ENTRIES_REPORTED = 64 };
  const size_t bytes = ARRAY_SIZE * sizeof *A;
  void *buf = NULL;
  long page_size;
  int rc;
  int i;

  set_cpu(1);

  // posix_memalign() replaces the obsolete valloc(): same page alignment,
  // but with an explicit error code that is checked before the buffer is
  // touched.
  page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0) {
    fprintf(stderr, "sysconf(_SC_PAGESIZE) failed\n");
    return EXIT_FAILURE;
  }
  rc = posix_memalign(&buf, (size_t)page_size, bytes);
  if (rc != 0) {
    fprintf(stderr, "posix_memalign failed (error %d)\n", rc);
    return EXIT_FAILURE;
  }
  bzero(buf, bytes);
  A = (volatile unsigned int *)buf;

  // Write one element every 1024 so the pages are faulted in and physically
  // backed before any timing takes place.
  for (i = 0; i < ARRAY_SIZE; i = i + 1024) {
    A[i] = (unsigned int)i;
  }

  // Every run overwrites the same latency[] entries, so only the last run's
  // numbers are printed; the earlier runs act as warm-up.
  for (i = 0; i < RUNS; i++) {
    test_prefetch_queue();
  }

  // latency[] is a zero-initialized global, so indices that the probe
  // routine does not write are printed as 0.
  for (i = 0; i < ENTRIES_REPORTED; i++) {
    printf("latency[%d] = %lld\n", i, latency[i]);
  }

  free(buf);
  A = NULL;

  return 0;
}
