#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <time.h>
#include <stdlib.h>
#include <strings.h>

//number of unsigned int elements in the benchmark array A
//(the benchmark treats every 1024 consecutive elements as one page)
//the expansion is parenthesized so it stays a single value inside larger
//expressions such as x % ARRAY_SIZE
#define ARRAY_SIZE (1024*1024*16)

//pin the calling thread to a single core
//  cpu_no: index of the CPU the thread should be restricted to
//If the kernel rejects the request, a diagnostic is written to stderr and
//the thread keeps whatever affinity it had before; the caller is not
//interrupted.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	//build a mask containing only the requested CPU
	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	//an unpinned thread would silently skew the timings, so report failure
	if (sched_setaffinity(syscall(SYS_gettid), sizeof(cpu_set_t), &mask) != 0) {
		perror("sched_setaffinity");
	}
}

//elapsed time between two timestamps
//  start: earlier timestamp
//  end:   later timestamp
//returns end - start expressed in nanoseconds
long time_diff(struct timespec start, struct timespec end)
{
  const long nsec_per_sec = 1000000000;
  time_t whole_sec = end.tv_sec - start.tv_sec;
  long frac_nsec = end.tv_nsec - start.tv_nsec;

  //the nanosecond field went backwards: borrow one second
  if (frac_nsec < 0) {
    whole_sec -= 1;
    frac_nsec += nsec_per_sec;
  }

  return whole_sec * nsec_per_sec + frac_nsec;
}

//benchmark array; volatile so accesses to it are not optimized away
volatile unsigned int A[ARRAY_SIZE];

//timestamps taken immediately before and after the timed loop
struct timespec time_begin;
struct timespec time_end;

//result of the most recent test_prefetch_queue() run
long long elapsed_time;

//Time a burst of software prefetches issued for every page of A.
//
//Steps, in order:
//  1. call _mm_clevict with hints 1 and 2 on every 16th element of A,
//  2. call _mm_prefetch with hint 2 on every 16th element of A,
//  3. time a loop that, for each 1024-element page of A, issues 32
//     prefetches with hint 1 at a stride of 16 elements (the first 512
//     elements of the page).
//The measured time divided by the number of pages (integer division, in
//nanoseconds) is stored in the global elapsed_time; the raw timestamps are
//left in time_begin and time_end.
//
//NOTE(review): a stride of 16 unsigned ints presumably corresponds to one
//64-byte cache line, and hints 1/2 presumably select L1/L2 on the intended
//target -- the numeric hint encoding differs between toolchains, confirm.
//NOTE(review): _mm_clevict/_mm_prefetch are used without a visible header
//that declares them; confirm the build provides the declarations.
void test_prefetch_queue()
{
    int i;
    volatile unsigned int *A_ti;

    //flush the caches
    for(i = 0; i < ARRAY_SIZE; i=i+16){
      _mm_clevict((void *)&A[i], 1);
      _mm_clevict((void *)&A[i], 2);
    }

    //cpuid is a serializing instruction; presumably used as a barrier so
    //the evictions complete before the next phase starts
    __asm cpuid

    //prefetch the caches to L2
    for(i = 0; i < ARRAY_SIZE; i=i+16){
      _mm_prefetch((void *)&A[i], 2);
    }

    __asm cpuid

    //begin timestamp; the monotonic clock is used because the realtime
    //clock can be stepped by time adjustments during the measurement
    clock_gettime(CLOCK_MONOTONIC, &time_begin);

    //the 32 prefetches are deliberately written out one by one so that no
    //inner-loop overhead is included in the timed region
    for(i = 0; i < ARRAY_SIZE/1024; i++) {
        //each page starts from A[i*1024]
        A_ti = &(A[i*1024]);

        _mm_prefetch((void *)A_ti, 1);
        _mm_prefetch((void *)(A_ti+16), 1);
        _mm_prefetch((void *)(A_ti+16*2), 1);
        _mm_prefetch((void *)(A_ti+16*3), 1);
        _mm_prefetch((void *)(A_ti+16*4), 1);
        _mm_prefetch((void *)(A_ti+16*5), 1);
        _mm_prefetch((void *)(A_ti+16*6), 1);
        _mm_prefetch((void *)(A_ti+16*7), 1);

        _mm_prefetch((void *)(A_ti+16*8), 1);
        _mm_prefetch((void *)(A_ti+16*9), 1);
        _mm_prefetch((void *)(A_ti+16*10), 1);
        _mm_prefetch((void *)(A_ti+16*11), 1);
        _mm_prefetch((void *)(A_ti+16*12), 1);
        _mm_prefetch((void *)(A_ti+16*13), 1);
        _mm_prefetch((void *)(A_ti+16*14), 1);
        _mm_prefetch((void *)(A_ti+16*15), 1);

        _mm_prefetch((void *)(A_ti+16*16), 1);
        _mm_prefetch((void *)(A_ti+16*17), 1);
        _mm_prefetch((void *)(A_ti+16*18), 1);
        _mm_prefetch((void *)(A_ti+16*19), 1);
        _mm_prefetch((void *)(A_ti+16*20), 1);
        _mm_prefetch((void *)(A_ti+16*21), 1);
        _mm_prefetch((void *)(A_ti+16*22), 1);
        _mm_prefetch((void *)(A_ti+16*23), 1);

        _mm_prefetch((void *)(A_ti+16*24), 1);
        _mm_prefetch((void *)(A_ti+16*25), 1);
        _mm_prefetch((void *)(A_ti+16*26), 1);
        _mm_prefetch((void *)(A_ti+16*27), 1);
        _mm_prefetch((void *)(A_ti+16*28), 1);
        _mm_prefetch((void *)(A_ti+16*29), 1);
        _mm_prefetch((void *)(A_ti+16*30), 1);
        _mm_prefetch((void *)(A_ti+16*31), 1);
    }

    //end timestamp, taken from the same clock as the begin timestamp
    clock_gettime(CLOCK_MONOTONIC, &time_end);

    //average nanoseconds spent per page
    elapsed_time = time_diff(time_begin, time_end)/(ARRAY_SIZE/1024);
}

//Entry point: pin the thread to core 1, touch the benchmark array, run the
//prefetch test five times and print the result of the last run followed by
//a tab. Always returns 0.
int main(void)
{
	int i;
	int run;

	set_cpu(1);

	//to avoid page faults and to guarantee pages are actually allocated,
	//write to every 16th element of the array
	for(i = 0; i < ARRAY_SIZE; i=i+16)
	  A[i] = (unsigned int)rand();

	//every run overwrites elapsed_time, so only the final measurement is
	//reported; the earlier runs presumably serve as warm-up
	for(run = 0; run < 5; run++)
	  test_prefetch_queue();

	//elapsed_time is a signed long long, so print it with %lld
	printf("%lld\t",elapsed_time);

	return 0;
}
