#define _GNU_SOURCE
#include <stdint.h>
#include <inttypes.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/* Experiment parameters consumed by main(). Indices are in int32_t elements. */
const uint32_t ITER         = 5;    /* number of flush-then-measure rounds */
const uint32_t SIZE         = 1024; /* element count of the measured buffer */
const uint32_t TRIGGER_SIZE = 32;   /* strided timed loads cover indices [0, TRIGGER_SIZE) */
const uint32_t TARGET       = 32;   /* index of the load timed after the strided loads */
const uint32_t STRIDE       = 4;    /* index step between strided loads; index/STRIDE selects the lat[] slot */

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: zero-based index of the CPU the thread should run on.
 *
 * Pinning is best-effort: when the kernel rejects the request, a diagnostic
 * is written to stderr and the thread keeps its previous affinity, so the
 * caller continues to run (possibly migrating between cores).
 */
void set_cpu(int cpu_no)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu_no, &mask);

    /* A pid of 0 addresses the calling thread; no gettid syscall is needed. */
    if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
        perror("sched_setaffinity");
    }
}

/*
 * Drain outstanding memory operations and serialize the instruction stream.
 * CPUID is executed only for its serializing side effect; the values it
 * returns are discarded.
 */
static inline void serialize_pipeline(void)
{
    int cpuinfo[4];

    _mm_mfence();
    __cpuid(cpuinfo, 0);
}

/*
 * Busy-wait until at least `cycles` TSC ticks have passed since `since`.
 */
static inline void wait_cycles_since(int64_t since, int64_t cycles)
{
    while ((int64_t)_rdtsc() - since < cycles) {
        /* spin */
    }
}

/*
 * Time a single 32-bit load from *addr using the time-stamp counter.
 *
 * addr:    location to load from.
 * end_tsc: receives the TSC value read at the end of the measurement.
 * returns: end TSC minus begin TSC, i.e. the cycles spent on the load plus
 *          the 33 dependent `add` instructions that follow it.
 *
 * The whole CPUID/RDTSC ... RDTSCP/CPUID sequence lives in ONE asm statement
 * and hands both timestamps back through output operands. Keeping a value in
 * a fixed register across separate asm statements is not safe: the compiler
 * may use that register in between (for instance to address the operand).
 * Outputs are early-clobber because they are written before the memory input
 * is read. Forced inline so no call sits between consecutive measurements.
 */
static inline __attribute__((always_inline))
int64_t timed_load(const volatile int32_t *addr, int64_t *end_tsc)
{
    uint32_t begin_hi, begin_lo, end_hi, end_lo;

    __asm__ __volatile__(
        "cpuid\n\t"                 /* serialize before taking the start time */
        "rdtsc\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "movl %4, %%r8d\n\t"        /* the load being timed */
        ".rept 33\n\t"              /* 33 adds that depend on the loaded value */
        "add $1, %%r8d\n\t"
        ".endr\n\t"
        "rdtscp\n\t"                /* read end time after the preceding instructions */
        "mov %%edx, %2\n\t"
        "mov %%eax, %3\n\t"
        "cpuid\n\t"                 /* keep later instructions out of the timed region */
        : "=&r"(begin_hi), "=&r"(begin_lo), "=&r"(end_hi), "=&r"(end_lo)
        : "m"(*addr)
        : "rax", "rbx", "rcx", "rdx", "r8", "cc", "memory");

    const int64_t begin = (int64_t)(((uint64_t)begin_hi << 32) | begin_lo);
    const int64_t end = (int64_t)(((uint64_t)end_hi << 32) | end_lo);

    *end_tsc = end;
    return end - begin;
}

/*
 * Load-latency experiment.
 *
 * Pins the thread to CPU 2, fills a page-aligned buffer of SIZE int32_t
 * values, then repeats ITER times:
 *   1. flush the whole buffer from the cache hierarchy,
 *   2. time loads of a[0], a[STRIDE], ... for indices below TRIGGER_SIZE,
 *   3. time one load of a[TARGET].
 * Each round overwrites lat[], so the values printed at the end are those of
 * the final round. One line "<slot> <cycles>" is printed per slot, for slots
 * 0 .. TARGET/STRIDE.
 *
 * NOTE(review): this looks like a probe of whether the strided loads cause
 * a[TARGET] to be prefetched -- confirm with the author.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    enum {
        /* 16 int32_t = 64 bytes; presumably the cache-line size -- TODO confirm */
        FLUSH_STEP = 0x10,
        /* minimum TSC ticks to let pass before starting a timed section */
        SETTLE_CYCLES = 0x400
    };

    int64_t lat[SIZE / STRIDE];
    int64_t stop;
    uint32_t i, k;

    set_cpu(2);

    volatile int32_t *a = (volatile int32_t *)valloc(SIZE * sizeof *a);
    if (a == NULL) {
        perror("valloc");
        return EXIT_FAILURE;
    }

    /* Writing every element also makes sure the page is actually mapped. */
    for (i = 0; i < SIZE; ++i) {
        a[i] = (int32_t)rand();
    }

    for (i = 0; i < SIZE / STRIDE; ++i) {
        lat[i] = 0;
    }

    stop = (int64_t)_rdtsc();

    for (k = 0; k < ITER; ++k) {
        /* Evict the buffer so every round starts from the same cache state. */
        serialize_pipeline();
        for (i = 0; i < SIZE; i += FLUSH_STEP) {
            _mm_clflush((const void *)&a[i]);
        }
        serialize_pipeline();

        wait_cycles_since(stop, SETTLE_CYCLES);

        /* Strided loads; slot i/STRIDE records the latency of a[i]. */
        for (i = 0; i < TRIGGER_SIZE; i += STRIDE) {
            lat[i / STRIDE] = timed_load(&a[i], &stop);
        }

        serialize_pipeline();
        wait_cycles_since(stop, SETTLE_CYCLES);

        /* The load of interest, timed after the strided sequence. */
        lat[TARGET / STRIDE] = timed_load(&a[TARGET], &stop);
    }

    for (i = 0; i <= TARGET / STRIDE; ++i) {
        printf("%3d %3"PRId64"\n", (int)i, lat[i]);
    }

    free((void *)a);
    return 0;
}
