#define _GNU_SOURCE
#include <stdint.h>
#include <inttypes.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/* Number of complete flush / prefetch / measure passes performed by main(). */
const uint32_t ITER   = 0x10;
/* Number of int32_t elements in each buffer allocated by main(). */
const uint32_t SIZE   = 0x400;
/* Distance, in elements, between two consecutive timed accesses in main(). */
const uint32_t STRIDE = 0x40;

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the only CPU to leave set in the thread's affinity mask.
 *
 * Pinning is best-effort. If cpu_no cannot be represented in a cpu_set_t, or
 * if sched_setaffinity() reports an error, a diagnostic is written to stderr
 * and the function simply returns; the caller keeps running.
 */
void set_cpu(int cpu_no)
{
    cpu_set_t mask;

    if (cpu_no < 0 || cpu_no >= CPU_SETSIZE) {
        fprintf(stderr, "set_cpu: CPU index %d is out of range\n", cpu_no);
        return;
    }

    CPU_ZERO(&mask);
    CPU_SET(cpu_no, &mask);

    /* The kernel thread id (gettid) is passed so only this thread is targeted. */
    if (sched_setaffinity((pid_t)syscall(SYS_gettid), sizeof(mask), &mask) != 0)
        perror("sched_setaffinity");
}

/*
 * Instruction-string builders for the dependent add chain executed inside the
 * timed window. ADD_R8_X33 expands to exactly 33 copies of "add $1, %r8d".
 * They are meant for extended asm, hence the doubled '%'.
 */
#define ADD_R8_X1  "add $1, %%r8d\n\t"
#define ADD_R8_X2  ADD_R8_X1  ADD_R8_X1
#define ADD_R8_X4  ADD_R8_X2  ADD_R8_X2
#define ADD_R8_X8  ADD_R8_X4  ADD_R8_X4
#define ADD_R8_X16 ADD_R8_X8  ADD_R8_X8
#define ADD_R8_X32 ADD_R8_X16 ADD_R8_X16
#define ADD_R8_X33 ADD_R8_X32 ADD_R8_X1

/* Issue a single 32-bit load of `elem` into r8d; the loaded value is discarded. */
#define TIMED_LOAD(elem) \
    __asm__ volatile ("movl %0, %%r8d\n\t" : : "m" (elem) : "r8")

/* Step, in int32_t elements, of the flush and prefetch sweeps (0x10 * 4 = 64
 * bytes; presumably one cache line on the target CPU -- confirm). */
enum { SWEEP_STEP = 0x10 };

/* Minimum number of TSC ticks that must separate the previous end timestamp
 * from the start of the next measurement pass. */
enum { SETTLE_TICKS = 0x400 };

/*
 * Allocate `count` int32_t elements with valloc() and zero them.
 * Prints a diagnostic and terminates the process if the allocation fails, so
 * callers never receive NULL.
 */
static volatile int32_t *alloc_zeroed_words(size_t count)
{
    size_t bytes = count * sizeof(int32_t);
    void *mem = valloc(bytes);

    if (mem == NULL) {
        perror("valloc");
        exit(EXIT_FAILURE);
    }
    bzero(mem, bytes);
    return mem;
}

/*
 * Memory-access timing microbenchmark.
 *
 * Pins the thread to CPU 2, allocates nine buffers of SIZE int32_t elements
 * and fills them with rand() values. Then, ITER times:
 *   1. flushes every SWEEP_STEP-th element of all nine buffers (clflush),
 *   2. prefetches the same addresses with the _MM_HINT_T1 hint,
 *   3. spins until SETTLE_TICKS TSC ticks have passed since the last end stamp,
 *   4. for every STRIDE-th index, times one load from each of array_a..array_h
 *      followed by 33 dependent adds, bracketed by cpuid/rdtsc and
 *      rdtscp/cpuid, and stores the tick delta in lat[].
 *
 * Each pass overwrites lat[], so the table printed at the end ("index delta"
 * per line) reflects the final pass only.
 *
 * Define TIME_NINTH_ARRAY at compile time to also load from array_i inside the
 * timed window; by default array_i is flushed and prefetched but not loaded.
 *
 * Returns 0 on completion; exits with EXIT_FAILURE if a buffer cannot be
 * allocated.
 */
int main(void) {
    set_cpu(2);

    int cpuinfo[4];
    int64_t lat[SIZE/STRIDE];
    unsigned int time_l1, time_h1, time_l2, time_h2;
    volatile int64_t start, stop, b;

    volatile int32_t *array_a = alloc_zeroed_words(SIZE);
    volatile int32_t *array_b = alloc_zeroed_words(SIZE);
    volatile int32_t *array_c = alloc_zeroed_words(SIZE);
    volatile int32_t *array_d = alloc_zeroed_words(SIZE);
    volatile int32_t *array_e = alloc_zeroed_words(SIZE);
    volatile int32_t *array_f = alloc_zeroed_words(SIZE);
    volatile int32_t *array_g = alloc_zeroed_words(SIZE);
    volatile int32_t *array_h = alloc_zeroed_words(SIZE);
    volatile int32_t *array_i = alloc_zeroed_words(SIZE);

    /* Same rand() consumption order as before: a..i for each index. */
    for (uint32_t i = 0; i < SIZE; ++i) {
        array_a[i] = (int32_t)rand();
        array_b[i] = (int32_t)rand();
        array_c[i] = (int32_t)rand();
        array_d[i] = (int32_t)rand();
        array_e[i] = (int32_t)rand();
        array_f[i] = (int32_t)rand();
        array_g[i] = (int32_t)rand();
        array_h[i] = (int32_t)rand();
        array_i[i] = (int32_t)rand();
    }

    for (uint32_t i = 0; i < SIZE/STRIDE; ++i)
        lat[i] = 0;

    b = start = stop = _rdtsc();

    for (uint32_t k = 0; k < ITER; ++k) {

        /*
         * cpuid is used purely as a barrier between phases. The leaf is now
         * the constant 0; it used to be an uninitialised variable, which is
         * undefined behaviour to read.
         */
        _mm_mfence();
        __cpuid(cpuinfo, 0);

        /* Phase 1: evict the sampled lines of all nine buffers. */
        for (uint32_t i = 0; i < SIZE; i += SWEEP_STEP) {
            _mm_clflush((const void *)&array_a[i]);
            _mm_clflush((const void *)&array_b[i]);
            _mm_clflush((const void *)&array_c[i]);
            _mm_clflush((const void *)&array_d[i]);
            _mm_clflush((const void *)&array_e[i]);
            _mm_clflush((const void *)&array_f[i]);
            _mm_clflush((const void *)&array_g[i]);
            _mm_clflush((const void *)&array_h[i]);
            _mm_clflush((const void *)&array_i[i]);
        }

        _mm_mfence();
        __cpuid(cpuinfo, 0);

        /* Phase 2: request the same lines again with the T1 prefetch hint. */
        for (uint32_t i = 0; i < SIZE; i += SWEEP_STEP) {
            _mm_prefetch((const char *)&array_a[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_b[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_c[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_d[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_e[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_f[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_g[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_h[i], _MM_HINT_T1);
            _mm_prefetch((const char *)&array_i[i], _MM_HINT_T1);
        }

        _mm_mfence();
        __cpuid(cpuinfo, 0);

        /* Phase 3: busy-wait until SETTLE_TICKS have elapsed since `stop`. */
        do {
            b = _rdtsc();
        } while (b - stop < SETTLE_TICKS);

        /* Phase 4: timed accesses. */
        for (uint32_t i = 0; i < SIZE; i += STRIDE) {
            /*
             * Begin timestamp. The two halves are handed back through asm
             * outputs so the compiler keeps them alive itself. Parking them in
             * esi/edi across separate asm statements is unsafe: a clobber does
             * not reserve a register, so the compiler may reuse esi/edi for
             * address computation between the statements.
             * NOTE(review): in unoptimised builds the compiler may spill these
             * two outputs to the stack inside the timed window.
             */
            __asm__ volatile ("cpuid\n\t"
                "rdtsc\n\t"
                "mov %%edx, %0\n\t"
                "mov %%eax, %1\n\t"
                : "=r" (time_h1), "=r" (time_l1)
                :
                : "rax", "rbx", "rcx", "rdx");

            TIMED_LOAD(array_a[i]);
            TIMED_LOAD(array_b[i]);
            TIMED_LOAD(array_c[i]);
            TIMED_LOAD(array_d[i]);
            TIMED_LOAD(array_e[i]);
            TIMED_LOAD(array_f[i]);
            TIMED_LOAD(array_g[i]);
            TIMED_LOAD(array_h[i]);
#ifdef TIME_NINTH_ARRAY
            TIMED_LOAD(array_i[i]);
#endif

            /* 33 adds on r8d, the register written by the loads above. */
            __asm__ volatile (ADD_R8_X33 : : : "r8", "cc");

            /* End timestamp: rdtscp, save both halves, then cpuid. */
            __asm__ volatile ("rdtscp\n\t"
                "mov %%edx, %0\n\t"
                "mov %%eax, %1\n\t"
                "cpuid\n\t"
                : "=r" (time_h2), "=r" (time_l2)
                :
                : "rax", "rbx", "rcx", "rdx");

            start = (int64_t)(((uint64_t)time_h1 << 32) | time_l1);
            stop  = (int64_t)(((uint64_t)time_h2 << 32) | time_l2);
            lat[i/STRIDE] = stop - start;
        }
    }

    for (int i = 0; i < (int)(SIZE/STRIDE); ++i)
        printf("%3d %3"PRId64"\n",i,lat[i]);

    /* The casts only strip the volatile qualifier for free(). */
    free((void *)array_a);
    free((void *)array_b);
    free((void *)array_c);
    free((void *)array_d);
    free((void *)array_e);
    free((void *)array_f);
    free((void *)array_g);
    free((void *)array_h);
    free((void *)array_i);

    return 0;
}
