#define _GNU_SOURCE
#include <stdint.h>
#include <inttypes.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/* Number of times main() repeats the whole flush-then-measure pass. */
const uint32_t ITER = 5;

/* Elements (int32_t) filled and flushed in each probed sub-array (0x400). */
const uint32_t SIZE = 1024;

/* Exclusive upper bound on the element index visited by the timed loop (0xd0). */
const uint32_t ACCESS_SIZE = 208;

/*
 * Step, in int32_t elements, between consecutive timed accesses (0x10).
 * 16 * 4 bytes = 64 bytes -- presumably one cache line on the target CPU.
 */
const uint32_t STRIDE = 16;

/*
 * Pin the calling thread to one logical CPU.
 *
 * cpu_no: index of the CPU to run on.
 *
 * The affinity mask is built with only cpu_no set and applied to the thread
 * id obtained from SYS_gettid. If the kernel rejects the request, a
 * diagnostic is written to stderr and the thread keeps its previous
 * affinity; the function still returns normally so callers are unaffected.
 */
void set_cpu(int cpu_no)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu_no, &mask);

    /* Previously the result was dropped, hiding an unpinned run. */
    if (sched_setaffinity((pid_t)syscall(SYS_gettid), sizeof(mask), &mask) != 0)
        perror("sched_setaffinity");
}

int main() {
    set_cpu(2);
    int i,j,k;
    int cpuinfo[4];
    int64_t lat[SIZE/STRIDE];
    unsigned int time_l1,time_h1,time_l2,time_h2;
    volatile int64_t start,stop,b;
    volatile int64_t sum=0;

    volatile int32_t* array_a;
    array_a = (volatile int32_t *)valloc(33*SIZE*sizeof(volatile int32_t));
    bzero((void *)array_a, SIZE*sizeof(volatile int32_t));

    volatile int32_t* array_a1 = array_a + 1024*1;
    volatile int32_t* array_a2 = array_a + 1024*2;
    volatile int32_t* array_a3 = array_a + 1024*3;
    volatile int32_t* array_a4 = array_a + 1024*4;
    volatile int32_t* array_a5 = array_a + 1024*5;
    volatile int32_t* array_a6 = array_a + 1024*6;
    volatile int32_t* array_a7 = array_a + 1024*7;
    volatile int32_t* array_a8 = array_a + 1024*8;
    volatile int32_t* array_a9 = array_a + 1024*9;
    volatile int32_t* array_a10 = array_a + 1024*10;
    volatile int32_t* array_a11 = array_a + 1024*11;
    volatile int32_t* array_a12 = array_a + 1024*12;
    volatile int32_t* array_a13 = array_a + 1024*13;
    volatile int32_t* array_a14 = array_a + 1024*14;
    volatile int32_t* array_a15 = array_a + 1024*15;
    volatile int32_t* array_a16 = array_a + 1024*16;
    volatile int32_t* array_a17 = array_a + 1024*17;
    volatile int32_t* array_a18 = array_a + 1024*18;
    volatile int32_t* array_a19 = array_a + 1024*19;
    volatile int32_t* array_a20 = array_a + 1024*20;
    volatile int32_t* array_a21 = array_a + 1024*21;
    volatile int32_t* array_a22 = array_a + 1024*22;
    volatile int32_t* array_a23 = array_a + 1024*23;
    volatile int32_t* array_a24 = array_a + 1024*24;
    volatile int32_t* array_a25 = array_a + 1024*25;
    volatile int32_t* array_a26 = array_a + 1024*26;
    volatile int32_t* array_a27 = array_a + 1024*27;
    volatile int32_t* array_a28 = array_a + 1024*28;
    volatile int32_t* array_a29 = array_a + 1024*29;
    volatile int32_t* array_a30 = array_a + 1024*30;
    volatile int32_t* array_a31 = array_a + 1024*31;
    volatile int32_t* array_a32 = array_a + 1024*32;
    volatile int32_t* array_a33 = array_a + 1024*33;

    for (i=0;i<SIZE;++i) {
        array_a1[i]=(int32_t)rand();
        array_a2[i]=(int32_t)rand();
        array_a3[i]=(int32_t)rand();
        array_a4[i]=(int32_t)rand();
        array_a5[i]=(int32_t)rand();
        array_a6[i]=(int32_t)rand();
        array_a7[i]=(int32_t)rand();
        array_a8[i]=(int32_t)rand();
        array_a9[i]=(int32_t)rand();
        array_a10[i]=(int32_t)rand();
        array_a11[i]=(int32_t)rand();
        array_a12[i]=(int32_t)rand();
        array_a13[i]=(int32_t)rand();
        array_a14[i]=(int32_t)rand();
        array_a15[i]=(int32_t)rand();
        array_a16[i]=(int32_t)rand();
        array_a17[i]=(int32_t)rand();
        array_a18[i]=(int32_t)rand();
        array_a19[i]=(int32_t)rand();
        array_a20[i]=(int32_t)rand();
        array_a21[i]=(int32_t)rand();
        array_a22[i]=(int32_t)rand();
        array_a23[i]=(int32_t)rand();
        array_a24[i]=(int32_t)rand();
        array_a25[i]=(int32_t)rand();
        array_a26[i]=(int32_t)rand();
        array_a27[i]=(int32_t)rand();
        array_a28[i]=(int32_t)rand();
        array_a29[i]=(int32_t)rand();
        array_a30[i]=(int32_t)rand();
        array_a31[i]=(int32_t)rand();
        array_a32[i]=(int32_t)rand();
        array_a33[i]=(int32_t)rand();
    }

    for (i=0;i<SIZE/STRIDE;++i)
        lat[i]=0;
    b=start=stop=_rdtsc();

    for (k=0;k<ITER;++k) {

	_mm_mfence();
        __cpuid(cpuinfo,j);

        for (i=0;i<SIZE;i+=0x10) {
            _mm_clflush(&array_a1[i]);
            _mm_clflush(&array_a2[i]);
            _mm_clflush(&array_a3[i]);
            _mm_clflush(&array_a4[i]);
            _mm_clflush(&array_a5[i]);
            _mm_clflush(&array_a6[i]);
            _mm_clflush(&array_a7[i]);
            _mm_clflush(&array_a8[i]);
            _mm_clflush(&array_a9[i]);
            _mm_clflush(&array_a10[i]);
            _mm_clflush(&array_a11[i]);
            _mm_clflush(&array_a12[i]);
            _mm_clflush(&array_a13[i]);
            _mm_clflush(&array_a14[i]);
            _mm_clflush(&array_a15[i]);
            _mm_clflush(&array_a16[i]);
            _mm_clflush(&array_a17[i]);
            _mm_clflush(&array_a18[i]);
            _mm_clflush(&array_a19[i]);
            _mm_clflush(&array_a20[i]);
            _mm_clflush(&array_a21[i]);
            _mm_clflush(&array_a22[i]);
            _mm_clflush(&array_a23[i]);
            _mm_clflush(&array_a24[i]);
            _mm_clflush(&array_a25[i]);
            _mm_clflush(&array_a26[i]);
            _mm_clflush(&array_a27[i]);
            _mm_clflush(&array_a28[i]);
            _mm_clflush(&array_a29[i]);
            _mm_clflush(&array_a30[i]);
            _mm_clflush(&array_a31[i]);
            _mm_clflush(&array_a32[i]);
            _mm_clflush(&array_a33[i]);
        }
	
	_mm_mfence();
        __cpuid(cpuinfo,j);

        for (b=_rdtsc();b-stop<0x400;b=_rdtsc());

        for (i=0;i<ACCESS_SIZE;i+=STRIDE) {
	    //read begin timestamp
	    asm volatile ("cpuid\n\t"
		"rdtsc\n\t"
		"mov %%edx, %%edi\n\t"
		"mov %%eax, %%esi\n\t"
		:
		:
		: "rax", "rbx", "rcx", "rdx", "esi", "edi");

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a1[i])
		: "r8d" );

	    //read end timestamp	
	    asm volatile ("rdtscp\n\t"
		"mov %%edx, %0\n\t"
		"mov %%eax, %1\n\t"
		"cpuid\n\t"
		: "=r" (time_h2), "=r" (time_l2) 
		:
		: "rax", "rbx", "rcx", "rdx");

	    asm volatile ("mov %%edi, %0\n\t"
		"mov %%esi, %1\n\t"
		: "=m" (time_h1), "=m" (time_l1) 
		:
		: "edi", "esi");

	    start = ((unsigned long long)time_h1<<32 | time_l1);
	    stop = ((unsigned long long)time_h2<< 32 | time_l2);
            lat[i/STRIDE]=(stop-start);
            //lat[i/STRIDE]=((lat[i/STRIDE]*k)+(stop-start))/(k+1);
            //for (b=_rdtsc();b-stop<0x200;b=_rdtsc());

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a2[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a3[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a4[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a5[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a6[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a7[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a8[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a9[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a10[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a11[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a12[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a13[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a14[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a15[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a16[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a17[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a18[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a19[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a20[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a21[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a22[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a23[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a24[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a25[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a26[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a27[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a28[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a29[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a30[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a31[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a32[i])
		: "r8d" );

	    asm volatile ("movl %0, %%r8d\n\t"
		:
		: "m" (array_a33[i])
		: "r8d" );//*/
        }
    }

    //printf("sum = %lld\n", sum);

    for (i=0;i<ACCESS_SIZE/STRIDE;++i)
        printf("%3d %3"PRId64"\n",i,lat[i]);

    return 0;
}
