#define _GNU_SOURCE
#include <stdint.h>
#include <inttypes.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/* Number of measurement passes made over the whole buffer (outer loop bound in main). */
const uint32_t ITER = 0x200;
/* Number of int32_t elements in the buffer that main allocates and probes. */
const uint32_t SIZE = 0x400;

/*
 * Pin the calling thread to a single logical CPU.
 *
 * cpu_no: index of the CPU the thread should be restricted to.
 *
 * Pinning is best-effort: if the kernel rejects the request (for example
 * because cpu_no does not exist or is not permitted for this process), a
 * diagnostic is written to stderr via perror() and the function returns
 * normally, leaving the thread's affinity unchanged.
 */
void set_cpu(int cpu_no)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu_no, &mask);

    /* A pid of 0 tells sched_setaffinity() to act on the calling thread. */
    if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
        perror("sched_setaffinity");
}

/*
 * Load-latency microbenchmark.
 *
 * Allocates a page-aligned buffer of SIZE int32_t values and, ITER times:
 *   1. issues _mm_clevict() with the T0 and T1 hints on every probed address,
 *   2. busy-waits on the timestamp counter,
 *   3. times a single load from each probed address with _rdtsc().
 * Probed addresses are 0x10 elements (64 bytes) apart -- presumably one
 * cache line each; confirm against the target CPU.
 *
 * Output: the running sum of loaded values (printed so the loads have a
 * visible use), then one line per probed address with its index and the
 * mean measured tick count over all ITER passes.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void) {
    /* Element stride between probed addresses: 0x10 * sizeof(int32_t) = 64 bytes. */
    enum { LINE_STRIDE = 0x10 };

    set_cpu(1);

    int i, k;
    int cpuinfo[4];
    /*
     * Per-address total of measured ticks. Totals are accumulated and divided
     * once at the end: an integer running mean truncated on every pass drops
     * small positive deviations and drifts toward the minimum sample.
     */
    int64_t lat_total[SIZE / LINE_STRIDE];
    /* volatile so the timestamp reads and the timed load are not optimized away */
    volatile int64_t start, stop, b;
    volatile int64_t sum = 0;

    volatile int32_t *a = (volatile int32_t *)valloc(SIZE * sizeof *a);
    if (a == NULL) {
        perror("valloc");
        return EXIT_FAILURE;
    }

    /* Every element is written here, so no separate zeroing pass is needed. */
    for (i = 0; i < SIZE; ++i)
        a[i] = (int32_t)rand();

    for (i = 0; i < SIZE / LINE_STRIDE; ++i)
        lat_total[i] = 0;

    b = start = stop = _rdtsc();

    for (k = 0; k < ITER; ++k) {

        /*
         * CPUID leaf 0; the result is ignored. Presumably used only as a
         * serializing instruction around the eviction phase.
         */
        __cpuid(cpuinfo, 0);

        /* Request eviction of each probed line with both the T0 and T1 hints. */
        for (i = 0; i < SIZE; i += LINE_STRIDE) {
            _mm_clevict(&a[i], _MM_HINT_T0);
            _mm_clevict(&a[i], _MM_HINT_T1);
        }

        __cpuid(cpuinfo, 0);

        /*
         * Spin until at least 0x400 ticks have passed since `stop` (the end of
         * the previous timed load, or the initial timestamp on the first pass).
         */
        for (b = _rdtsc(); b - stop < 0x400; b = _rdtsc()) {
            /* busy-wait */
        }

        for (i = 0; i < SIZE; i += LINE_STRIDE) {
            start = _rdtsc();
            sum += a[i];
            stop = _rdtsc();
            lat_total[i / LINE_STRIDE] += stop - start;

            /* Space consecutive timed loads at least 0x200 ticks apart. */
            for (b = _rdtsc(); b - stop < 0x200; b = _rdtsc()) {
                /* busy-wait */
            }
        }
    }

    printf("sum = %" PRId64 "\n", (int64_t)sum);

    for (i = 0; i < SIZE / LINE_STRIDE; ++i)
        printf("%3d %3"PRId64"\n", i, lat_total[i] / (int64_t)ITER);

    free((void *)a);
    return 0;
}
