#define _GNU_SOURCE
#include <stdint.h>
#include <inttypes.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <stdlib.h>
#include <strings.h>

/* Number of measurement passes whose results are averaged into each latency
 * slot. The trailing comment preserves a previously used value. */
const uint32_t ITER = 1; //0x200;

/* Number of int32_t elements in the buffer that main() probes. */
const uint32_t SIZE = 0x400;

/* Scratch storage for the two time-stamp samples taken around each timed
 * load in main(): *_l1/*_h1 receive the low/high 32 bits of the first sample,
 * *_l2/*_h2 the low/high 32 bits of the second. They are file-scope so the
 * inline assembly can address them by name. */
unsigned int time_l1;
unsigned int time_h1;
unsigned int time_l2;
unsigned int time_h2;

/* Difference between the second and first time-stamp sample of the most
 * recent timed load. */
long long elapsed;

/*
 * Pin the calling thread to a single CPU.
 *
 * cpu_no: index of the CPU the thread should run on.
 *
 * Builds an affinity mask that contains only cpu_no and applies it with
 * sched_setaffinity(). Pinning is best-effort: if the call fails, a
 * diagnostic is written to stderr and the function returns normally, leaving
 * the thread's affinity as it was.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 selects the calling thread, so no explicit gettid is needed. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

int main() {
    set_cpu(1);
    int i,j,k;
    int cpuinfo[4];
    volatile int64_t lat[SIZE/0x10];
    volatile int64_t start,stop,b;

    //__declspec(aligned(0x1000)) 
    //volatile int32_t a[SIZE];
    volatile int32_t* a;
    a = (volatile int32_t *)valloc(SIZE*sizeof(volatile int32_t));
    bzero((void *)a, SIZE*sizeof(volatile int32_t));

    for (i=0;i<SIZE;++i)
        a[i]=(int32_t)rand();

    for (i=0;i<SIZE/0x10;++i)
        lat[i]=0;
    b=start=stop=_rdtsc();

    for (k=0;k<ITER;++k) {
        //__asm cpuid

        for (i=0;i<SIZE;i+=0x10) {
            _mm_clevict(&a[i],_MM_HINT_T0);
            _mm_clevict(&a[i],_MM_HINT_T1);
        }

	__asm cpuid

	//start = _rdtsc();
	for (i=0x0;i<SIZE;i+=0x10) {
	    __asm rdtsc
	    __asm mov esi,eax
	    __asm mov edi,edx

	    b += a[i];

	    __asm rdtsc
	    __asm mov time_l1,esi
	    __asm mov time_h1,edi
	    __asm mov time_l2,eax
	    __asm mov time_h2,edx

	    elapsed = ((unsigned long long)time_h2<< 32 | time_l2) - ((unsigned long long)time_h1<<32 | time_l1);
            lat[i/0x10]=((lat[i/0x10]*k)+elapsed)/(k+1);

	}//*/
	//stop = _rdtsc();


	//printf("%llu\n",elapsed);
        //__cpuid(cpuinfo,j);
        //__cpuid(cpuinfo,j);
        //__cpuid(cpuinfo,j);
        //for (b=_rdtsc();b-stop<0x400;b=_rdtsc());

        /*start=_rdtsc();
        b=a[0];
        stop=_rdtsc();
        lat[0]=((lat[0]*k)+(stop-start))/(k+1);
        for (b=_rdtsc();b-stop<0x200;b=_rdtsc());//*/

        /*for (i=0x0;i<SIZE;i+=0x80) {
            start=_rdtsc();
            b+=a[i];
            stop=_rdtsc();
            lat[i/0x10]=((lat[i/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x10)];
            stop=_rdtsc();
            lat[(i+0x10)/0x10]=((lat[(i+0x10)/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x20)];
            stop=_rdtsc();
            lat[(i+0x20)/0x10]=((lat[(i+0x20)/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x30)];
            stop=_rdtsc();
            lat[(i+0x30)/0x10]=((lat[(i+0x30)/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x40)];
            stop=_rdtsc();
            lat[(i+0x40)/0x10]=((lat[(i+0x40)/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x50)];
            stop=_rdtsc();
            lat[(i+0x50)/0x10]=((lat[(i+0x50)/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x60)];
            stop=_rdtsc();
            lat[(i+0x60)/0x10]=((lat[(i+0x60)/0x10]*k)+(stop-start))/(k+1);

            start=_rdtsc();
            b+=a[(i+0x70)];
            stop=_rdtsc();
	    //lat[(i+0x70)/0x10] = stop - start;
            lat[(i+0x70)/0x10]=((lat[(i+0x70)/0x10]*k)+(stop-start))/(k+1);
            //for (b=_rdtsc();b-stop<0x200;b=_rdtsc());
	}//*/
    }

    printf("b = %lld\n", b);

    for (i=0;i<SIZE/0x10;++i)
        printf("%3d %3"PRId64"\n",i,lat[i]);

    return 0;
}
