#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>
#include <time.h>
#include <stdlib.h>
#include <strings.h>

/* Thread layout: NUM_THREADS worker threads are created in total,
** NUM_THREADS_PER_CORE for each of NUM_CORES cores. */
#define NUM_CORES 48
#define NUM_THREADS_PER_CORE 3
#define NUM_THREADS (NUM_CORES * NUM_THREADS_PER_CORE)
 
/* Two barriers shared by all worker threads: barrier[0] is waited on right
** before the timed region of a measurement pass and barrier[1] right after
** it. Both are initialised for NUM_THREADS participants in main(). */
#define NUM_BARRIERS 2
pthread_barrier_t barrier[NUM_BARRIERS];

/*
 * Pin the calling thread to a single logical CPU.
 *
 * cpu_no: index of the logical CPU to place in the affinity mask.
 *
 * The kernel thread id (gettid) is passed so that only the calling thread's
 * affinity is changed. A failure is reported on stderr through perror() and
 * is otherwise non-fatal: the thread keeps running with its previous
 * affinity, so the caller's control flow is the same as before.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* An unpinned thread silently skews the measurement, so say so. */
	if (sched_setaffinity(syscall(SYS_gettid), sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

/*
 * Elapsed time between two timestamps, in nanoseconds.
 *
 * start: earlier timestamp
 * end:   later timestamp
 *
 * Returns (end - start) expressed as a single nanosecond count.
 */
long time_diff(struct timespec start, struct timespec end)
{
  time_t whole_sec = end.tv_sec - start.tv_sec;
  long frac_nsec = end.tv_nsec - start.tv_nsec;

  /* borrow one second when the nanosecond part went negative */
  if (frac_nsec < 0) {
    whole_sec -= 1;
    frac_nsec += 1000000000;
  }

  return whole_sec * 1000000000 + frac_nsec;
}

/* Argument handed to each worker thread through pthread_create(). */
struct thread_arg_t {
	int tid;	/* worker index; main() assigns 0 .. NUM_THREADS-1 */
};

/* ARRAY_SIZE is the per-thread slice size in unsigned int elements:
** 12*1024*1024 elements are divided among the NUM_THREADS_PER_CORE threads
** of a core. A points to one shared buffer that holds all slices back to
** back (thread tid uses A[tid*ARRAY_SIZE] onwards); main() allocates it.
**
** Original sizing rationale: the 2nd-level TLB can cover 128MB of data, so
** the per-core working set is kept below that to guarantee 2nd-level TLB
** hits; and since KNC has 4GB of memory in total, with usually about 2.2GB
** free, the total buffer has to stay below the free memory size.
**
** NOTE(review): the figures in the original comment ("4*6=24MB per core",
** "24*60=1440MB in total") do not match the current macros. Assuming a
** 4-byte unsigned int, 12*1024*1024 elements is 48MB per core, and with
** NUM_CORES = 48 the total is 2304MB -- please confirm this still fits in
** free memory on the target. */
#define ARRAY_SIZE (12*1024*1024/NUM_THREADS_PER_CORE)
volatile unsigned int *A;

//Per-thread timestamps taken with clock_gettime() right before and right
//after the timed loop, plus the per-thread elapsed time in nanoseconds.
//Each thread writes only its own slot, indexed by tid.
//("micetc" in the original comment presumably names the system-wide KNC
//clock source behind this timer -- confirm.)
struct timespec time_begin[NUM_THREADS], time_end[NUM_THREADS];
long long elapsed_time[NUM_THREADS];

//Per-thread measurement routine: one timed pass of scattered, cache-line
//granular reads over this thread's slice of the global array A.
//
//tid: index of the calling thread; selects the slice that starts at
//     A[tid*ARRAY_SIZE] and the slots used in time_begin, time_end and
//     elapsed_time.
//
//Sequence: evict the slice from the caches, wait on barrier[0], take the
//begin timestamp, run the read loop, take the end timestamp, wait on
//barrier[1], then store the elapsed nanoseconds in elapsed_time[tid].
//Because of the two barriers, every one of the NUM_THREADS workers must
//call this routine the same number of times.
//
//NOTE(review): the original header also mentioned L2 prefetch, streaming
//read, random write and streaming write; only the scattered-read loop is
//present in this function.
void test_bandwidth_p(int tid)
{
    int i;  
    //each thread starts from A[tid*ARRAY_SIZE]
    volatile unsigned int *A_ti = &A[tid*ARRAY_SIZE];
    //step through the slice 16 elements at a time (64 bytes, assuming a
    //4-byte unsigned int) and evict each line; the hints 1 and 2 presumably
    //select the L1 and L2 cache respectively -- confirm against the Intel
    //MIC intrinsics reference for _mm_clevict
    for(i = 0; i < ARRAY_SIZE; i=i+16){
      _mm_clevict((void *)&A_ti[i], 1);
      _mm_clevict((void *)&A_ti[i], 2);
    }
    //presumably used as a serializing instruction so the evictions have
    //completed before the thread reaches the barrier -- TODO confirm
    __asm cpuid

    //line up all threads so the timed regions start together
    pthread_barrier_wait(&barrier[0]);

    //get system-wide begin time
    clock_gettime(CLOCK_REALTIME, &time_begin[tid]);

    //each iteration handles one chunk of 2048 elements
    for(i = 0; i < ARRAY_SIZE/2048; i++){
        //each thread starts from A+tid*ARRAY_SIZE
        A_ti = &(A[tid*ARRAY_SIZE+i*2048]);

	//unroll two pages to minimize control overhead
	//The 128 loads below read A_ti[16*k] for every k in 0..127 exactly
	//once, in a fixed scrambled order. Each loaded value is moved into
	//r8d and discarded; only the memory access itself matters.
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*91]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*111]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*59]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*120]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*104]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*16]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*34]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*100]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*65]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*93]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*71]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*32]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*74]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*30]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*24]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*96]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*38]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*41]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*22]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*55]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*39]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*5]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*114]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*82]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*43]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*127]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*92]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*86]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*7]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*107]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*62]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*77]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*27]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*10]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*72]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*11]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*109]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*4]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*85]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*70]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*73]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*48]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*106]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*40]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*69]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*23]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*95]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*13]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*52]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*42]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*79]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*1]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*44]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*118]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*125]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*0]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*61]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*50]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*60]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*64]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*99]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*33]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*124]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*25]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*126]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*31]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*116]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*51]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*117]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*81]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*36]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*21]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*76]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*63]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*46]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*45]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*88]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*9]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*108]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*105]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*121]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*87]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*29]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*80]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*112]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*19]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*75]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*56]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*6]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*98]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*17]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*83]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*123]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*54]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*102]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*8]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*84]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*89]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*3]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*47]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*18]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*78]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*122]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*37]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*94]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*101]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*66]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*53]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*110]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*35]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*115]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*28]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*90]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*58]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*57]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*12]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*49]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*113]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*15]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*2]) : "r8d");

	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*97]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*68]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*103]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*26]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*14]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*119]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*20]) : "r8d");
	asm volatile("movl %0, %%r8d" : : "m"(A_ti[16*67]) : "r8d");
    }

    //get system-wide end time
    clock_gettime(CLOCK_REALTIME, &time_end[tid]);

    //wait until every thread has finished its timed region
    pthread_barrier_wait(&barrier[1]);

    //per-thread duration in nanoseconds
    elapsed_time[tid] = time_diff(time_begin[tid], time_end[tid]);

    //printf("thread %d time_begin: %llu\t time_end: %llu\t elapsed_time: %llu\n", tid, time_begin[tid].tv_sec * 1000000000 + time_begin[tid].tv_nsec, time_end[tid].tv_sec * 1000000000 + time_end[tid].tv_nsec, elapsed_time[tid]);

}

void* bandwidth_thread(void* thread_arg)
{
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;

	//pin the thread to the core, KNC starts from core 1
	set_cpu(tid*4/NUM_THREADS_PER_CORE+1);

	//initialize the page first, so that no 2nd-level TLB miss and all physical pages allocated
	volatile unsigned int *A_ti = &A[tid*ARRAY_SIZE];
	register unsigned int temp = 0;
	int i = 0;
	for(i = 0; i < ARRAY_SIZE; i=i+1024)
	  //A_ti[i]=i;
	  temp += A_ti[i];

	//test it 5 times to get stable results
	test_bandwidth_p(tid);
	test_bandwidth_p(tid);
	test_bandwidth_p(tid);
	test_bandwidth_p(tid);
	test_bandwidth_p(tid);
}

/* Convert a timespec to a single nanosecond count. */
static long long timespec_to_ns(const struct timespec *ts)
{
	return (long long)ts->tv_sec * 1000000000LL + ts->tv_nsec;
}

/*
 * Benchmark driver.
 *
 * Allocates and touches the shared buffer A, starts NUM_THREADS workers
 * running bandwidth_thread(), waits for them, and prints the aggregate
 * bandwidth in bytes per nanosecond (buffer size divided by the span from
 * the earliest begin timestamp to the latest end timestamp) followed by a
 * tab. The timestamps are the ones left in time_begin/time_end after the
 * workers have finished.
 *
 * Returns 0 on success. On allocation, barrier or thread setup failure an
 * error is written to stderr and the process exits with EXIT_FAILURE.
 */
int main(void)
{
	const size_t total_elems = (size_t)ARRAY_SIZE * NUM_THREADS;
	const size_t total_bytes = total_elems * sizeof(*A);
	size_t i;
	int ti;
	int rc;

	//allocate the array space for all threads
	A = (volatile unsigned int *)malloc(total_bytes);
	if (A == NULL) {
		perror("malloc");
		return EXIT_FAILURE;
	}
	bzero((void *)A, total_bytes);

	//initialize the page first, so that all physical pages allocated
	for (i = 0; i < total_elems; i += 1024) {
		A[i] = (unsigned int)i;
	}

	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];

	for (ti = 0; ti < NUM_BARRIERS; ++ti) {
		rc = pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
		if (rc != 0) {
			fprintf(stderr, "pthread_barrier_init failed for barrier %d (error %d)\n", ti, rc);
			exit(EXIT_FAILURE);
		}
	}

	for (ti = 0; ti < NUM_THREADS; ++ti) {
		thread_arg[ti].tid = ti;
		rc = pthread_create(&thread_id[ti], NULL, bandwidth_thread, (void *)&thread_arg[ti]);
		if (rc != 0) {
			//the barriers expect exactly NUM_THREADS participants, so the
			//already running workers could never get past them: give up
			fprintf(stderr, "pthread_create failed for thread %d (error %d)\n", ti, rc);
			exit(EXIT_FAILURE);
		}
	}

	for (ti = 0; ti < NUM_THREADS; ++ti) {
		rc = pthread_join(thread_id[ti], NULL);
		if (rc != 0) {
			fprintf(stderr, "pthread_join failed for thread %d (error %d)\n", ti, rc);
		}
	}

	for (ti = 0; ti < NUM_BARRIERS; ++ti) {
		pthread_barrier_destroy(&barrier[ti]);
	}

	//free the memory
	free((void *)A);
	A = NULL;

	//get earliest begin time and latest end time of whole system
	long long min_time_begin = timespec_to_ns(&time_begin[0]);
	long long max_time_end = timespec_to_ns(&time_end[0]);

	for (ti = 1; ti < NUM_THREADS; ++ti) {
		long long begin_ns = timespec_to_ns(&time_begin[ti]);
		long long end_ns = timespec_to_ns(&time_end[ti]);

		if (begin_ns < min_time_begin) {
			min_time_begin = begin_ns;
		}
		if (end_ns > max_time_end) {
			max_time_end = end_ns;
		}
	}

	//calculate total elapsed time and bandwidth (bytes per nanosecond)
	long long total_elapsed_time = max_time_end - min_time_begin;
	double bandwidth = (double)sizeof(unsigned int)*ARRAY_SIZE*NUM_THREADS/total_elapsed_time;
	printf("%.3lf\t", bandwidth);

	return 0;
}
