#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>
#include <time.h>
#include <stdlib.h>
#include <strings.h>

/* Thread layout: NUM_CORES cores, each running NUM_THREADS_PER_CORE threads. */
#define NUM_CORES 60
#define NUM_THREADS_PER_CORE 4
/* Total number of worker threads created by main(). */
#define NUM_THREADS (NUM_CORES * NUM_THREADS_PER_CORE)
 
/* Barriers shared by all worker threads. barrier[0] is waited on right
** before the timed section starts and barrier[1] right after it ends. */
#define NUM_BARRIERS 2
pthread_barrier_t barrier[NUM_BARRIERS];

/*
 * Pin the calling thread to a single logical CPU.
 *
 * cpu_no: index of the logical CPU the thread should run on.
 *
 * A failure to set the affinity is reported on stderr but is not fatal:
 * the thread keeps whatever affinity it had before the call.
 */
void set_cpu(int cpu_no)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);

	/* A pid of 0 addresses the calling thread, so no gettid syscall is needed. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
}

/*
 * Elapsed time between two timestamps, in nanoseconds.
 *
 * start: timestamp taken first.
 * end:   timestamp taken last.
 *
 * Returns (end - start) collapsed into a single nanosecond count.
 */
long time_diff(struct timespec start, struct timespec end)
{
  struct timespec delta;

  delta.tv_sec = end.tv_sec - start.tv_sec;
  delta.tv_nsec = end.tv_nsec - start.tv_nsec;

  /* Borrow one second when the nanosecond difference went negative. */
  if (delta.tv_nsec < 0) {
    delta.tv_sec -= 1;
    delta.tv_nsec += 1000000000;
  }

  return delta.tv_sec * 1000000000 + delta.tv_nsec;
}

/* Argument handed to each worker thread through pthread_create(). */
struct thread_arg_t {
	int tid;	/* zero-based thread index assigned in main() */
};

/* Sizing rationale (original author's note): the 2nd-level TLB covers
** 128MB of data, and the working set is kept below that so accesses
** always hit in the 2nd-level TLB. Total memory on KNC is limited (4GB,
** of which usually about 2.2GB is free), so the array size per core is
** limited to 4*6=24MB. That stays below 128MB, so the 2nd-level TLB
** always hits, and the total is 24*60=1440MB, i.e. about 1.4GB, which
** does not exceed the free memory.
**
** NOTE(review): with the macro values in this file and a 4-byte unsigned
** int, ARRAY_SIZE is 3145728 elements = 12MB per thread, i.e. 48MB per
** core and 2880MB in total. That does not match the 24MB / 1440MB figures
** above -- confirm which size is intended. */
/* Number of unsigned int elements in each thread's slice of A. */
#define ARRAY_SIZE (12*1024*1024/NUM_THREADS_PER_CORE)
/* Shared buffer of ARRAY_SIZE*NUM_THREADS elements, allocated in main();
** thread tid works on the slice starting at A[tid*ARRAY_SIZE]. */
volatile unsigned int *A;

//per-thread timestamps taken with clock_gettime() right before and right
//after the timed loop (the original note names the system-wide timer "micetc")
struct timespec time_begin[NUM_THREADS], time_end[NUM_THREADS];
//per-thread elapsed time in nanoseconds, as returned by time_diff()
long long elapsed_time[NUM_THREADS];

/* Elements advanced per evict/prefetch step. With a 4-byte unsigned int
** this is 64 bytes -- presumably one cache line on KNC, confirm. */
#define BW_LINE_ELEMS 16
/* Elements covered by one iteration of the timed loop: 128 steps of
** BW_LINE_ELEMS (two 4KB pages with a 4-byte unsigned int). */
#define BW_CHUNK_ELEMS 2048
/* Hint values handed to the intrinsics.
** NOTE(review): presumably 1 and 2 select L1 and L2 for _mm_clevict and 6
** is an exclusive L2 prefetch hint on KNC -- confirm against the
** compiler's intrinsics header. */
#define BW_EVICT_HINT_A 1
#define BW_EVICT_HINT_B 2
#define BW_PREFETCH_HINT 6

/* Prefetch step number `line` (0-based) counted from `base`. */
#define BW_PREFETCH(base, line) \
	_mm_prefetch((void *)((base) + BW_LINE_ELEMS * (line)), BW_PREFETCH_HINT)

/* Eight consecutive prefetch steps starting at step `first`. This is a
** macro rather than a loop so that the timed loop stays fully unrolled,
** exactly like the hand-written sequence it replaces. */
#define BW_PREFETCH_8(base, first) do { \
	BW_PREFETCH(base, (first) + 0); \
	BW_PREFETCH(base, (first) + 1); \
	BW_PREFETCH(base, (first) + 2); \
	BW_PREFETCH(base, (first) + 3); \
	BW_PREFETCH(base, (first) + 4); \
	BW_PREFETCH(base, (first) + 5); \
	BW_PREFETCH(base, (first) + 6); \
	BW_PREFETCH(base, (first) + 7); \
} while (0)

/*
 * Per-thread bandwidth test that issues software prefetches only (no
 * explicit loads or stores are performed on the slice).
 *
 * tid: zero-based thread index. It selects the slice starting at
 *      A[tid*ARRAY_SIZE] and the slots of time_begin[], time_end[] and
 *      elapsed_time[] that this call overwrites.
 *
 * Steps:
 *   1. Evict every line of the slice so the timed loop does not start
 *      with the data already cached.
 *   2. Wait on barrier[0] so all threads enter the timed section together.
 *   3. Issue one prefetch per BW_LINE_ELEMS elements over the whole slice,
 *      bracketed by two clock_gettime() calls.
 *   4. Wait on barrier[1], then store the elapsed nanoseconds.
 *
 * Every thread counted by the barriers must call this function the same
 * number of times, otherwise the barrier waits never complete.
 */
void test_bandwidth_p(int tid)
{
    int i;
    //this thread's slice starts at A[tid*ARRAY_SIZE]
    volatile unsigned int *slice = &A[tid*ARRAY_SIZE];
    volatile unsigned int *chunk;

    for (i = 0; i < ARRAY_SIZE; i += BW_LINE_ELEMS) {
      _mm_clevict((void *)&slice[i], BW_EVICT_HINT_A);
      _mm_clevict((void *)&slice[i], BW_EVICT_HINT_B);
    }
    //cpuid is a serialising instruction; presumably used here so the
    //evictions above are complete before the threads synchronise
    __asm cpuid

    pthread_barrier_wait(&barrier[0]);

    //begin timestamp; CLOCK_MONOTONIC is system-wide like CLOCK_REALTIME
    //but cannot be stepped by wall-clock adjustments during the measurement
    clock_gettime(CLOCK_MONOTONIC, &time_begin[tid]);

    for (i = 0; i < ARRAY_SIZE/BW_CHUNK_ELEMS; i++) {
        chunk = &slice[i*BW_CHUNK_ELEMS];

        //16 groups of 8 prefetches = 128 steps = BW_CHUNK_ELEMS elements,
        //unrolled to minimize loop-control overhead
        BW_PREFETCH_8(chunk, 0);
        BW_PREFETCH_8(chunk, 8);
        BW_PREFETCH_8(chunk, 16);
        BW_PREFETCH_8(chunk, 24);
        BW_PREFETCH_8(chunk, 32);
        BW_PREFETCH_8(chunk, 40);
        BW_PREFETCH_8(chunk, 48);
        BW_PREFETCH_8(chunk, 56);
        BW_PREFETCH_8(chunk, 64);
        BW_PREFETCH_8(chunk, 72);
        BW_PREFETCH_8(chunk, 80);
        BW_PREFETCH_8(chunk, 88);
        BW_PREFETCH_8(chunk, 96);
        BW_PREFETCH_8(chunk, 104);
        BW_PREFETCH_8(chunk, 112);
        BW_PREFETCH_8(chunk, 120);
    }

    //end timestamp, taken before the barrier so waiting time is excluded
    clock_gettime(CLOCK_MONOTONIC, &time_end[tid]);

    pthread_barrier_wait(&barrier[1]);

    elapsed_time[tid] = time_diff(time_begin[tid], time_end[tid]);
}

/*
 * Thread entry point: pin the thread, touch its slice of A, then run the
 * prefetch bandwidth test several times.
 *
 * thread_arg: pointer to this thread's struct thread_arg_t.
 *
 * Returns NULL. Results are left in the global timing arrays by
 * test_bandwidth_p(); each run overwrites the previous one, so only the
 * last run's timestamps remain when the thread finishes.
 */
void* bandwidth_thread(void* thread_arg)
{
	enum { NUM_RUNS = 5 };
	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;
	int i;
	int run;

	//pin the thread to its logical CPU; numbering is offset by one
	//(original note: "KNC starts from core 1")
	set_cpu(tid*4/NUM_THREADS_PER_CORE+1);

	//read one element every 1024 so that every part of the slice has been
	//touched by this thread before timing (original note: avoids 2nd-level
	//TLB misses and makes sure all physical pages are allocated)
	volatile unsigned int *A_ti = &A[tid*ARRAY_SIZE];
	unsigned int temp = 0;
	for (i = 0; i < ARRAY_SIZE; i += 1024)
		temp += A_ti[i];
	//the sum itself is not needed; only the volatile reads matter
	(void)temp;

	//repeat the test to get stable results
	for (run = 0; run < NUM_RUNS; ++run)
		test_bandwidth_p(tid);

	return NULL;
}

/* Collapse a timespec into a single nanosecond count (computed in long long). */
static long long timespec_to_ns(const struct timespec *ts)
{
	return (long long)ts->tv_sec * 1000000000LL + ts->tv_nsec;
}

/*
 * Benchmark driver.
 *
 * Allocates and touches the shared array A, starts NUM_THREADS worker
 * threads running bandwidth_thread(), waits for them, and prints the
 * aggregate bandwidth followed by a tab on stdout. The bandwidth is the
 * total array size in bytes divided by the span, in nanoseconds, between
 * the earliest begin timestamp and the latest end timestamp of any thread.
 *
 * Returns 0 on success and EXIT_FAILURE if the array cannot be allocated,
 * a barrier or thread cannot be created, or the measured span is not
 * positive.
 */
int main()
{
	const size_t total_elems = (size_t)ARRAY_SIZE * NUM_THREADS;
	const size_t total_bytes = total_elems * sizeof(*A);
	size_t i;
	int ti;
	int rc;

	//allocate the array space for all threads
	A = malloc(total_bytes);
	if (A == NULL) {
		perror("malloc");
		return EXIT_FAILURE;
	}
	bzero((void *)A, total_bytes);

	//write one element every 1024 so that all physical pages are allocated
	for (i = 0; i < total_elems; i += 1024)
		A[i] = (unsigned int)i;

	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];

	for (ti = 0; ti < NUM_BARRIERS; ++ti) {
		rc = pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
		if (rc != 0) {
			fprintf(stderr, "pthread_barrier_init failed (error %d)\n", rc);
			free((void *)A);
			return EXIT_FAILURE;
		}
	}

	for (ti = 0; ti < NUM_THREADS; ++ti) {
		thread_arg[ti].tid = ti;
		rc = pthread_create(&thread_id[ti], NULL, bandwidth_thread, (void *)&thread_arg[ti]);
		if (rc != 0) {
			//threads that already started would wait on the barrier
			//forever, so terminate the whole process instead of joining
			fprintf(stderr, "pthread_create failed for thread %d (error %d)\n", ti, rc);
			exit(EXIT_FAILURE);
		}
	}

	for (ti = 0; ti < NUM_THREADS; ++ti)
		pthread_join(thread_id[ti], NULL);

	for (ti = 0; ti < NUM_BARRIERS; ++ti)
		pthread_barrier_destroy(&barrier[ti]);

	//free the memory
	free((void *)A);

	//get earliest begin time and latest end time over all threads
	long long min_time_begin = timespec_to_ns(&time_begin[0]);
	long long max_time_end = timespec_to_ns(&time_end[0]);

	for (ti = 1; ti < NUM_THREADS; ++ti) {
		long long begin_ns = timespec_to_ns(&time_begin[ti]);
		long long end_ns = timespec_to_ns(&time_end[ti]);

		if (begin_ns < min_time_begin)
			min_time_begin = begin_ns;
		if (end_ns > max_time_end)
			max_time_end = end_ns;
	}

	//calculate total elapsed time and bandwidth (bytes per nanosecond)
	long long total_elapsed_time = max_time_end - min_time_begin;
	if (total_elapsed_time <= 0) {
		fprintf(stderr, "invalid elapsed time: %lld ns\n", total_elapsed_time);
		return EXIT_FAILURE;
	}

	double bandwidth = (double)total_bytes / total_elapsed_time;
	printf("%.3lf\t", bandwidth);

	return 0;
}
