#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sched.h>
#include <pthread.h>
#include <time.h>
#include <stdlib.h>
#include <strings.h>

//number of cores used by the benchmark and worker threads started per core
#define NUM_CORES 40
#define NUM_THREADS_PER_CORE 1
//total number of worker threads created by main()
#define NUM_THREADS (NUM_CORES * NUM_THREADS_PER_CORE)
 
//barriers shared by all worker threads (each initialised with a count of
//NUM_THREADS in main()): barrier[0] is waited on right before the timed
//region starts, barrier[1] right after it ends
#define NUM_BARRIERS 2
pthread_barrier_t barrier[NUM_BARRIERS];

//pin the calling thread to a single CPU
//cpu_no: logical CPU number the thread should run on
//A failure (e.g. a CPU number that does not exist or is not permitted) is
//reported on stderr; the thread then keeps running with its previous
//affinity instead of silently producing unpinned measurements.
void set_cpu(int cpu_no)
{
	cpu_set_t mask;
	CPU_ZERO(&mask);
	CPU_SET(cpu_no, &mask);
	//a pid of 0 makes sched_setaffinity() act on the calling thread
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("set_cpu: sched_setaffinity");
		fprintf(stderr, "set_cpu: could not pin thread to CPU %d\n", cpu_no);
	}
}

//elapsed time from start to end, in nanoseconds
//start, end: two timestamps, with end expected to be the later one
long time_diff(struct timespec start, struct timespec end)
{
  time_t whole_sec = end.tv_sec - start.tv_sec;
  long frac_nsec = end.tv_nsec - start.tv_nsec;

  //borrow one second when the nanosecond fields alone would go negative
  if (frac_nsec < 0) {
    whole_sec -= 1;
    frac_nsec += 1000000000;
  }

  return whole_sec * 1000000000 + frac_nsec;
}

//argument block handed to bandwidth_thread() through pthread_create()
struct thread_arg_t {
	int tid;	//thread index assigned by main(), 0 .. NUM_THREADS-1
};

/* The 2nd-level TLB can cover 128MB of data, and we want guaranteed
** 2nd-level TLB hits. Total memory on KNC is limited (4GB in total,
** usually about 2.2GB free), so the array size per core is limited to
** 4*6=24MB. That stays below 128MB, so accesses always hit in the
** 2nd-level TLB, and the total for 60 cores is 24*60=1440MB, i.e.
** 1.4GB, which does not exceed the free memory size.
**
** ARRAY_SIZE is the number of unsigned int elements per thread. The
** expansion is fully parenthesized so that it composes correctly with
** any surrounding operator (e.g. x % ARRAY_SIZE or x / ARRAY_SIZE). */
#define ARRAY_SIZE (6*1024*1024/NUM_THREADS_PER_CORE)
//shared array; thread tid works on the ARRAY_SIZE elements starting at A[tid*ARRAY_SIZE]
volatile unsigned int *A;

//per-thread timestamps taken with clock_gettime(CLOCK_REALTIME) immediately
//before and after the timed region, indexed by thread id
//NOTE(review): the original comment names the system-wide timer "micetc" -
//presumably the clock source backing CLOCK_REALTIME on the target; confirm
struct timespec time_begin[NUM_THREADS], time_end[NUM_THREADS];
//per-thread result of time_diff(time_begin[tid], time_end[tid]), in nanoseconds
long long elapsed_time[NUM_THREADS];

//test bandwidth using L2 prefetch
//this is the routine for each thread: it evicts the thread's slice of A,
//then times a sweep of _mm_prefetch calls that covers the whole slice
//(this routine issues prefetches only; it performs no explicit reads or
//writes of the array)
//tid: thread index; selects the slice A[tid*ARRAY_SIZE ...] and the slots in
//     time_begin/time_end/elapsed_time that are written
void test_bandwidth_p(int tid)
{
    int i;  
    //each thread starts from A[tid*ARRAY_SIZE]
    volatile unsigned int *A_ti = &A[tid*ARRAY_SIZE];
    //evict the slice, one call pair every 16 elements (64 bytes with a
    //4-byte unsigned int - presumably one cache line; confirm)
    //NOTE(review): hints 1 and 2 presumably select the L1 and L2 caches -
    //confirm against the compiler's intrinsics documentation
    for(i = 0; i < ARRAY_SIZE; i=i+16){
      _mm_clevict((void *)&A_ti[i], 1);
      _mm_clevict((void *)&A_ti[i], 2);
    }
    //NOTE(review): cpuid is presumably used as a serializing instruction so
    //the evictions are complete before timing starts - confirm
    __asm cpuid

    volatile unsigned int *A_ti_begin;
    volatile unsigned int *A_ti_end;

    //line all threads up so that they enter the timed region together
    pthread_barrier_wait(&barrier[0]);

    //get system-wide begin time
    clock_gettime(CLOCK_REALTIME, &time_begin[tid]);

    //every iteration handles 1024 elements from the first half of the slice
    //and 1024 elements from the second half, so ARRAY_SIZE/2048 iterations
    //cover the complete slice
    for(i = 0; i < ARRAY_SIZE/2048; i++){
        //each thread starts from A+tid*ARRAY_SIZE
        A_ti_begin = &(A[tid*ARRAY_SIZE+i*1024]);
        A_ti_end = &(A[tid*ARRAY_SIZE+ARRAY_SIZE/2+i*1024]);

	//unroll two pages to minimize control overhead: 64 prefetches at a
	//16-element stride per pointer = 1024 elements each (one 4KB page
	//each if unsigned int is 4 bytes)
	//NOTE(review): hint 2 is presumably the "prefetch into L2" hint of
	//the target compiler - confirm
	_mm_prefetch((void *)A_ti_begin, 2);
	_mm_prefetch((void *)A_ti_end, 2);
	_mm_prefetch((void *)(A_ti_begin+16), 2);
	_mm_prefetch((void *)(A_ti_end+16), 2);
	_mm_prefetch((void *)(A_ti_begin+16*2), 2);
	_mm_prefetch((void *)(A_ti_end+16*2), 2);
	_mm_prefetch((void *)(A_ti_begin+16*3), 2);
	_mm_prefetch((void *)(A_ti_end+16*3), 2);
	_mm_prefetch((void *)(A_ti_begin+16*4), 2);
	_mm_prefetch((void *)(A_ti_end+16*4), 2);
	_mm_prefetch((void *)(A_ti_begin+16*5), 2);
	_mm_prefetch((void *)(A_ti_end+16*5), 2);
	_mm_prefetch((void *)(A_ti_begin+16*6), 2);
	_mm_prefetch((void *)(A_ti_end+16*6), 2);
	_mm_prefetch((void *)(A_ti_begin+16*7), 2);
	_mm_prefetch((void *)(A_ti_end+16*7), 2);
	_mm_prefetch((void *)(A_ti_begin+16*8), 2);
	_mm_prefetch((void *)(A_ti_end+16*8), 2);
	_mm_prefetch((void *)(A_ti_begin+16*9), 2);
	_mm_prefetch((void *)(A_ti_end+16*9), 2);

	_mm_prefetch((void *)(A_ti_begin+16*10), 2);
	_mm_prefetch((void *)(A_ti_end+16*10), 2);
	_mm_prefetch((void *)(A_ti_begin+16*11), 2);
	_mm_prefetch((void *)(A_ti_end+16*11), 2);
	_mm_prefetch((void *)(A_ti_begin+16*12), 2);
	_mm_prefetch((void *)(A_ti_end+16*12), 2);
	_mm_prefetch((void *)(A_ti_begin+16*13), 2);
	_mm_prefetch((void *)(A_ti_end+16*13), 2);
	_mm_prefetch((void *)(A_ti_begin+16*14), 2);
	_mm_prefetch((void *)(A_ti_end+16*14), 2);
	_mm_prefetch((void *)(A_ti_begin+16*15), 2);
	_mm_prefetch((void *)(A_ti_end+16*15), 2);
	_mm_prefetch((void *)(A_ti_begin+16*16), 2);
	_mm_prefetch((void *)(A_ti_end+16*16), 2);
	_mm_prefetch((void *)(A_ti_begin+16*17), 2);
	_mm_prefetch((void *)(A_ti_end+16*17), 2);
	_mm_prefetch((void *)(A_ti_begin+16*18), 2);
	_mm_prefetch((void *)(A_ti_end+16*18), 2);
	_mm_prefetch((void *)(A_ti_begin+16*19), 2);
	_mm_prefetch((void *)(A_ti_end+16*19), 2);

	_mm_prefetch((void *)(A_ti_begin+16*20), 2);
	_mm_prefetch((void *)(A_ti_end+16*20), 2);
	_mm_prefetch((void *)(A_ti_begin+16*21), 2);
	_mm_prefetch((void *)(A_ti_end+16*21), 2);
	_mm_prefetch((void *)(A_ti_begin+16*22), 2);
	_mm_prefetch((void *)(A_ti_end+16*22), 2);
	_mm_prefetch((void *)(A_ti_begin+16*23), 2);
	_mm_prefetch((void *)(A_ti_end+16*23), 2);
	_mm_prefetch((void *)(A_ti_begin+16*24), 2);
	_mm_prefetch((void *)(A_ti_end+16*24), 2);
	_mm_prefetch((void *)(A_ti_begin+16*25), 2);
	_mm_prefetch((void *)(A_ti_end+16*25), 2);
	_mm_prefetch((void *)(A_ti_begin+16*26), 2);
	_mm_prefetch((void *)(A_ti_end+16*26), 2);
	_mm_prefetch((void *)(A_ti_begin+16*27), 2);
	_mm_prefetch((void *)(A_ti_end+16*27), 2);
	_mm_prefetch((void *)(A_ti_begin+16*28), 2);
	_mm_prefetch((void *)(A_ti_end+16*28), 2);
	_mm_prefetch((void *)(A_ti_begin+16*29), 2);
	_mm_prefetch((void *)(A_ti_end+16*29), 2);

	_mm_prefetch((void *)(A_ti_begin+16*30), 2);
	_mm_prefetch((void *)(A_ti_end+16*30), 2);
	_mm_prefetch((void *)(A_ti_begin+16*31), 2);
	_mm_prefetch((void *)(A_ti_end+16*31), 2);
	_mm_prefetch((void *)(A_ti_begin+16*32), 2);
	_mm_prefetch((void *)(A_ti_end+16*32), 2);
	_mm_prefetch((void *)(A_ti_begin+16*33), 2);
	_mm_prefetch((void *)(A_ti_end+16*33), 2);
	_mm_prefetch((void *)(A_ti_begin+16*34), 2);
	_mm_prefetch((void *)(A_ti_end+16*34), 2);
	_mm_prefetch((void *)(A_ti_begin+16*35), 2);
	_mm_prefetch((void *)(A_ti_end+16*35), 2);
	_mm_prefetch((void *)(A_ti_begin+16*36), 2);
	_mm_prefetch((void *)(A_ti_end+16*36), 2);
	_mm_prefetch((void *)(A_ti_begin+16*37), 2);
	_mm_prefetch((void *)(A_ti_end+16*37), 2);
	_mm_prefetch((void *)(A_ti_begin+16*38), 2);
	_mm_prefetch((void *)(A_ti_end+16*38), 2);
	_mm_prefetch((void *)(A_ti_begin+16*39), 2);
	_mm_prefetch((void *)(A_ti_end+16*39), 2);

	_mm_prefetch((void *)(A_ti_begin+16*40), 2);
	_mm_prefetch((void *)(A_ti_end+16*40), 2);
	_mm_prefetch((void *)(A_ti_begin+16*41), 2);
	_mm_prefetch((void *)(A_ti_end+16*41), 2);
	_mm_prefetch((void *)(A_ti_begin+16*42), 2);
	_mm_prefetch((void *)(A_ti_end+16*42), 2);
	_mm_prefetch((void *)(A_ti_begin+16*43), 2);
	_mm_prefetch((void *)(A_ti_end+16*43), 2);
	_mm_prefetch((void *)(A_ti_begin+16*44), 2);
	_mm_prefetch((void *)(A_ti_end+16*44), 2);
	_mm_prefetch((void *)(A_ti_begin+16*45), 2);
	_mm_prefetch((void *)(A_ti_end+16*45), 2);
	_mm_prefetch((void *)(A_ti_begin+16*46), 2);
	_mm_prefetch((void *)(A_ti_end+16*46), 2);
	_mm_prefetch((void *)(A_ti_begin+16*47), 2);
	_mm_prefetch((void *)(A_ti_end+16*47), 2);
	_mm_prefetch((void *)(A_ti_begin+16*48), 2);
	_mm_prefetch((void *)(A_ti_end+16*48), 2);
	_mm_prefetch((void *)(A_ti_begin+16*49), 2);
	_mm_prefetch((void *)(A_ti_end+16*49), 2);

	_mm_prefetch((void *)(A_ti_begin+16*50), 2);
	_mm_prefetch((void *)(A_ti_end+16*50), 2);
	_mm_prefetch((void *)(A_ti_begin+16*51), 2);
	_mm_prefetch((void *)(A_ti_end+16*51), 2);
	_mm_prefetch((void *)(A_ti_begin+16*52), 2);
	_mm_prefetch((void *)(A_ti_end+16*52), 2);
	_mm_prefetch((void *)(A_ti_begin+16*53), 2);
	_mm_prefetch((void *)(A_ti_end+16*53), 2);
	_mm_prefetch((void *)(A_ti_begin+16*54), 2);
	_mm_prefetch((void *)(A_ti_end+16*54), 2);
	_mm_prefetch((void *)(A_ti_begin+16*55), 2);
	_mm_prefetch((void *)(A_ti_end+16*55), 2);
	_mm_prefetch((void *)(A_ti_begin+16*56), 2);
	_mm_prefetch((void *)(A_ti_end+16*56), 2);
	_mm_prefetch((void *)(A_ti_begin+16*57), 2);
	_mm_prefetch((void *)(A_ti_end+16*57), 2);
	_mm_prefetch((void *)(A_ti_begin+16*58), 2);
	_mm_prefetch((void *)(A_ti_end+16*58), 2);
	_mm_prefetch((void *)(A_ti_begin+16*59), 2);
	_mm_prefetch((void *)(A_ti_end+16*59), 2);

	_mm_prefetch((void *)(A_ti_begin+16*60), 2);
	_mm_prefetch((void *)(A_ti_end+16*60), 2);
	_mm_prefetch((void *)(A_ti_begin+16*61), 2);
	_mm_prefetch((void *)(A_ti_end+16*61), 2);
	_mm_prefetch((void *)(A_ti_begin+16*62), 2);
	_mm_prefetch((void *)(A_ti_end+16*62), 2);
	_mm_prefetch((void *)(A_ti_begin+16*63), 2);
	_mm_prefetch((void *)(A_ti_end+16*63), 2);
    }

    //get system-wide end time
    clock_gettime(CLOCK_REALTIME, &time_end[tid]);

    //wait until every thread has left the timed region
    pthread_barrier_wait(&barrier[1]);

    //time spent by this thread in the timed region, in nanoseconds
    elapsed_time[tid] = time_diff(time_begin[tid], time_end[tid]);

    //printf("thread %d time_begin: %llu\t time_end: %llu\t elapsed_time: %llu\n", tid, time_begin[tid].tv_sec * 1000000000 + time_begin[tid].tv_nsec, time_end[tid].tv_sec * 1000000000 + time_end[tid].tv_nsec, elapsed_time[tid]);

}

/*
 * Thread entry point: pins the calling thread to its CPU, touches its slice
 * of A and then runs the timed prefetch test several times.
 *
 * thread_arg: pointer to a struct thread_arg_t holding the thread index.
 * Returns NULL; main() does not look at the value, but a start routine must
 * return something because the pthread library reads it.
 */
void* bandwidth_thread(void* thread_arg)
{
	//test it 5 times to get stable results; every run overwrites this
	//thread's time_begin/time_end/elapsed_time entries, so only the last
	//run is what main() reports
	enum { NUM_RUNS = 5 };

	struct thread_arg_t *my_thread_arg = (struct thread_arg_t *)thread_arg;
	int tid = my_thread_arg->tid;

	//pin the thread to the core, KNC starts from core 1
	//NOTE(review): the factor 4 presumably is the number of hardware
	//threads per core - confirm
	set_cpu(tid*4/NUM_THREADS_PER_CORE+1);

	//initialize the page first, so that no 2nd-level TLB miss and all physical pages allocated
	//(one volatile read every 1024 elements; the sum itself is not needed)
	volatile unsigned int *A_ti = &A[tid*(ARRAY_SIZE)];
	unsigned int temp = 0;
	int i;
	for(i = 0; i < (ARRAY_SIZE); i=i+1024)
	{
		temp += A_ti[i];
	}
	(void)temp;

	int run;
	for(run = 0; run < NUM_RUNS; ++run)
	{
		test_bandwidth_p(tid);
	}

	return NULL;
}

int main()
{
        //allocate the array space for the thread
	A = (volatile unsigned int *)malloc(ARRAY_SIZE*NUM_THREADS*sizeof(volatile unsigned int));
	bzero((void *)A, ARRAY_SIZE*NUM_THREADS*sizeof(volatile unsigned int));

	//initialize the page first, so that all physical pages allocated
	int i = 0;
	for(i = 0; i < ARRAY_SIZE*NUM_THREADS; i=i+1024)
	  A[i]=i;
	
	pthread_t thread_id[NUM_THREADS];
	struct thread_arg_t thread_arg[NUM_THREADS];
	int ti;
	for(ti = 0; ti < NUM_BARRIERS; ++ti)
	{
		pthread_barrier_init(&barrier[ti], NULL, NUM_THREADS);
	}

	for(ti = 0; ti < NUM_THREADS; ++ti)
	{
		thread_arg[ti].tid = ti;
		pthread_create( &thread_id[ti], NULL, bandwidth_thread, (void *)&thread_arg[ti]);
	}

	for(ti = 0; ti < NUM_THREADS; ++ti)   
	{      
		pthread_join(thread_id[ti], NULL);    
	}

	for(ti = 0; ti < NUM_BARRIERS; ++ti)   
	{      
		pthread_barrier_destroy(&barrier[ti]);
	}

	//free the memory
	free((void *)A);

	//get earliest begin time and latest end time of whole system
	long long min_time_begin = time_begin[0].tv_sec * 1000000000 + time_begin[0].tv_nsec;
	long long max_time_end = time_end[0].tv_sec * 1000000000 + time_end[0].tv_nsec;

	for(ti = 1; ti < NUM_THREADS; ++ti)   
	{      
	  if( (min_time_begin - (time_begin[ti].tv_sec * 1000000000 + time_begin[ti].tv_nsec)) > 0 )
	    min_time_begin = time_begin[ti].tv_sec * 1000000000 + time_begin[ti].tv_nsec;

	  if( (max_time_end - (time_end[ti].tv_sec * 1000000000 + time_end[ti].tv_nsec)) < 0  )
	    max_time_end = time_end[ti].tv_sec * 1000000000 + time_end[ti].tv_nsec;

	}

	//calculate total elapsed time and bandwidth
	/*long long total_elapsed_time = max_time_end - min_time_begin;
	double bandwidth = (double)sizeof(unsigned int)*ARRAY_SIZE*NUM_THREADS/total_elapsed_time;
	printf("%.3lf\t", bandwidth);//*/
	//printf("%llu\n", total_elapsed_time);//*/

	//print the beign and end time
	for(ti = 0; ti < NUM_THREADS; ++ti){
	  printf("%llu\t", time_begin[ti].tv_sec * 1000000000 + time_begin[ti].tv_nsec);
	}
	printf("\n");

	for(ti = 0; ti < NUM_THREADS; ++ti){
	  printf("%llu\t", time_end[ti].tv_sec * 1000000000 + time_end[ti].tv_nsec);
	}
	printf("\n");//*/

	return 0;
}	
