#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <fftw_mpi.h>
#include "randv.h"

/* Return the smaller of a and b (b when they compare equal or unordered).
 * Declared static inline: a plain C99/C11 `inline` definition provides no
 * external definition, so a non-inlined call would fail to link. */
static inline double min(double a, double b){return (a<b) ? a : b;}
/* Return the larger of a and b (b when they compare equal or unordered).
 * Declared static inline: a plain C99/C11 `inline` definition provides no
 * external definition, so a non-inlined call would fail to link. */
static inline double max(double a, double b){return (a>b) ? a : b;}

/*
 * Round-trip timing and accuracy check for the distributed 2D complex FFT
 * (FFTW 2 MPI interface).
 *
 * Command line:  iseed l epscom nfft
 *   iseed   seed passed to seeds()
 *   l       linear system size; rounded to the nearest multiple of
 *           2*numprocs before use
 *   epscom  error threshold used in the final comparison
 *   nfft    number of forward+backward transform pairs to time (>= 1)
 *
 * Each rank fills its slab of an l x l complex field with values from
 * randv(), runs nfft forward/backward transform pairs on a pre-scaled copy,
 * and then prints every local element whose error (relative when
 * |h| > epscom, absolute otherwise) exceeds epscom, followed by the elapsed
 * wall-clock time of the transform loop.
 *
 * Returns 0 on success; exits with status 1 on bad arguments and aborts the
 * MPI job if memory cannot be allocated.
 */
int main(int argc,char *argv[]){
   fftw_complex **hc;
   fftw_complex *work;
   double **h_re,**h_im;
   double epscom,fft_norm,err_re,err_im,err,abs_h,start_time,cpu_time;
   size_t n_total;
   int iseed,l,nfft,i,j,it;
   int myid,numprocs,local_nx,local_x_start,local_x_end,n_local,
       local_ny_at,local_y_start_at,local_size_total,local_work;
   fftwnd_mpi_plan fwplan,bwplan;

   MPI_Init(&argc,&argv);
   MPI_Comm_rank( MPI_COMM_WORLD, &myid );
   MPI_Comm_size( MPI_COMM_WORLD, &numprocs );
   if (argc < 5) {
     if (myid == 0) {
       fprintf(stderr,"usage: %s iseed l epscom nfft\n",argv[0]);
       fprintf(stderr,"with\n"
                      "     iseed   seed for random number generator (int)\n"
                      "     l       (linear) size of the system (int)\n"
                      "     epscom  required accuracy (double)\n"
                      "     nfft    no. of FFTs to be performed (int)\n");
     }
     MPI_Finalize();
     exit(1);
   }
   iseed=atoi(argv[1]);
   l=atoi(argv[2]);
   /* round l to the nearest multiple of 2*numprocs */
   l=2*numprocs*(int)(((double)l)/(2*numprocs)+0.5);
   epscom=atof(argv[3]);
   nfft=atoi(argv[4]);
   /* A small or negative l rounds to a non-positive size, and with nfft < 1
      the work array would be compared without ever having been filled. */
   if (l <= 0 || nfft < 1) {
     if (myid == 0) {
       fprintf(stderr,"%s: l must round to a positive multiple of %d "
                      "and nfft must be >= 1\n",argv[0],2*numprocs);
     }
     MPI_Finalize();
     exit(1);
   }
   seeds(iseed);
   /* element count and normalisation are formed outside int arithmetic so
      that l*l cannot overflow */
   n_total=(size_t)l*(size_t)l;
   fft_norm=(double)l*(double)l;
   fwplan=fftw2d_mpi_create_plan(MPI_COMM_WORLD,l,l,FFTW_FORWARD,
                                 FFTW_ESTIMATE);
   bwplan=fftw2d_mpi_create_plan(MPI_COMM_WORLD,l,l,FFTW_BACKWARD,
                                 FFTW_ESTIMATE);
   fftwnd_mpi_local_sizes(fwplan,&local_nx,&local_x_start,
                          &local_ny_at,&local_y_start_at,&local_size_total);

   /* Every rank allocates full l x l arrays as one contiguous block each,
      addressed through a table of row pointers; only the rank's own rows
      (and, for numprocs > 1, the scratch rows chosen below) are touched. */
   hc=malloc((size_t)l*sizeof *hc);
   h_re=malloc((size_t)l*sizeof *h_re);
   h_im=malloc((size_t)l*sizeof *h_im);
   if (hc == NULL || h_re == NULL || h_im == NULL) {
     fprintf(stderr,"%s: rank %d: out of memory\n",argv[0],myid);
     MPI_Abort(MPI_COMM_WORLD,1);
     exit(1);
   }
   hc[0]=malloc(n_total*sizeof **hc);
   h_re[0]=malloc(n_total*sizeof **h_re);
   h_im[0]=malloc(n_total*sizeof **h_im);
   if (hc[0] == NULL || h_re[0] == NULL || h_im[0] == NULL) {
     fprintf(stderr,"%s: rank %d: out of memory\n",argv[0],myid);
     MPI_Abort(MPI_COMM_WORLD,1);
     exit(1);
   }
   for (i=1;i<l;i++){
     hc[i]=hc[i-1]+l;
     h_re[i]=h_re[i-1]+l;
     h_im[i]=h_im[i-1]+l;
   }/*end for*/

   local_x_end=local_x_start+local_nx-1;
   n_local=local_nx*l;
   /* Scratch space for the transform: the rows following this rank's slab
      (wrapping to row 0 on the last rank).
      NOTE(review): assumes local_size_total does not exceed local_nx*l and
      that local_y_start_at addresses the same slab as local_x_start for
      this square, evenly divided grid -- confirm against the FFTW manual. */
   local_work=(local_x_end+1)%l;
   if (numprocs > 1){
     work=&hc[local_work][0];
   }else{
     work=NULL;
   }/*end if*/

   randv(&h_re[local_x_start][0],n_local);
   randv(&h_im[local_x_start][0],n_local);

   start_time=MPI_Wtime();
   for (it=1;it<=nfft;it++){
     /* reload the input, pre-divided by l*l, so that after the
        forward+backward pair hc is expected to match h again */
     for (i=local_x_start;i<=local_x_end;i++){
       for (j=0;j<l;j++){
         hc[i][j].re=h_re[i][j]/fft_norm;
         hc[i][j].im=h_im[i][j]/fft_norm;
       }/*end for*/
     }/*end for*/
     fftwnd_mpi(fwplan,1,&hc[local_x_start][0],work,FFTW_TRANSPOSED_ORDER);
     fftwnd_mpi(bwplan,1,&hc[local_y_start_at][0],work,
                FFTW_TRANSPOSED_ORDER);
   }/*end for*/
   cpu_time=MPI_Wtime()-start_time;

   for (i=local_x_start;i<=local_x_end;i++){
     for (j=0;j<l;j++){
       /* fabs, not abs: integer abs() would truncate every error below 1
          to zero and hide all realistic discrepancies */
       err_re=fabs(h_re[i][j]-hc[i][j].re);
       err_im=fabs(h_im[i][j]-hc[i][j].im);
       abs_h=sqrt(h_re[i][j]*h_re[i][j]+h_im[i][j]*h_im[i][j]);
       err=max(err_re,err_im);
       /* relative error unless the reference value is itself tiny */
       if (abs_h > epscom) err=err/abs_h;
       if (err > epscom){
         printf("i=%i, j=%i : h=(%f,%f)\n",i,j,h_re[i][j],h_im[i][j]);
         printf("               (%f,%f)\n",hc[i][j].re,hc[i][j].im);
       }/*end if*/
     }/*end for*/
   }/*end for*/
   printf("cpu-time: %f\n",cpu_time);

   /* release plans and buffers before shutting MPI down */
   fftwnd_mpi_destroy_plan(fwplan);
   fftwnd_mpi_destroy_plan(bwplan);
   free(hc[0]);
   free(h_re[0]);
   free(h_im[0]);
   free(hc);
   free(h_re);
   free(h_im);
   MPI_Finalize();
   return 0;
}
