#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "fft2d.h"
#include "alloc_array.h"

/* Return the smaller of a and b (returns b when the comparison a<b is false).
 * Declared static: a plain C99 `inline` definition provides no external
 * definition, so a non-inlined call would fail to link. */
static inline double min(double a, double b){return (a<b) ? a : b;}
/* Return the larger of a and b (returns b when the comparison a>b is false).
 * Declared static: a plain C99 `inline` definition provides no external
 * definition, so a non-inlined call would fail to link. */
static inline double max(double a, double b){return (a>b) ? a : b;}

/*
 * Round-trip test and timing driver for the MPI-parallel 2D complex FFT.
 *
 * Command line:  iseed l epscom nfft
 *   iseed   seed for the random number generator (int)
 *   l       linear system size (int); rounded to the nearest multiple of
 *           2*numprocs so the rows divide evenly among the ranks
 *   epscom  required accuracy (double)
 *   nfft    number of forward+backward transform pairs to perform (int)
 *
 * Each rank fills its slab of rows with random data, repeatedly applies a
 * forward and a backward transform to a pre-normalised copy, and afterwards
 * reports every element whose round-trip error exceeds epscom.  The wall
 * time of the transform loop is printed per rank.
 *
 * Returns 0 on success; exits with status 1 on bad arguments, and aborts
 * the MPI job on allocation failure.
 */
int main(int argc,char *argv[]){
fftw_complex **hc;
fftw_complex *work;
double **h_re,**h_im;
double epscom,fft_norm,err_re,err_im,err,abs_h,start_time,cpu_time;
int iseed,l,lm1,i,j,nfft,it,ialloc_err;
int cont=0;   /* dummy buffer for the zero-length token messages below */
int myid,numprocs,local_nx,local_x_start,local_x_end,n_local,
    local_y_start_at;
fft2d_mpi_plan fwplan,bwplan;
MPI_Status status;

   MPI_Init(&argc,&argv);
   MPI_Comm_rank( MPI_COMM_WORLD, &myid );
   MPI_Comm_size( MPI_COMM_WORLD, &numprocs );
   if (argc < 5) {
     if (myid == 0) {
       fprintf(stderr,"usage: %s iseed l epscom nfft\n",argv[0]);
       fprintf(stderr,"with\n"
                      "     iseed   seed for random number generator (int)\n"
                      "     l       (linear) size of the system (int)\n"
                      "     epscom  required accuracy (double)\n"
                      "     nfft    no. of FFTs to be performed (int)\n");
     }
     MPI_Finalize();
     exit(1);
   }
   iseed=atoi(argv[1]);
   l=atoi(argv[2]);
   /* round l to the nearest multiple of 2*numprocs */
   l=2*numprocs*(int)(((double)l)/(2*numprocs)+0.5);
   if (l <= 0) {
     /* A non-positive size would give empty slabs and invalid indexing
        of the arrays further down, so reject it up front. */
     if (myid == 0) {
       fprintf(stderr,"%s: l must round to a positive multiple of %i\n",
               argv[0],2*numprocs);
     }
     MPI_Finalize();
     exit(1);
   }
   seeds(iseed);
   epscom=atof(argv[3]);
   nfft=atoi(argv[4]);
     lm1=l-1;
     /* multiply as doubles so that l*l cannot overflow int */
     fft_norm=(double)l*(double)l;
     fwplan=fft2d_mpi_create_plan(MPI_COMM_WORLD,l,l,FFTW_FORWARD,
                                  FFTW_ESTIMATE);
     bwplan=fft2d_mpi_create_plan(MPI_COMM_WORLD,l,l,FFTW_BACKWARD,
                                  FFTW_ESTIMATE);
     /* each rank owns local_nx consecutive rows starting at local_x_start */
     local_nx=l/numprocs;
     local_x_start=myid*local_nx;
     local_x_end=local_x_start+local_nx-1;
     /* first locally owned index handed to the backward transform;
        identical to local_x_start for this square, evenly divided grid
        (presumably the layout "after transpose" -- TODO confirm) */
     local_y_start_at=local_x_start;
     n_local=local_nx*l;

     /* arrays are indexed [local_x_start..local_x_end][0..lm1] */
     hc=alloc_carray2d(local_x_start,local_x_end,0,lm1,&ialloc_err);
     if (ialloc_err) {
        fprintf(stderr,"id=%i: hc allocation error\n",myid);
        MPI_Abort(MPI_COMM_WORLD,ialloc_err);
        exit(ialloc_err);
     }
     h_re=alloc_darray2d(local_x_start,local_x_end,0,lm1,&ialloc_err);
     if (ialloc_err) {
        fprintf(stderr,"id=%i: h_re allocation error\n",myid);
        MPI_Abort(MPI_COMM_WORLD,ialloc_err);
        exit(ialloc_err);
     }
     h_im=alloc_darray2d(local_x_start,local_x_end,0,lm1,&ialloc_err);
     if (ialloc_err) {
        fprintf(stderr,"id=%i: h_im allocation error\n",myid);
        MPI_Abort(MPI_COMM_WORLD,ialloc_err);
        exit(ialloc_err);
     }
     /* a scratch buffer is only supplied when running on several ranks */
     if (numprocs > 1){
       work=malloc((size_t)n_local*sizeof *work);
       if (work == NULL){
         fprintf(stderr,"id=%i: work allocation error\n",myid);
         MPI_Abort(MPI_COMM_WORLD,1);
         exit(1);
       }
     }else{
       work=NULL;
     }/*end if*/
     /* fill the reference data; passing the first row's address with
        n_local elements assumes each slab is one contiguous block
        -- TODO confirm against alloc_darray2d */
     randv(&h_re[local_x_start][0],n_local);
     randv(&h_im[local_x_start][0],n_local);
     /* NOTE: for nfft < 1 the loop body never runs, so hc is never written
        here and the comparison further down is not meaningful. */
     start_time=MPI_Wtime();
     for (it=1;it<=nfft;it++){
       /* load the reference data, pre-divided by l*l so that the
          forward+backward pair is expected to reproduce h_re/h_im */
       for (i=local_x_start;i<=local_x_end;i++){
         for (j=0;j<l;j++){
           hc[i][j].re=h_re[i][j]/fft_norm;
           hc[i][j].im=h_im[i][j]/fft_norm;
         }/*end for*/
       }/*end for*/
       fft2d_mpi(fwplan,&hc[local_x_start][0],work);
       fft2d_mpi(bwplan,&hc[local_y_start_at][0],work);
     }/*end for*/
     cpu_time=MPI_Wtime()-start_time;
     /* Zero-length token passed from rank to rank: every rank waits for
        its predecessor before printing, so reports appear in rank order. */
     if (myid > 0)
         MPI_Recv(&cont,0,MPI_INT,myid-1,myid-1,MPI_COMM_WORLD,&status);
     for (i=local_x_start;i<=local_x_end;i++){
       for (j=0;j<l;j++){
         /* fabs, not abs: the integer abs() would truncate these
            floating-point differences to whole numbers */
         err_re=fabs(h_re[i][j]-hc[i][j].re);
         err_im=fabs(h_im[i][j]-hc[i][j].im);
         abs_h=sqrt(h_re[i][j]*h_re[i][j]+h_im[i][j]*h_im[i][j]);
         err=max(err_re,err_im);
         /* use the relative error unless the reference value is tiny */
         if (abs_h > epscom) err=err/abs_h;
         if (err > epscom){
           printf("id=%i: i=%i, j=%i : h=(%f,%f)\n",
                  myid,i,j,h_re[i][j],h_im[i][j]);
           printf("                    (%f,%f)\n",hc[i][j].re,hc[i][j].im);
         }/*end if*/
       }/*end for*/
     }/*end for*/
     fflush(stdout);
     if (myid < numprocs-1)
        MPI_Send(&cont,0,MPI_INT,myid+1,myid,MPI_COMM_WORLD);
     MPI_Barrier(MPI_COMM_WORLD);
     printf("id=%i: cpu-time %f\n",myid,cpu_time);
     free(work);   /* no-op when work is NULL (single-rank run) */
     MPI_Finalize();
     return 0;
}
