// Matteo Lulli, Massimo Bernaschi, Giorgio Parisi 2013
// Physics Dept. 'Sapienza', University of Rome

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include "myRNG.h"

// Allocates the seed buffers of a congruential RNG array.
//   rngArray: descriptor whose N field gives the number of RNGT elements.
// On success d_rngSeeds comes from cudaMalloc and h_rngSeeds from
// cudaHostAlloc (cudaHostAllocDefault), each N*sizeof(RNGT) bytes.
// Returns 0 on success. Returns 1 when N is zero, in which case both
// pointers are set to NULL. CUDA errors are handled by MYCUDA_ERROR.
extern int CRNGArrayAlloc(CRNGArray_t *rngArray){
  if(rngArray->N != 0){
    const size_t seedBytes = rngArray->N*sizeof(RNGT);
    MYCUDA_ERROR( cudaMalloc((void **) &rngArray->d_rngSeeds, seedBytes) );
    MYCUDA_ERROR( cudaHostAlloc((void **) &rngArray->h_rngSeeds, seedBytes, cudaHostAllocDefault) );
    return 0;
  }

  // Empty array: nothing to allocate, leave the descriptor with NULL buffers.
  fprintf(stderr, "\n N = 0 rngArray!!\n");
  rngArray->h_rngSeeds = NULL;
  rngArray->d_rngSeeds = NULL;
  return 1;
}

// Allocates the seed buffers of a PR RNG array.
//   rngArray: descriptor whose N field gives the number of RNGT elements.
// d_rngSeeds comes from cudaMalloc and h_rngSeeds from cudaHostAlloc
// (cudaHostAllocDefault), each N*sizeof(RNGT) bytes. N is not checked here.
// Returns 0; CUDA errors are handled by MYCUDA_ERROR.
extern int PR_RNGArrayAlloc(PR_RNGArray_t *rngArray){
  const size_t seedBytes = rngArray->N*sizeof(RNGT);

  MYCUDA_ERROR( cudaMalloc((void **) &rngArray->d_rngSeeds, seedBytes) );
  MYCUDA_ERROR( cudaHostAlloc((void **) &rngArray->h_rngSeeds, seedBytes, cudaHostAllocDefault) );

  return 0;
}

// Releases the device and host seed buffers of a congruential RNG array and
// sets N to zero. The descriptor itself is not freed.
//   rngArray: descriptor to reset.
// Returns 0 on success, 1 if the descriptor or either buffer pointer is NULL.
// CUDA errors are handled by MYCUDA_ERROR.
extern int CRNGArrayReset(CRNGArray_t *rngArray){
  if(rngArray == NULL || rngArray->h_rngSeeds == NULL || rngArray->d_rngSeeds == NULL){
    fprintf(stderr, "\n Null Pointers in CRNGArrayReset\n");
    return 1;
  }
  rngArray->N = 0;

  // Clear each pointer right after its buffer is released so that a second
  // call is rejected by the NULL guard above instead of double-freeing.
  MYCUDA_ERROR( cudaFree(rngArray->d_rngSeeds) );
  rngArray->d_rngSeeds = NULL;
  MYCUDA_ERROR( cudaFreeHost(rngArray->h_rngSeeds) );
  rngArray->h_rngSeeds = NULL;
  return 0;
}

// Destroys a congruential RNG array: resets it via CRNGArrayReset and then
// free()s the descriptor.
// Returns 0 on success. If the reset reports an error the descriptor is left
// untouched and 1 is returned.
extern int CRNGArrayFree(CRNGArray_t *rngArray){
  const int resetFailed = CRNGArrayReset(rngArray);

  if(!resetFailed){
    free(rngArray);
    return 0;
  }

  fprintf(stderr, "\nError in RNGArrayFree\n");
  return 1;
}

// Releases the device and host seed buffers of a PR RNG array and sets N to
// zero. The descriptor itself is not freed and the pointers are not checked
// or cleared.
// Returns 0; CUDA errors are handled by MYCUDA_ERROR.
extern int PR_RNGArrayReset(PR_RNGArray_t *rngArray){
  // The element count is cleared before either buffer is released.
  rngArray->N = 0;

  // Device buffer first ...
  MYCUDA_ERROR(
    cudaFree(rngArray->d_rngSeeds)
  );

  // ... then the host buffer.
  MYCUDA_ERROR(
    cudaFreeHost(rngArray->h_rngSeeds)
  );

  return 0;
}

// Destroys a PR RNG array: resets it via PR_RNGArrayReset, free()s the
// descriptor and clears the caller's pointer.
//   rngArray: address of the caller's descriptor pointer.
// Returns 0 on success, 1 if rngArray or *rngArray is NULL or if the reset
// reports an error (the descriptor is then left untouched).
extern int PR_RNGArrayFree(PR_RNGArray_t **rngArray){
  if(rngArray == NULL || *rngArray == NULL){
    fprintf(stderr, "\nError in RNGArrayFree\n");
    return 1;
  }
  if(PR_RNGArrayReset(*rngArray)){
    fprintf(stderr, "\nError in RNGArrayFree\n");
    return 1;
  }
  free(*rngArray);
  // Null the caller's handle so it cannot be used or freed again.
  *rngArray = NULL;
  return 0;
}

// Fills the host seed buffer of a congruential RNG array with values read
// from /dev/urandom.
// Each value is masked with TWO31M1 (defined outside this file; presumably
// 2^31-1 -- confirm) and values that are zero after masking are discarded and
// re-drawn, so every stored seed is non-zero.
// Returns 0 on success, 1 if /dev/urandom cannot be opened or a read fails.
extern int CRNGArraySetURandom(CRNGArray_t *rngArray){
  FILE *urandom = fopen("/dev/urandom", "rb");
  if(urandom == NULL){
    fprintf(stderr, "\nError opening /dev/urandom CRNGArraySetURandom\n");
    return 1;
  }

  for(int i=0; i<rngArray->N; i++){
    RNGT seed = 0;
    do{
      // A failed read is reported immediately instead of retrying forever.
      if(fread(&seed, sizeof(RNGT), 1, urandom) != 1){
        fprintf(stderr, "\nError reading /dev/urandom CRNGArraySetURandom: wrong size\n");
        fclose(urandom);
        return 1;
      }
      seed = seed&TWO31M1;
    }while(seed == 0);  // keep drawing until the masked seed is non-zero
    rngArray->h_rngSeeds[i] = seed;
  }

  fclose(urandom);
  return 0;
}

// Fills the host seed buffer of a PR RNG array with values read from
// /dev/urandom. Zero values are discarded and re-drawn, so every stored
// element is non-zero. No masking is applied.
// Returns 0 on success, 1 if /dev/urandom cannot be opened or a read fails.
extern int PR_RNGArraySetURandom(PR_RNGArray_t *rngArray){
  FILE *urandom = fopen("/dev/urandom", "rb");
  if(urandom == NULL){
    fprintf(stderr, "\nError opening /dev/urandom PR_RNGArraySetURandom\n");
    return 1;
  }

  for(int i=0; i<rngArray->N; i++){
    RNGT seed = 0;
    do{
      // A failed read is reported immediately instead of retrying forever.
      if(fread(&seed, sizeof(RNGT), 1, urandom) != 1){
        fprintf(stderr, "\nError reading /dev/urandom PR_RNGArraySetURandom: wrong size\n");
        fclose(urandom);
        return 1;
      }
    }while(seed == 0);  // keep drawing until the value is non-zero
    rngArray->h_rngSeeds[i] = seed;
  }

  fclose(urandom);
  return 0;
}

// Copies the N seeds of a congruential RNG array from h_rngSeeds to
// d_rngSeeds with a blocking cudaMemcpy (host to device).
// Returns 0; CUDA errors are handled by MYCUDA_ERROR.
extern int H2DCRNGArray(CRNGArray_t *rngArray){
  const size_t seedBytes = rngArray->N*sizeof(RNGT);

  MYCUDA_ERROR( cudaMemcpy(rngArray->d_rngSeeds, rngArray->h_rngSeeds, seedBytes, cudaMemcpyHostToDevice) );

  return 0;
}

// Copies the N seeds of a congruential RNG array from d_rngSeeds back to
// h_rngSeeds with a blocking cudaMemcpy (device to host).
// Returns 0; CUDA errors are handled by MYCUDA_ERROR.
extern int D2HCRNGArray(CRNGArray_t *rngArray){
  const size_t seedBytes = rngArray->N*sizeof(RNGT);

  MYCUDA_ERROR( cudaMemcpy(rngArray->h_rngSeeds, rngArray->d_rngSeeds, seedBytes, cudaMemcpyDeviceToHost) );

  return 0;
}

// Copies the N elements of a PR RNG array from h_rngSeeds to d_rngSeeds with
// a blocking cudaMemcpy (host to device).
// Returns 0; CUDA errors are handled by MYCUDA_ERROR.
extern int H2DPR_RNGArray(PR_RNGArray_t *rngArray){
  const size_t seedBytes = rngArray->N*sizeof(RNGT);

  MYCUDA_ERROR( cudaMemcpy(rngArray->d_rngSeeds, rngArray->h_rngSeeds, seedBytes, cudaMemcpyHostToDevice) );

  return 0;
}

// Copies the N elements of a PR RNG array from d_rngSeeds back to h_rngSeeds
// with a blocking cudaMemcpy (device to host).
// Returns 0; CUDA errors are handled by MYCUDA_ERROR.
extern int D2HPR_RNGArray(PR_RNGArray_t *rngArray){
  const size_t seedBytes = rngArray->N*sizeof(RNGT);

  MYCUDA_ERROR( cudaMemcpy(rngArray->h_rngSeeds, rngArray->d_rngSeeds, seedBytes, cudaMemcpyDeviceToHost) );

  return 0;
}

// Loads the host seed buffer of a congruential RNG array from the binary file
// "./CRNGSeeds<t as 16 upper-case hex digits>".
//   rngArray: destination; exactly N elements of RNGT are expected.
//   t:        value encoded in the file name.
// Returns 0 on success, 1 if the file cannot be opened or fewer than N
// elements were read. The device copy is not updated here.
extern int ReadCRNGArray(CRNGArray_t *rngArray, unsigned long long int t){
  char path[200];
  snprintf(path, sizeof(path), "./CRNGSeeds%016llX", t);

  FILE *in = fopen(path, "rb");
  if(in == NULL){
    fprintf(stderr, "\nError opening congruentialSeeds ReadCRNGArray\n");
    return 1;
  }

  int nRead = fread(rngArray->h_rngSeeds, sizeof(RNGT), rngArray->N, in);
  fclose(in);

  if(nRead == rngArray->N) return 0;

  fprintf(stderr, "\nError reading congruntialSeeds ReadCRNGArray: wrong size\n");
  return 1;
}

// Rebuilds the host seed buffer from the per-process dump files
// "./CRNGSeeds<t hex>_mpi<proc>", proc = 0 .. recoverMPIprocs-1.
// File proc supplies N/recoverMPIprocs elements, stored contiguously at
// offset proc*N/recoverMPIprocs (no per-system reshuffling).
//   sysN is accepted but not used by this function.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// elements read differs from N.
extern int ReadCRNGArrayMpiRecover2(CRNGArray_t *rngArray, unsigned long long int t, int sysN, int recoverMPIprocs){
  char path[200];
  int totalRead = 0;

  for(int rank = 0; rank < recoverMPIprocs; rank++){
    snprintf(path, sizeof(path), "./CRNGSeeds%016llX_mpi%d", t, rank);

    FILE *in = fopen(path, "rb");
    if(in == NULL){
      fprintf(stderr, "\nError opening congruentialSeeds ReadCRNGArray_mpi%d\n", rank);
      return 1;
    }

    // Slice of the host buffer that belongs to this rank.
    int chunkLen = rngArray->N/recoverMPIprocs;
    int chunkStart = rank*rngArray->N/recoverMPIprocs;

    totalRead += fread(rngArray->h_rngSeeds + chunkStart, sizeof(RNGT), chunkLen, in);
    fclose(in);
  }

  if(totalRead == rngArray->N) return 0;

  fprintf(stderr, "\nError reading congruntialSeeds ReadCRNGArray: wrong size\n");
  return 1;
}

// Rebuilds the host seed buffer from the per-process dump files
// "./CRNGSeeds<t hex>_mpi<proc>", proc = 0 .. recoverMPIprocs-1, reordering
// the data so that each system's seeds are contiguous.
//
// The files are first read back to back into a temporary buffer (file proc
// at offset proc*N/recoverMPIprocs). Then, for every system sys and process
// proc, the block of N/recoverMPIprocs/sysN elements at
//   swap[sys*mpiSysStride + proc*mpiStride]
// is copied to
//   h_rngSeeds[sys*sysStride + proc*mpiSysStride].
//
//   sysN:            number of systems (must be > 0).
//   recoverMPIprocs: number of dump files to read (must be > 0).
// Returns 0 on success, 1 on invalid arguments, allocation failure, a file
// that cannot be opened, or a total element count different from N.
extern int ReadCRNGArrayMpiRecover(CRNGArray_t *rngArray, unsigned long long int t, int sysN, int recoverMPIprocs){
  char rngFileName[200];
  int chkcp = 0;

  // Both values are used as divisors below.
  if(sysN <= 0 || recoverMPIprocs <= 0){
    fprintf(stderr, "\nError in ReadCRNGArrayMpiRecover: sysN and recoverMPIprocs must be positive\n");
    return 1;
  }

  RNGT *swap = (RNGT *)malloc(rngArray->N*sizeof(RNGT));
  if(swap == NULL){
    fprintf(stderr, "\nError allocation swap in ReadCRNGArrayMpiRecover\n");
    return 1;
  }

  for(int proc = 0; proc < recoverMPIprocs; proc++){
    snprintf(rngFileName, sizeof(rngFileName), "./CRNGSeeds%016llX_mpi%d", t, proc);
    FILE *seeds = fopen(rngFileName, "rb");

    if(seeds == NULL){
      fprintf(stderr, "\nError opening congruentialSeeds ReadCRNGArray_mpi%d\n", proc);
      free(swap);  // do not leak the staging buffer on the error path
      return 1;
    }

    int offset = proc*rngArray->N/recoverMPIprocs;
    int readDim = rngArray->N/recoverMPIprocs;

    chkcp += fread(swap + offset, sizeof(RNGT), readDim, seeds);
    fclose(seeds);
  }

  if(chkcp^rngArray->N){
    fprintf(stderr, "\nError reading congruntialSeeds ReadCRNGArray: wrong size\n");
    free(swap);
    return 1;
  }

  int mpiSysStride = rngArray->N/recoverMPIprocs/sysN;
  int sysStride = rngArray->N/sysN;
  int mpiStride = rngArray->N/recoverMPIprocs;

  printf("mpiSysStride: %d, sysStride: %d, mpiStride: %d\n",
         mpiSysStride, sysStride, mpiStride);

  // Scatter each (system, process) block from file order to system order.
  for(int sys=0; sys < sysN; sys++){
    for(int proc = 0; proc < recoverMPIprocs; proc++){
      for(int i=0; i<mpiSysStride; i++){
        rngArray->h_rngSeeds[i + sys*sysStride + proc*mpiSysStride] = swap[i + sys*mpiSysStride + proc*mpiStride];
      }
    }
  }

  free(swap);
  return 0;
}

// Splits the host seed buffer into consecutive chunks of V/2 elements and
// writes chunk i to the binary file "./dummyRun<i>/CRNGSeeds".
//   V: each file receives V/2 seeds; N/(V/2) files are written.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// elements written differs from N.
// NOTE(review): V < 2 makes V/2 zero and the division below is then undefined.
extern int DumpCRNGArrayDir(CRNGArray_t *rngArray, int V){
  const int chunkLen = V/2;
  const int nChunks = rngArray->N/chunkLen;
  char path[200];
  int totalWritten = 0;

  for(int chunk=0; chunk<nChunks; chunk++){
    snprintf(path, sizeof(path), "./dummyRun%d/CRNGSeeds", chunk);

    FILE *out = fopen(path, "wb");
    if(out == NULL){
      fprintf(stderr, "\nError opening congruentialSeeds DumpCRNGArray\n");
      return 1;
    }

    const int chunkStart = chunk*chunkLen;
    totalWritten += fwrite(rngArray->h_rngSeeds + chunkStart, sizeof(RNGT), chunkLen, out);
    fflush(out);
    fclose(out);
  }

  if(totalWritten == rngArray->N) return 0;

  fprintf(stderr, "\nError writing congruntialSeeds DumpCRNGArray: wrong size\n");
  return 1;
}

// Writes the N host seeds of a congruential RNG array to a binary file.
//   t:      value encoded in the file name as 16 upper-case hex digits.
//   nprocs: 1  -> file "./CRNGSeeds<t>"
//           >1 -> file "./CRNGSeeds<t>_mpi<mpiid>"
//           <1 -> rejected (no file name can be built).
//   mpiid:  suffix used only when nprocs > 1.
// Returns 0 on success, 1 on invalid nprocs, if the file cannot be opened, or
// if fewer than N elements were written.
extern int DumpCRNGArray(CRNGArray_t *rngArray,  unsigned long long int t, int nprocs, int mpiid){
  char rngFileName[200];
  int chkwr = 0;

  if(nprocs == 1){
    snprintf(rngFileName, sizeof(rngFileName), "./CRNGSeeds%016llX", t);
  }else if(nprocs > 1){
    snprintf(rngFileName, sizeof(rngFileName), "./CRNGSeeds%016llX_mpi%d", t, mpiid);
  }else{
    // Without this branch fopen would receive an uninitialized buffer.
    fprintf(stderr, "\nError in DumpCRNGArray: invalid nprocs %d\n", nprocs);
    return 1;
  }

  FILE *seeds = fopen(rngFileName, "wb");
  if(seeds == NULL){
    fprintf(stderr, "\nError opening congruentialSeeds DumpCRNGArray\n");
    return 1;
  }

  chkwr += fwrite(rngArray->h_rngSeeds, sizeof(RNGT), rngArray->N, seeds);
  fflush(seeds);
  fclose(seeds);

  if(chkwr^rngArray->N){
    fprintf(stderr, "\nError writing congruntialSeeds DumpCRNGArray: wrong size\n");
    return 1;
  }

  return 0;
}

// Writes the host seed buffer as recoverMPIprocs binary files named
// "./CRNGSeeds<t hex>_mpi<proc>".
// File proc receives, for sys = 0 .. sysN-1 in order, the block of
// N/recoverMPIprocs/sysN elements starting at
//   h_rngSeeds[sys*(N/sysN) + proc*(N/recoverMPIprocs/sysN)].
//
//   sysN:            number of systems (must be > 0).
//   recoverMPIprocs: number of files to write (must be > 0).
// Returns 0 on success, 1 on invalid arguments, if a file cannot be opened,
// or if any block is written short.
extern int DumpCRNGArrayMpiRecover(CRNGArray_t *rngArray,  unsigned long long int t, int sysN, int recoverMPIprocs){
  char rngFileName[200];

  // Both values are used as divisors below.
  if(sysN <= 0 || recoverMPIprocs <= 0){
    fprintf(stderr, "\nError in DumpCRNGArrayMpiRecover: sysN and recoverMPIprocs must be positive\n");
    return 1;
  }

  int mpiSysStride = rngArray->N/recoverMPIprocs/sysN;
  int sysStride = rngArray->N/sysN;

  for(int proc = 0; proc < recoverMPIprocs; proc++){

    snprintf(rngFileName, sizeof(rngFileName), "./CRNGSeeds%016llX_mpi%d", t, proc);
    FILE *seeds = fopen(rngFileName, "wb");
    if(seeds == NULL){
      fprintf(stderr, "\nError opening congruentialSeeds DumpCRNGArray\n");
      return 1;
    }

    for(int sys=0; sys < sysN; sys++){
      int offset = sys*sysStride + proc*mpiSysStride;
      int writeDim = mpiSysStride;
      size_t written = fwrite(rngArray->h_rngSeeds + offset, sizeof(RNGT), writeDim, seeds);
      // Each block is checked on its own, so the test stays valid even when
      // N is not an exact multiple of recoverMPIprocs*sysN.
      if(written != (size_t)writeDim){
        fprintf(stderr, "\nError writing congruentialSeeds DumpCRNGArrayMpiRecover: wrong size\n");
        fclose(seeds);
        return 1;
      }
      fflush(seeds);
    }
    fclose(seeds);
  }

  return 0;
}



// Writes the N host elements of a PR RNG array to the binary file
// "./PR_RNGSeeds<t as 16 upper-case hex digits>".
// Returns 0 on success, 1 if the file cannot be opened or fewer than N
// elements were written.
extern int DumpPR_RNGArray(PR_RNGArray_t *rngArray, unsigned long long int t){
  char path[200];
  snprintf(path, sizeof(path), "./PR_RNGSeeds%016llX", t);

  FILE *out = fopen(path, "wb");
  if(out == NULL){
    fprintf(stderr, "\nError opening congruentialSeeds DumpCRNGArray\n");
    return 1;
  }

  int nWritten = fwrite(rngArray->h_rngSeeds, sizeof(RNGT), rngArray->N, out);
  fflush(out);
  fclose(out);

  if(nWritten == rngArray->N) return 0;

  fprintf(stderr, "\nError writing congruntialSeeds DumpPR_RNGArray: wrong size\n");
  return 1;
}

// Reads `size` elements from the start of the binary file "congruentialSeeds"
// (current directory) into the host seed buffer at element index `offset`.
// Returns 0 on success, 1 if the file cannot be opened or fewer than `size`
// elements were read.
extern int ReadCRNGArrayOff(CRNGArray_t *rngArray, int offset, int size){
  FILE *in = fopen("congruentialSeeds", "rb");
  if(in == NULL){
    fprintf(stderr, "\nError opening congruentialSeeds ReadCRNGArray\n");
    return 1;
  }

  int nRead = fread(rngArray->h_rngSeeds + offset, sizeof(RNGT), size, in);
  fclose(in);

  if(nRead == size) return 0;

  fprintf(stderr, "\nError reading congruntialSeeds ReadCRNGArrayOff: wrong size\n");
  return 1;
}

// Writes `size` elements of the host seed buffer, starting at element index
// `offset`, to the binary file "CRNGSeeds" (current directory, truncated).
// Returns 0 on success, 1 if the file cannot be opened or fewer than `size`
// elements were written.
extern int DumpCRNGArrayOff(CRNGArray_t *rngArray, int offset, int size){
  FILE *out = fopen("CRNGSeeds", "wb");
  if(out == NULL){
    fprintf(stderr, "\nError opening congruentialSeeds DumpCRNGArray\n");
    return 1;
  }

  int nWritten = fwrite(rngArray->h_rngSeeds + offset, sizeof(RNGT), size, out);
  fflush(out);
  fclose(out);

  if(nWritten == size) return 0;

  fprintf(stderr, "\nError reading congruntialSeeds DumpCRNGArrayOff: wrong size\n");
  return 1;
}

// Loads the host buffer of a PR RNG array from the binary file
// "./PR_RNGSeeds<t as 16 upper-case hex digits>".
//   rngArray: destination; exactly N elements of RNGT are expected.
//   t:        value encoded in the file name.
// Returns 0 on success, 1 if the file cannot be opened or fewer than N
// elements were read. The device copy is not updated here.
extern int ReadPR_RNGArray(PR_RNGArray_t *rngArray, unsigned long long int t){
  char rngFileName[200];
  int chkrd = 0;

  snprintf(rngFileName, sizeof(rngFileName), "./PR_RNGSeeds%016llX", t);
  FILE *seeds = fopen(rngFileName, "rb");
  if(seeds == NULL){
    fprintf(stderr, "\nError opening PR_RNGSeeds ReadPR_RNGArray\n");
    return 1;
  }
  chkrd = fread(rngArray->h_rngSeeds, sizeof(RNGT), rngArray->N, seeds);
  // No fflush here: flushing a stream opened for input only is undefined in
  // ISO C and there is nothing to flush after a read.
  fclose(seeds);

  if(chkrd^rngArray->N){
    fprintf(stderr, "\nError reading PR_RNGSeeds ReadPR_RNGArray: wrong size\n");
    return 1;
  }

  return 0;
}

// Applies PR32 `steps` times to the host buffer for each of N/PRWHEEL
// interleaved sequences.
// For sequence k the four indices start at 61*threads + k, (61-24)*threads + k,
// (61-55)*threads + k and k; after every PR32 call each index is advanced by
// `threads` modulo N.
// PR32 is defined outside this file; what it does with the four indices is
// not visible here.
// Always returns 0.
extern int PR32Mix(PR_RNGArray_t *rngArray, int steps){
  int threads = rngArray->N/PRWHEEL;

  for(int seq=0; seq<threads; seq++){
    int ip0 = 61*threads + seq;
    int ip1 = (61-24)*threads + seq;
    int ip2 = (61-55)*threads + seq;
    int ip3 = seq;

    for(int step=0; step<steps; step++){
      PR32(rngArray->h_rngSeeds, ip0, ip1, ip2, ip3);

      // Move every index one slot along this sequence's wheel.
      ip0 += threads;  ip0 %= rngArray->N;
      ip1 += threads;  ip1 %= rngArray->N;
      ip2 += threads;  ip2 %= rngArray->N;
      ip3 += threads;  ip3 %= rngArray->N;
    }
  }

  return 0;
}

// Applies PR32 `steps` times to the host buffer for each of N/PRWHEEL
// interleaved sequences, starting from caller-supplied base indices.
//   ip00..ip03: base indices; sequence k starts at ip0X + k.
// After every PR32 call each index is advanced by `threads` modulo N.
// Every sequence restarts from the same base indices.
// Always returns 0.
extern int PR32Evolve(PR_RNGArray_t *rngArray, int steps,
                      int ip00, int ip01, int ip02, int ip03){
  int threads = rngArray->N/PRWHEEL;

  for(int seq=0; seq<threads; seq++){
    int ip0 = ip00 + seq;
    int ip1 = ip01 + seq;
    int ip2 = ip02 + seq;
    int ip3 = ip03 + seq;

    for(int step=0; step<steps; step++){
      PR32(rngArray->h_rngSeeds, ip0, ip1, ip2, ip3);

      // Move every index one slot along this sequence's wheel.
      ip0 += threads;  ip0 %= rngArray->N;
      ip1 += threads;  ip1 %= rngArray->N;
      ip2 += threads;  ip2 %= rngArray->N;
      ip3 += threads;  ip3 %= rngArray->N;
    }
  }

  return 0;
}

// Applies PR32 `steps` times to the host buffer for each of N/PRWHEEL
// interleaved sequences, addressing elements as slot*threads + k.
//   ip0..ip3: wheel slots; each is advanced by one and masked with
//             PRWHEELMASK after every PR32 call.
// NOTE(review): the slots are not restored when moving to the next sequence,
// so sequence k starts from wherever sequence k-1 ended. PR32Evolve restarts
// every sequence from its base indices -- confirm this difference is intended.
// Always returns 0.
extern int PR32EvolveMod(PR_RNGArray_t *rngArray, int steps,
                         int ip0, int ip1, int ip2, int ip3){
  int threads = rngArray->N/PRWHEEL;

  for(int th=0; th<threads; th++){
    int step = 0;
    while(step < steps){
      PR32(rngArray->h_rngSeeds, ip0*threads + th, ip1*threads + th, ip2*threads + th, ip3*threads + th);

      // Advance each slot by one position on the wheel.
      ip0 = (ip0 + 1)&PRWHEELMASK;
      ip1 = (ip1 + 1)&PRWHEELMASK;
      ip2 = (ip2 + 1)&PRWHEELMASK;
      ip3 = (ip3 + 1)&PRWHEELMASK;

      step++;
    }
  }

  return 0;
}



// Creates a congruential RNG array of N seeds: allocates the descriptor and
// its device/host buffers, seeds the host buffer from /dev/urandom and copies
// it to the device.
// Returns the new descriptor, or NULL if any step fails. On failure
// everything allocated so far is released.
extern CRNGArray_t *CRNGArrayInit(int N){
  CRNGArray_t *rngArray = (CRNGArray_t *)malloc(sizeof(CRNGArray_t));
  if(rngArray == NULL){
    fprintf(stderr, "\nError malloc RNGArrayInit\n");
    return NULL;
  }
  rngArray->N = N;
  // malloc does not zero the struct; start from NULL so the cleanup below
  // can tell which buffers were actually allocated.
  rngArray->d_rngSeeds = NULL;
  rngArray->h_rngSeeds = NULL;

  if(CRNGArrayAlloc(rngArray) || CRNGArraySetURandom(rngArray) || H2DCRNGArray(rngArray)){
    // Best-effort cleanup: the original failure has already been reported.
    if(rngArray->d_rngSeeds != NULL) (void)cudaFree(rngArray->d_rngSeeds);
    if(rngArray->h_rngSeeds != NULL) (void)cudaFreeHost(rngArray->h_rngSeeds);
    free(rngArray);
    return NULL;
  }

  return rngArray;
}

// Creates a PR RNG array of N elements: allocates the descriptor and its
// device/host buffers, fills the host buffer from /dev/urandom, runs
// PR32Mix for `steps` steps on it and copies the result to the device.
// Returns the new descriptor, or NULL if any step fails. On failure
// everything allocated so far is released.
extern PR_RNGArray_t *PR_RNGArrayInit(int N, int steps){
  PR_RNGArray_t *rngArray = (PR_RNGArray_t *)malloc(sizeof(PR_RNGArray_t));
  if(rngArray == NULL){
    fprintf(stderr, "\nError malloc RNGArrayInit\n");
    return NULL;
  }
  rngArray->N = N;
  // malloc does not zero the struct; start from NULL so the cleanup below
  // can tell which buffers were actually allocated.
  rngArray->d_rngSeeds = NULL;
  rngArray->h_rngSeeds = NULL;

  if(PR_RNGArrayAlloc(rngArray) || PR_RNGArraySetURandom(rngArray) ||
     PR32Mix(rngArray, steps) || H2DPR_RNGArray(rngArray)){
    // Best-effort cleanup: the original failure has already been reported.
    if(rngArray->d_rngSeeds != NULL) (void)cudaFree(rngArray->d_rngSeeds);
    if(rngArray->h_rngSeeds != NULL) (void)cudaFreeHost(rngArray->h_rngSeeds);
    free(rngArray);
    return NULL;
  }

  return rngArray;
}

