// Matteo Lulli, Massimo Bernaschi, Giorgio Parisi 2013
// Physics Dept. 'Sapienza', University of Rome
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include "cudaEA3DMGPU.h"
#include "myRNG.h"
#include "mpiexch.h"
#include "MSC.h"

// Aliases for the random-number constant and generators used in this file.
// TWO32M1, RANDTWO31M1IMP and RANDTWO31M1BIS are not defined in this file
// (presumably they come from myRNG.h -- TODO confirm).
// MYMAXRAND is the scale factor applied to the acceptance thresholds in
// MCWeightsInit.
#define MYMAXRAND TWO32M1
// GPURAND(seed) is expanded as a statement by the UPDATE_* macros right before
// `seed` is compared against the d_betaExp* thresholds.
#define GPURAND(seed) RANDTWO31M1IMP(seed)
#define CPURAND(seed) RANDTWO31M1BIS(seed)

// Branch-free wrap-around helpers for integer coordinates.
//   SP(a, m):  a when a < m, otherwise 0.
//   SM(a, m):  a when a >= 0, otherwise a + m.
//   SMM(a, m): a - 1 when a >= 1, otherwise a - 1 + m.
// Every argument is parenthesised so that an expression argument (for example
// `x|y`) is not re-associated by operator precedence inside the expansion.
// Arguments are still evaluated more than once: they must have no side effects.
#define SP(a, m) ((a)&(~(-((a) >= (m)))))
#define SM(a, m) ((a)+((-((a) < 0))&(m)))
#define SMM(a, m) ((a) - 1 +((-((a) < 1))&(m)))

// Powers of two: 2^10 and 2^30 (named after one kilobyte / one gigabyte).
#define KB1  (1<<10)
#define GB1  (1<<30)

// MYCUDA_ERROR(functionCall): evaluates functionCall once as a cudaError_t.
// If the result is not cudaSuccess, prints the CUDA error string together with
// the line and file of the expansion to stderr and executes `return 1;` in the
// enclosing function. It expands to a plain brace block, so it can only be used
// inside functions whose return type accepts the value 1.
#define MYCUDA_ERROR(functionCall)\
  {cudaError_t error_id = functionCall;\
  if(error_id != cudaSuccess){\
  fprintf(stderr, "\nError %s at line %d of %s\n",\
          cudaGetErrorString(error_id), __LINE__, __FILE__);\
  return 1;\
  }\
  }

// MYCUDA_LAST(myString): queries cudaGetLastError(). If it is not cudaSuccess,
// prints the CUDA error string, the line and file of the expansion and the
// caller-supplied label myString (formatted with %s) to stderr, then executes
// `return 1;` in the enclosing function.
#define MYCUDA_LAST(myString)\
  {cudaError_t error_id = cudaGetLastError();	\
  if(error_id != cudaSuccess){\
  fprintf(stderr, "\nError %s at line %d of %s for %s\n",\
          cudaGetErrorString(error_id), __LINE__, __FILE__, myString);	\
  return 1;\
  }\
  }

// SUM_111_3(j0,j1,i0,i1,i2): bitwise full adder. For every bit position of the
// MSC words i0, i1 and i2 it adds the three bits; j0 receives the sum bit and
// j1 the carry bit. The macro DECLARES j0 and j1 as new MSC variables in the
// current scope. The last definition line ends with a backslash, so the blank
// line that follows is part of the macro and must stay blank.
#define SUM_111_3(j0,j1,i0,i1,i2)\
MSC j1 = (i0) ^ (i1) ; \
MSC j0 = j1 ^ (i2) ;\
j1 = ((i0)&(i1)) ^ (j1&(i2)) ;\

//j0 is the least significant bit: 2*j1+j0

// SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2): bitwise addition of the two
// 2-bit numbers (j1,j0) and (j3,j2) into the 3-bit result
// (sigma2,sigma1,sigma0), independently for every bit position of the MSC words.
// sigma0 temporarily holds the carry out of the low bits (j0 & j2) before it is
// overwritten with the low sum bit. The macro DECLARES sigma0, sigma1 and
// sigma2 as new MSC variables; the j arguments are pasted unparenthesised.
// The last code line ends with a backslash, so the trailing comment line is
// spliced into the definition (and then discarded as a comment).
#define SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2)\
MSC sigma0 = j0 & j2 ;\
MSC sigma2 = j1 ^ j3 ;\
MSC sigma1 = sigma2 ^ sigma0 ;\
sigma2 = (j1 & j3) ^ (sigma2 & sigma0) ;\
sigma0 = j0 ^ j2 ;\
//sigma0 is the least significant bit: 4*sigma2+2*sigma1+sigma0

// UPDATE_RED2(a,an): updates the word a[i], re-reading a[i] for every term.
// Expects i, spx, spy, spz, smx, smy, smz and seed to be in scope where the
// macro is expanded. Six words are built as a[i] ^ neighbour ^ coupling: the
// neighbours are fetched from texture `an` at sp*/sm*, the couplings from
// texture_Jp*/texture_Jm* at index i. SUM_111_3 and SUM_33_6 add them bit by
// bit into the 3-bit count sigma2:sigma1:sigma0. After one GPURAND(seed) step a
// bit is flipped when its count is below 4, or when the count is 4, 5 or 6 and
// seed is below d_betaExp4, d_betaExp8 or d_betaExp12 respectively. The same
// seed value is used for every bit of the word.
#define UPDATE_RED2(a,an) \
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jpx, i),\
        a[i]^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jpy, i),\
        a[i]^tex1Dfetch(an, spz)^tex1Dfetch(texture_Jpz, i));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jmx, i),\
        a[i]^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jmy, i),\
        a[i]^tex1Dfetch(an, smz)^tex1Dfetch(texture_Jmz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i]^=flip;\
        }

// UPDATE_RED(a,an): same computation as UPDATE_RED2, but a[i] is read once into
// the volatile local `spin`, which is used in all six terms, and the result is
// stored back with a[i] = spin^flip.
// Expects i, spx, spy, spz, smx, smy, smz and seed to be in scope where the
// macro is expanded. Neighbours come from texture `an` at sp*/sm*, couplings
// from texture_Jp*/texture_Jm* at index i. A bit is flipped when its 3-bit
// count sigma2:sigma1:sigma0 is below 4, or when the count is 4, 5 or 6 and
// seed (after one GPURAND step) is below d_betaExp4, d_betaExp8 or d_betaExp12.
#define UPDATE_RED(a,an) \
        {volatile MSC spin = a[i];\
	SUM_111_3(j0,j1,\
        spin^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jpx, i),\
        spin^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jpy, i),\
        spin^tex1Dfetch(an, spz)^tex1Dfetch(texture_Jpz, i));\
        SUM_111_3(j2,j3,\
        spin^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jmx, i),\
        spin^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jmy, i),\
        spin^tex1Dfetch(an, smz)^tex1Dfetch(texture_Jmz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i] = spin^flip;\
        }

// UPDATE_RED_SLICED(a,an): variant of UPDATE_RED2 in which the third term of
// the first group fetches the neighbour from `an` at index i (paired with
// texture_Jpz at i) instead of at spz; spz is therefore not used.
// Expects i, spx, spy, smx, smy, smz and seed to be in scope where the macro is
// expanded. A bit of a[i] is flipped when its 3-bit count sigma2:sigma1:sigma0
// is below 4, or when the count is 4, 5 or 6 and seed (after one GPURAND step)
// is below d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_RED_SLICED(a,an)         \
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jpx, i),\
        a[i]^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jpy, i),\
        a[i]^tex1Dfetch(an, i)^tex1Dfetch(texture_Jpz, i));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jmx, i),\
        a[i]^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jmy, i),\
        a[i]^tex1Dfetch(an, smz)^tex1Dfetch(texture_Jmz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i]^=flip;\
        }

// UPDATE_RED_SLICED_BND(a,an,anb): variant of UPDATE_RED_SLICED that takes the
// spy, smx and smz neighbours from the second texture `anb`; the spx, smy and
// index-i neighbours still come from `an`. All couplings are read from
// texture_Jp*/texture_Jm* at index i.
// Expects i, spx, spy, smx, smy, smz and seed to be in scope where the macro is
// expanded. A bit of a[i] is flipped when its 3-bit count sigma2:sigma1:sigma0
// is below 4, or when the count is 4, 5 or 6 and seed (after one GPURAND step)
// is below d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_RED_SLICED_BND(a,an,anb)         \
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jpx, i),\
        a[i]^tex1Dfetch(anb, spy)^tex1Dfetch(texture_Jpy, i),\
        a[i]^tex1Dfetch(an, i)^tex1Dfetch(texture_Jpz, i));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(anb, smx)^tex1Dfetch(texture_Jmx, i),\
        a[i]^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jmy, i),\
        a[i]^tex1Dfetch(anb, smz)^tex1Dfetch(texture_Jmz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i]^=flip;\
        }

// UPDATE_RED_SLICED_BND_CP(a,b,an,anb): same neighbour/coupling pattern as
// UPDATE_RED_SLICED_BND (spy, smx, smz from `anb`; spx, smy and index i from
// `an`), but the updated word is written both to a[i] and to b[j].
// Expects i, j, spx, spy, smx, smy, smz and seed to be in scope where the macro
// is expanded. A bit is flipped when its 3-bit count sigma2:sigma1:sigma0 is
// below 4, or when the count is 4, 5 or 6 and seed (after one GPURAND step) is
// below d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_RED_SLICED_BND_CP(a,b,an,anb)	\
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jpx, i),\
        a[i]^tex1Dfetch(anb, spy)^tex1Dfetch(texture_Jpy, i),\
        a[i]^tex1Dfetch(an, i)^tex1Dfetch(texture_Jpz, i));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(anb, smx)^tex1Dfetch(texture_Jmx, i),\
        a[i]^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jmy, i),\
        a[i]^tex1Dfetch(anb, smz)^tex1Dfetch(texture_Jmz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        b[j] = a[i] = a[i]^flip;\
        }


// UPDATE_BLUE2(a,an): updates the word a[i], re-reading a[i] for every term.
// Expects i, spx, spy, spz, smx, smy, smz and seed to be in scope where the
// macro is expanded. Six words are built as a[i] ^ neighbour ^ coupling: the
// neighbours are fetched from texture `an` at sp*/sm*; the couplings are read
// at the NEIGHBOUR index, texture_Jm* at sp* and texture_Jp* at sm*.
// SUM_111_3 and SUM_33_6 add them bit by bit into the 3-bit count
// sigma2:sigma1:sigma0. After one GPURAND(seed) step a bit is flipped when its
// count is below 4, or when the count is 4, 5 or 6 and seed is below
// d_betaExp4, d_betaExp8 or d_betaExp12 respectively. The same seed value is
// used for every bit of the word.
#define UPDATE_BLUE2(a,an) \
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jmx, spx),\
        a[i]^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jmy, spy),\
        a[i]^tex1Dfetch(an, spz)^tex1Dfetch(texture_Jmz, spz));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jpx, smx),\
        a[i]^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jpy, smy),\
        a[i]^tex1Dfetch(an, smz)^tex1Dfetch(texture_Jpz, smz));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i]^=flip;\
        }

// UPDATE_BLUE(a,an): same computation as UPDATE_BLUE2, but a[i] is read once
// into the volatile local `spin`, which is used in all six terms, and the
// result is stored back with a[i] = spin^flip.
// Expects i, spx, spy, spz, smx, smy, smz and seed to be in scope where the
// macro is expanded. Neighbours come from texture `an` at sp*/sm*; couplings
// are read at the neighbour index (texture_Jm* at sp*, texture_Jp* at sm*).
// A bit is flipped when its 3-bit count sigma2:sigma1:sigma0 is below 4, or
// when the count is 4, 5 or 6 and seed (after one GPURAND step) is below
// d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_BLUE(a,an) \
        {volatile MSC spin = a[i];\
        SUM_111_3(j0,j1,\
        spin^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jmx, spx),\
        spin^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jmy, spy),\
        spin^tex1Dfetch(an, spz)^tex1Dfetch(texture_Jmz, spz));\
        SUM_111_3(j2,j3,\
        spin^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jpx, smx),\
        spin^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jpy, smy),\
        spin^tex1Dfetch(an, smz)^tex1Dfetch(texture_Jpz, smz));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i] = spin^flip;\
        }

// UPDATE_BLUE_SLICED(a,an): variant of UPDATE_BLUE2 in which the third term of
// the second group fetches the neighbour from `an` at index i and the coupling
// from texture_Jpz at index i, instead of both at smz; smz is therefore not
// used. The other couplings are read at the neighbour index (texture_Jm* at
// sp*, texture_Jp* at smx/smy).
// Expects i, spx, spy, spz, smx, smy and seed to be in scope where the macro is
// expanded. A bit of a[i] is flipped when its 3-bit count sigma2:sigma1:sigma0
// is below 4, or when the count is 4, 5 or 6 and seed (after one GPURAND step)
// is below d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_BLUE_SLICED(a,an) \
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(an, spx)^tex1Dfetch(texture_Jmx, spx),\
        a[i]^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jmy, spy),\
        a[i]^tex1Dfetch(an, spz)^tex1Dfetch(texture_Jmz, spz));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jpx, smx),\
        a[i]^tex1Dfetch(an, smy)^tex1Dfetch(texture_Jpy, smy),\
        a[i]^tex1Dfetch(an, i)^tex1Dfetch(texture_Jpz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i]^=flip;\
        }

// UPDATE_BLUE_SLICED_BND(a,an,anb): variant of UPDATE_BLUE_SLICED that takes
// the spx, spz and smy neighbours from the second texture `anb`, paired with
// the couplings texture_JmxB, texture_JmzB and texture_JpyB read at the same
// index. The spy, smx and index-i neighbours still come from `an`, with
// texture_Jmy (at spy), texture_Jpx (at smx) and texture_Jpz (at i).
// Expects i, spx, spy, spz, smx, smy and seed to be in scope where the macro is
// expanded. A bit of a[i] is flipped when its 3-bit count sigma2:sigma1:sigma0
// is below 4, or when the count is 4, 5 or 6 and seed (after one GPURAND step)
// is below d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_BLUE_SLICED_BND(a,an,anb)                \
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(anb, spx)^tex1Dfetch(texture_JmxB, spx),\
        a[i]^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jmy, spy),\
        a[i]^tex1Dfetch(anb, spz)^tex1Dfetch(texture_JmzB, spz));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jpx, smx),\
        a[i]^tex1Dfetch(anb, smy)^tex1Dfetch(texture_JpyB, smy),\
        a[i]^tex1Dfetch(an, i)^tex1Dfetch(texture_Jpz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        a[i]^=flip;\
        }

// UPDATE_BLUE_SLICED_BND_CP(a,b,an,anb): same neighbour/coupling pattern as
// UPDATE_BLUE_SLICED_BND (spx, spz, smy from `anb` with texture_JmxB,
// texture_JmzB, texture_JpyB; spy, smx and index i from `an`), but the updated
// word is written both to a[i] and to b[j].
// Expects i, j, spx, spy, spz, smx, smy and seed to be in scope where the macro
// is expanded. A bit is flipped when its 3-bit count sigma2:sigma1:sigma0 is
// below 4, or when the count is 4, 5 or 6 and seed (after one GPURAND step) is
// below d_betaExp4, d_betaExp8 or d_betaExp12 respectively.
#define UPDATE_BLUE_SLICED_BND_CP(a,b,an,anb)	\
        {SUM_111_3(j0,j1,\
        a[i]^tex1Dfetch(anb, spx)^tex1Dfetch(texture_JmxB, spx),\
        a[i]^tex1Dfetch(an, spy)^tex1Dfetch(texture_Jmy, spy),\
        a[i]^tex1Dfetch(anb, spz)^tex1Dfetch(texture_JmzB, spz));\
        SUM_111_3(j2,j3,\
        a[i]^tex1Dfetch(an, smx)^tex1Dfetch(texture_Jpx, smx),\
        a[i]^tex1Dfetch(anb, smy)^tex1Dfetch(texture_JpyB, smy),\
        a[i]^tex1Dfetch(an, i)^tex1Dfetch(texture_Jpz, i));\
        SUM_33_6(j0,j1,j2,j3,sigma0,sigma1,sigma2);\
        GPURAND(seed);\
        int condition4 =-(seed<d_betaExp4);\
        int condition8 =-(seed<d_betaExp8);\
        int condition12=-(seed<d_betaExp12);\
        MSC flip = (((condition12 & sigma2) & sigma1) |                 \
                    ((condition8 & sigma2) & sigma0) |                  \
                    (((condition4 & sigma2) & (~sigma1)) & (~sigma0)) | \
                    (~sigma2));                                         \
        b[j] = a[i] = a[i]^flip;\
        }

/* Bit-wise check for the coordinate transformation */
// Colour tags passed as `col` to myK2Vec and my2RealIndex.
#define RED 0
#define BLU 1

// Integer coordinates of one site, filled in by myK2Vec, my2Real and realVec2K.
typedef struct{
  int x;   // first coordinate
  int y;   // second coordinate
  int z;   // third coordinate
  int zz;  // 2*z or 2*z+1 as set by myK2Vec; copy of z as set by my2Real
  int k;   // linear index
}posType;

// Forward declarations of the host-side index-mapping helpers defined below.
static void myK2Vec(posType *pos, int k, int L, int col);
static void my2Real(posType *my, posType *real, int L);
static void realVec2K(posType *pos, int L);
static int my2RealIndex(int i, int L, int col);
static void mapsDefine(int *realI_R, int *realI_B, int *myI_R, int *myI_B, int *myI_B2R, int *myI_R2B, int hV, int L);
// Routines with C linkage that take a struct mpiexch*; they are not defined in
// this part of the file (presumably the MPI exchange step for each colour --
// TODO confirm against mpiexch.h).
extern "C" void BlueMpi(struct mpiexch*);
extern "C" void RedMpi(struct mpiexch*);

// Decomposes the linear index k on an L x L x (...) grid into pos->x, pos->y
// and pos->z, stores k itself in pos->k, and sets pos->zz to 2*z for RED or
// 2*z+1 for any other colour.
static void myK2Vec(posType *pos, int k, int L, int col){
  const int plane = L*L;
  const int layer = k/plane;

  pos->k  = k;
  pos->x  = k%L;
  pos->y  = (k/L)%L;
  pos->z  = layer;
  pos->zz = 2*layer + ((col == RED) ? 0 : 1);
}

// Maps the coordinates in *my to *real: x is copied, y becomes (x + y) mod L,
// and both z and zz become (y + zz) mod L. real->k is left untouched.
static void my2Real(posType *my, posType *real, int L){
  real->x = my->x;
  real->y = (my->x + my->y)%L;

  const int wrappedZ = (my->y + my->zz)%L;
  real->z  = wrappedZ;
  real->zz = wrappedZ;
}

// Stores in pos->k half (integer division) of the linear index
// x + y*L + z*L*L built from the coordinates in *pos.
static void realVec2K(posType *pos, int L){
  const int plane  = L*L;
  const int linear = pos->x + pos->y*L + pos->z*plane;
  pos->k = linear/2;
}

// Converts index i of colour col into the index produced by chaining
// myK2Vec -> my2Real -> realVec2K, and returns it.
static int my2RealIndex(int i, int L, int col){
  posType source;
  posType target;

  myK2Vec(&source, i, L, col);   // split i into coordinates
  my2Real(&source, &target, L);  // transform the coordinates
  realVec2K(&target, L);         // collapse them back to an index

  return target.k;
}

// Fills the six index maps, each addressed with indices in [0, hV):
//   realI_R / realI_B : my2RealIndex of every site for RED / BLU;
//   myI_R / myI_B     : the inverse tables of realI_R / realI_B;
//   myI_B2R / myI_R2B : tables linking myI_B[i] to myI_R[i] and back.
static void mapsDefine(int *realI_R, int *realI_B, int *myI_R, int *myI_B, int *myI_B2R, int *myI_R2B, int hV, int L){
  int site;

  // Forward maps.
  for(site = 0; site < hV; ++site){
    realI_R[site] = my2RealIndex(site, L, RED);
    realI_B[site] = my2RealIndex(site, L, BLU);
  }

  // Inverse maps.
  for(site = 0; site < hV; ++site){
    const int redSlot = realI_R[site];
    myI_R[redSlot] = site;
    const int blueSlot = realI_B[site];
    myI_B[blueSlot] = site;
  }

  // Cross-colour maps.
  for(site = 0; site < hV; ++site){
    const int blueKey = myI_B[site];
    myI_B2R[blueKey] = myI_R[site];
    const int redKey = myI_R[site];
    myI_R2B[redKey] = myI_B[site];
  }
}

// Permutes `array` in place: element i is moved to position map[i].
// Positions that no map entry points to end up as 0.
//   array : `size` MSC words, overwritten with the permuted content
//   map   : `size` destination indices, each expected to lie in [0, size)
//   size  : number of elements; nothing is done when size <= 0
// The scratch buffer is checked after allocation; if it cannot be obtained the
// function reports the problem on stderr and terminates the program, because it
// has no way to return an error to the caller.
static void arrayTransformMSC(MSC *array, int *map, int size){
  if(size <= 0) return;  // also avoids a negative size becoming a huge request

  // calloc supplies the zero fill for positions that are never targeted.
  MSC *swap = (MSC *)calloc((size_t)size, sizeof(MSC));
  if(swap == NULL){
    fprintf(stderr, "\narrayTransformMSC: cannot allocate %d elements\n", size);
    exit(EXIT_FAILURE);
  }

  for(int i=0; i<size; i++) swap[map[i]] = array[i];
  for(int i=0; i<size; i++) array[i] = swap[i];
  free(swap);
  return;
}

// Permutes `array` in place: element i is moved to position map[i].
// Positions that no map entry points to end up as 0.
//   array : `size` RNGT values, overwritten with the permuted content
//   map   : `size` destination indices, each expected to lie in [0, size)
//   size  : number of elements; nothing is done when size <= 0
// The scratch buffer is checked after allocation; if it cannot be obtained the
// function reports the problem on stderr and terminates the program, because it
// has no way to return an error to the caller.
static void arrayTransformRNGT(RNGT *array, int *map, int size){
  if(size <= 0) return;  // also avoids a negative size becoming a huge request

  // calloc supplies the zero fill for positions that are never targeted.
  RNGT *swap = (RNGT *)calloc((size_t)size, sizeof(RNGT));
  if(swap == NULL){
    fprintf(stderr, "\narrayTransformRNGT: cannot allocate %d elements\n", size);
    exit(EXIT_FAILURE);
  }

  for(int i=0; i<size; i++) swap[map[i]] = array[i];
  for(int i=0; i<size; i++) array[i] = swap[i];
  free(swap);
  return;
}

/* end of the coordinate-transformation check */

// Device constants. The d_sdb*/d_mask*/d_Ldb2/d_Lsdb2 group is filled by
// masksInit from the host variables hh_* of the same name.
__device__ __constant__ MSC d_sdbLs;
__device__ __constant__ MSC d_sdbL;
__device__ __constant__ MSC d_sdbLm1;
__device__ __constant__ MSC d_sdbLsm1;
__device__ __constant__ MSC d_maskL;
__device__ __constant__ MSC d_Ldb2;
__device__ __constant__ MSC d_Lsdb2;
__device__ __constant__ MSC d_maskL2;
__device__ __constant__ MSC d_maskL3;

// Acceptance thresholds compared against `seed` in the UPDATE_* macros;
// filled by MCWeightsInit from h_betaExp4/8/12.
__device__ __constant__ RNGT d_betaExp4;
__device__ __constant__ RNGT d_betaExp8;
__device__ __constant__ RNGT d_betaExp12;

// Copies of the sys-> fields of the same name (replicas, sysN, hV, A, L, hL,
// hA), filled by masksInit.
__device__ __constant__ int d_replicas;
__device__ __constant__ int d_sysN;
__device__ __constant__ int d_hV;
__device__ __constant__ int d_A;
__device__ __constant__ int d_L;
__device__ __constant__ int d_hL;
__device__ __constant__ int d_hA;

// The constants below are not written in this part of the file
// (presumably launch/stride parameters set elsewhere -- TODO confirm).
__device__ __constant__ int d_threads;
__device__ __constant__ int d_iraLen;

__device__ __constant__ int d_block_xStride;
__device__ __constant__ int d_block_yStride;
__device__ __constant__ int d_block_xStrideS;
__device__ __constant__ int d_block_yStrideS;
__device__ __constant__ int d_loopStride;
__device__ __constant__ int d_loopStrideBlk;
__device__ __constant__ int d_loopStrideBnd;
__device__ __constant__ int d_rngStrideBnd;
__device__ __constant__ int d_rngStrideBlk;
__device__ __constant__ int d_rngOffset;

// Host-side values computed in masksInit from L = sys->L and V = sys->V and
// then copied to the d_* constants of the same name.
MSC hh_sdbLs;    // findLog2(L*L)
MSC hh_sdbL;     // findLog2(L)
MSC hh_sdbLm1;   // hh_sdbL - 1
MSC hh_sdbLsm1;  // hh_sdbLs - 1
MSC hh_maskL;    // (L>>1) - 1
MSC hh_Ldb2;     // L>>1
MSC hh_Lsdb2;    // (L*L)>>1
MSC hh_maskL2;   // ((L*L)>>1) - 1
MSC hh_maskL3;   // (V>>1) - 1

// Host-side acceptance thresholds computed in MCWeightsInit as
// MYMAXRAND*exp(-4/beta), MYMAXRAND*exp(-8/beta) and MYMAXRAND*exp(-12/beta),
// with beta = sys->beta.
RNGT h_betaExp4;
RNGT h_betaExp8;
RNGT h_betaExp12;

// Coupling textures; dMemInit binds them to sys->d_Jpx ... sys->d_Jmz.
texture<MSC> texture_Jpx;
texture<MSC> texture_Jpy;
texture<MSC> texture_Jpz;
texture<MSC> texture_Jmx;
texture<MSC> texture_Jmy;
texture<MSC> texture_Jmz;

// blue boundary buffers texture references
// (dMemInit binds them to sys->d_JmzBR, sys->d_JpyBR and sys->d_JmxBR)
texture<MSC> texture_JmzB;
texture<MSC> texture_JpyB;
texture<MSC> texture_JmxB;

// Spin textures; dMemInit binds texture_Rn to sys->d_redsn and texture_Bn to
// sys->d_bluesn for n = 0..3.
texture<MSC> texture_R0;
texture<MSC> texture_B0;
texture<MSC> texture_R1;
texture<MSC> texture_B1;
texture<MSC> texture_R2;
texture<MSC> texture_B2;
texture<MSC> texture_R3;
texture<MSC> texture_B3;

// boundaries texture references
// (dMemInit binds texture_RnB to sys->d_redsnBR and texture_BnB to
// sys->d_bluesnBR for n = 0..3)

texture<MSC> texture_R0B;
texture<MSC> texture_B0B;
texture<MSC> texture_R1B;
texture<MSC> texture_B1B;
texture<MSC> texture_R2B;
texture<MSC> texture_B2B;
texture<MSC> texture_R3B;
texture<MSC> texture_B3B;


// File-scope RNG state; it is not referenced in this part of the file
// (presumably used by initSeed31M1 / flatMSCbitDist31M1 -- TODO confirm).
RNGT globalSeed;

// Forward declarations: setup of constants and of device/host memory.
static int masksInit(sysEA3D_t *sys);
static int MCWeightsInit(sysEA3D_t *sys);
static int dMemInit(sysEA3D_t *sys);
static int hMemInit(sysEA3D_t *sys);

// Forward declarations: transfers (definitions are not in this part of the file).
static int H2DSpins(sysEA3D_t *sys);
static int H2DJ(sysEA3D_t *sys);
static int H2DSysCopy(sysEA3D_t *sys);

static int D2HSpins(sysEA3D_t *sys);

static void usage(char **argv);

// Forward declarations: random numbers and initial configuration
// (definitions are not in this part of the file).
static RNGT initSeed31M1(void);
static MSC flatMSCbitDist31M1(void);
static int JInit(sysEA3D_t *sys);
static int SpinsInit(sysEA3D_t *sys);

// Computes the hh_* shift/mask values from sys->L and sys->V, then uploads them
// and the geometry fields of *sys to the matching d_* device constants.
// Returns 0 on success; MYCUDA_ERROR returns 1 from here on the first failed copy.
static int masksInit(sysEA3D_t *sys){
  const int side     = sys->L;
  const int volume   = sys->V;
  const int area     = side*side;
  const int halfSide = side>>1;
  const int halfArea = area>>1;
  const int halfVol  = volume>>1;

  // Shifts.
  hh_sdbLs   = findLog2(area);
  hh_sdbL    = findLog2(side);
  hh_sdbLm1  = hh_sdbL-1;
  hh_sdbLsm1 = hh_sdbLs-1;

  // Halved extents and the corresponding masks.
  hh_maskL  = halfSide-1;
  hh_Ldb2   = halfSide;
  hh_Lsdb2  = halfArea;
  hh_maskL2 = halfArea-1;
  hh_maskL3 = halfVol-1;

  // Upload the shift/mask words.
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_sdbLs,   &hh_sdbLs,   sizeof(hh_sdbLs)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_sdbL,    &hh_sdbL,    sizeof(hh_sdbL)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_sdbLm1,  &hh_sdbLm1,  sizeof(hh_sdbLm1)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_sdbLsm1, &hh_sdbLsm1, sizeof(hh_sdbLsm1)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_maskL,   &hh_maskL,   sizeof(hh_maskL)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_Ldb2,    &hh_Ldb2,    sizeof(hh_Ldb2)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_Lsdb2,   &hh_Lsdb2,   sizeof(hh_Lsdb2)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_maskL2,  &hh_maskL2,  sizeof(hh_maskL2)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_maskL3,  &hh_maskL3,  sizeof(hh_maskL3)) );

  // Upload the integer geometry fields.
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_replicas, &sys->replicas, sizeof(int)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_sysN,     &sys->sysN,     sizeof(int)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_hV,       &sys->hV,       sizeof(int)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_A,        &sys->A,        sizeof(int)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_L,        &sys->L,        sizeof(int)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_hL,       &sys->hL,       sizeof(int)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_hA,       &sys->hA,       sizeof(int)) );

  return 0;
}

// Computes the three acceptance thresholds MYMAXRAND*exp(-n/sys->beta) for
// n = 4, 8, 12, stores them in h_betaExp4/8/12 and uploads them to the
// d_betaExp4/8/12 device constants.
// Returns 0 on success; MYCUDA_ERROR returns 1 from here on the first failed copy.
static int MCWeightsInit(sysEA3D_t *sys){
  const double scale = (double)MYMAXRAND;

  h_betaExp4  = (RNGT)(scale*exp(-4./sys->beta));
  h_betaExp8  = (RNGT)(scale*exp(-8./sys->beta));
  h_betaExp12 = (RNGT)(scale*exp(-12./sys->beta));

  MYCUDA_ERROR( cudaMemcpyToSymbol(d_betaExp4,  &h_betaExp4,  sizeof(h_betaExp4)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_betaExp8,  &h_betaExp8,  sizeof(h_betaExp8)) );
  MYCUDA_ERROR( cudaMemcpyToSymbol(d_betaExp12, &h_betaExp12, sizeof(h_betaExp12)) );

  return 0;
}

// Allocates every device buffer referenced through *sys and binds the texture
// references of this file to them:
//   - couplings d_Jp*/d_Jm* (NJBlk words each), bound to texture_Jp*/texture_Jm*;
//   - spins d_reds0..3 / d_blues0..3 (NSBlk words each), bound to
//     texture_R0..3 / texture_B0..3;
//   - boundary couplings d_J*BS / d_J*BR (NJBnd words each); only the *BR
//     buffers are bound (texture_JmzB, texture_JpyB, texture_JmxB);
//   - boundary spins d_*BS / d_*BR (NSBnd words each); only the *BR buffers are
//     bound (texture_R0B..3B / texture_B0B..3B).
// Returns 0 on success; MYCUDA_ERROR returns 1 from here on the first failed
// call, leaving earlier allocations in place.
static int dMemInit(sysEA3D_t *sys){
  // Byte sizes of the four buffer classes.
  const size_t bulkJBytes = sys->NJBlk*sizeof(MSC);
  const size_t bulkSBytes = sys->NSBlk*sizeof(MSC);
  const size_t bndJBytes  = sys->NJBnd*sizeof(MSC);
  const size_t bndSBytes  = sys->NSBnd*sizeof(MSC);

  // Bulk couplings.
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_Jpx, bulkJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_Jpy, bulkJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_Jpz, bulkJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_Jmx, bulkJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_Jmy, bulkJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_Jmz, bulkJBytes) );

  MYCUDA_ERROR( cudaBindTexture(NULL, texture_Jpx, sys->d_Jpx, bulkJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_Jpy, sys->d_Jpy, bulkJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_Jpz, sys->d_Jpz, bulkJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_Jmx, sys->d_Jmx, bulkJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_Jmy, sys->d_Jmy, bulkJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_Jmz, sys->d_Jmz, bulkJBytes) );

  // Bulk spins.
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds0, bulkSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds1, bulkSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds2, bulkSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds3, bulkSBytes) );

  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues0, bulkSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues1, bulkSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues2, bulkSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues3, bulkSBytes) );

  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R0, sys->d_reds0, bulkSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R1, sys->d_reds1, bulkSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R2, sys->d_reds2, bulkSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R3, sys->d_reds3, bulkSBytes) );

  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B0, sys->d_blues0, bulkSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B1, sys->d_blues1, bulkSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B2, sys->d_blues2, bulkSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B3, sys->d_blues3, bulkSBytes) );

  // Sliced boundaries: couplings.
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_JmzBS, bndJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_JpyBS, bndJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_JmxBS, bndJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_JmzBR, bndJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_JpyBR, bndJBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_JmxBR, bndJBytes) );

  MYCUDA_ERROR( cudaBindTexture(NULL, texture_JmzB, sys->d_JmzBR, bndJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_JpyB, sys->d_JpyBR, bndJBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_JmxB, sys->d_JmxBR, bndJBytes) );

  // Sliced boundaries: spins.
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds0BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds1BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds2BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds3BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds0BR, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds1BR, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds2BR, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_reds3BR, bndSBytes) );

  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues0BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues1BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues2BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues3BS, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues0BR, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues1BR, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues2BR, bndSBytes) );
  MYCUDA_ERROR( cudaMalloc((void **) &sys->d_blues3BR, bndSBytes) );

  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R0B, sys->d_reds0BR, bndSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R1B, sys->d_reds1BR, bndSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R2B, sys->d_reds2BR, bndSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_R3B, sys->d_reds3BR, bndSBytes) );

  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B0B, sys->d_blues0BR, bndSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B1B, sys->d_blues1BR, bndSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B2B, sys->d_blues2BR, bndSBytes) );
  MYCUDA_ERROR( cudaBindTexture(NULL, texture_B3B, sys->d_blues3BR, bndSBytes) );

  return 0;
}

// Reports one failed CUDA call made by dMemFree, using the same message layout
// as MYCUDA_ERROR. `line` is the source line of the call.
// Returns 1 when error_id signals a failure, 0 otherwise.
static int dMemFreeCheck(cudaError_t error_id, int line){
  if(error_id == cudaSuccess) return 0;
  fprintf(stderr, "\nError %s at line %d of %s\n",
          cudaGetErrorString(error_id), line, __FILE__);
  return 1;
}

// Releases every device buffer allocated by dMemInit and unbinds every texture
// reference bound there.
// Unlike a chain of MYCUDA_ERROR calls, a failure does not stop the teardown:
// each failing call is reported on stderr and the remaining resources are still
// released, so one bad call cannot leak everything that follows it.
// Returns 0 when every call succeeded, 1 when at least one call failed.
static int dMemFree(sysEA3D_t *sys){
  int failed = 0;

  // Bulk couplings.
  failed |= dMemFreeCheck( cudaFree(sys->d_Jpx), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_Jpy), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_Jpz), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_Jmx), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_Jmy), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_Jmz), __LINE__ );

  // Boundary couplings (send and receive buffers).
  failed |= dMemFreeCheck( cudaFree(sys->d_JmxBS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_JpyBS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_JmzBS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_JmxBR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_JpyBR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_JmzBR), __LINE__ );

  // Coupling textures.
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_Jpx), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_Jpy), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_Jpz), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_Jmx), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_Jmy), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_Jmz), __LINE__ );

  failed |= dMemFreeCheck( cudaUnbindTexture(texture_JmxB), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_JpyB), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_JmzB), __LINE__ );

  // Bulk spins.
  failed |= dMemFreeCheck( cudaFree(sys->d_reds0), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds1), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds2), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds3), __LINE__ );

  failed |= dMemFreeCheck( cudaFree(sys->d_blues0), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues1), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues2), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues3), __LINE__ );

  // Boundary spins, send buffers.
  failed |= dMemFreeCheck( cudaFree(sys->d_reds0BS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds1BS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds2BS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds3BS), __LINE__ );

  failed |= dMemFreeCheck( cudaFree(sys->d_blues0BS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues1BS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues2BS), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues3BS), __LINE__ );

  // Boundary spins, receive buffers.
  failed |= dMemFreeCheck( cudaFree(sys->d_reds0BR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds1BR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds2BR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_reds3BR), __LINE__ );

  failed |= dMemFreeCheck( cudaFree(sys->d_blues0BR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues1BR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues2BR), __LINE__ );
  failed |= dMemFreeCheck( cudaFree(sys->d_blues3BR), __LINE__ );

  // Spin textures.
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R0), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R1), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R2), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R3), __LINE__ );

  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B0), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B1), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B2), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B3), __LINE__ );

  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R0B), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R1B), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R2B), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_R3B), __LINE__ );

  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B0B), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B1B), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B2B), __LINE__ );
  failed |= dMemFreeCheck( cudaUnbindTexture(texture_B3B), __LINE__ );

  return failed;
}


// Reports one failed CUDA call made by hMemFree, using the same message layout
// as MYCUDA_ERROR. `line` is the source line of the call.
// Returns 1 when error_id signals a failure, 0 otherwise.
static int hMemFreeCheck(cudaError_t error_id, int line){
  if(error_id == cudaSuccess) return 0;
  fprintf(stderr, "\nError %s at line %d of %s\n",
          cudaGetErrorString(error_id), line, __FILE__);
  return 1;
}

// Releases with cudaFreeHost every host buffer referenced through the h_*
// fields of *sys.
// Unlike a chain of MYCUDA_ERROR calls, a failure does not stop the teardown:
// each failing call is reported on stderr and the remaining buffers are still
// released, so one bad call cannot leak everything that follows it.
// Returns 0 when every call succeeded, 1 when at least one call failed.
static int hMemFree(sysEA3D_t *sys){
  int failed = 0;

  // Bulk couplings.
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_Jpx), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_Jpy), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_Jpz), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_Jmx), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_Jmy), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_Jmz), __LINE__ );

  // Boundary couplings (send and receive buffers).
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_JmxBS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_JpyBS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_JmzBS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_JmxBR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_JpyBR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_JmzBR), __LINE__ );

  // Bulk spins.
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds0), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds1), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds2), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds3), __LINE__ );

  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues0), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues1), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues2), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues3), __LINE__ );

  // Boundary spins, send buffers.
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds0BS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds1BS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds2BS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds3BS), __LINE__ );

  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues0BS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues1BS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues2BS), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues3BS), __LINE__ );

  // Boundary spins, receive buffers.
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds0BR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds1BR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds2BR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_reds3BR), __LINE__ );

  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues0BR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues1BR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues2BR), __LINE__ );
  failed |= hMemFreeCheck( cudaFreeHost(sys->h_blues3BR), __LINE__ );

  return failed;
}

// qsort comparator: ascending order of unsigned long long int values.
static int hMemInitCmpTimes(const void *lhs, const void *rhs){
  const unsigned long long int a = *(const unsigned long long int *)lhs;
  const unsigned long long int b = *(const unsigned long long int *)rhs;
  return (a > b) - (a < b);
}

// Allocates all host buffers of sys with cudaHostAlloc and builds the table
// of dump times sys->dumpTimes (length sys->dumpN, allocated with malloc).
//
// Buffer sizes, in MSC words: couplings NJBlk, spins NSBlk, coupling
// boundaries NJBnd, spin boundaries NSBnd.
//
// Dump times, with tMax = 2^steps:
//  - recoverF == 0: the distinct values of rint(2^(i/4)), i = 0..4*steps,
//    together with every pairwise sum of two such values that does not
//    exceed tMax; the table is sorted ascending and duplicates are removed.
//  - recoverF == 1 or 2: startValue + rint(2^(i/4)) for the distinct values
//    that do not exceed tMax, in increasing order.
//
// Returns 0 on success, 1 on any CUDA or malloc failure (buffers allocated
// before the failure are not released here).
static int hMemInit(sysEA3D_t *sys){
  // One invocation per line keeps the __LINE__ printed by MYCUDA_ERROR
  // specific to the buffer that failed.
#define HOST_ALLOC_MSC(field, count) \
  MYCUDA_ERROR( cudaHostAlloc((void **) &sys->field, (count)*sizeof(MSC), cudaHostAllocDefault) )

  HOST_ALLOC_MSC(h_Jpx, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jpy, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jpz, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jmx, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jmy, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jmz, sys->NJBlk);

  HOST_ALLOC_MSC(h_reds0, sys->NSBlk);
  HOST_ALLOC_MSC(h_reds1, sys->NSBlk);
  HOST_ALLOC_MSC(h_reds2, sys->NSBlk);
  HOST_ALLOC_MSC(h_reds3, sys->NSBlk);

  HOST_ALLOC_MSC(h_blues0, sys->NSBlk);
  HOST_ALLOC_MSC(h_blues1, sys->NSBlk);
  HOST_ALLOC_MSC(h_blues2, sys->NSBlk);
  HOST_ALLOC_MSC(h_blues3, sys->NSBlk);

  // SLICED BOUNDARIES

  HOST_ALLOC_MSC(h_JmzBR, sys->NJBnd);
  HOST_ALLOC_MSC(h_JpyBR, sys->NJBnd);
  HOST_ALLOC_MSC(h_JmxBR, sys->NJBnd);

  HOST_ALLOC_MSC(h_JmzBS, sys->NJBnd);
  HOST_ALLOC_MSC(h_JpyBS, sys->NJBnd);
  HOST_ALLOC_MSC(h_JmxBS, sys->NJBnd);

  HOST_ALLOC_MSC(h_reds0BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds1BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds2BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds3BR, sys->NSBnd);

  HOST_ALLOC_MSC(h_reds0BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds1BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds2BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds3BS, sys->NSBnd);

  HOST_ALLOC_MSC(h_blues0BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues1BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues2BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues3BR, sys->NSBnd);

  HOST_ALLOC_MSC(h_blues0BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues1BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues2BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues3BS, sys->NSBnd);

#undef HOST_ALLOC_MSC

  unsigned long long int newValue = 0, oldValue = 0;
  int steps = sys->steps;
  unsigned long long int tMax = (unsigned long long int)rint(pow(2.,steps));

  if(sys->recoverF == 0){
    // Count the distinct rounded powers 2^(i/4) to size the table.
    int counter = 0;

    for(int i=0; i<=4*steps; i++){
      newValue = (unsigned long long int)rint(pow(2.,i/4.));
      if(newValue > oldValue){
        counter++;
        oldValue = newValue;
      }
    }

    // Upper bound: every pairwise sum plus the distinct single values.
    int dumpN = (4*steps + 1)*(4*steps + 1) + counter;

    sys->dumpTimes = (unsigned long long int *)malloc(dumpN*sizeof(unsigned long long int));
    if(sys->dumpTimes == NULL){
      fprintf(stderr,"\nNo Alloc dumpTimes hMemInit\n");
      return 1;
    }

    // Distinct single values; slot 0 is preset to 1 (= rint(2^0)).
    int k = 0;
    sys->dumpTimes[0] = 1;
    for(int i=0; i<=4*steps; i++){
      newValue = (unsigned long long int)rint(pow(2.,i/4.));
      if(newValue > sys->dumpTimes[k]){
        k++;
        sys->dumpTimes[k] = newValue;
      }
    }

    // k now becomes the number of entries stored so far.
    k++;

    // Pairwise sums that stay within tMax.
    for(int i=0; i<=4*steps; i++){
      for(int j=0; j<=4*steps; j++){
        unsigned long long int pairSum = (unsigned long long int)rint(pow(2.,i/4.)) + (unsigned long long int)rint(pow(2.,j/4.));
        if(pairSum <= tMax){
          sys->dumpTimes[k] = pairSum;
          k++;
        }
      }
    }

    // sorting (ascending); qsort replaces the former quadratic bubble sort
    qsort(sys->dumpTimes, (size_t)k, sizeof(unsigned long long int), hMemInitCmpTimes);

    // deleting replicas: compact the sorted table in place
    int newIndex = 0;
    for(int i=0; i<k; i++){
      if(sys->dumpTimes[newIndex] != sys->dumpTimes[i]){
        newIndex++;
        sys->dumpTimes[newIndex] = sys->dumpTimes[i];
      }
    }

    sys->dumpN = newIndex + 1;

  }

  if(sys->recoverF == 1 || sys->recoverF == 2){
    // Count the distinct rounded powers whose shifted value fits in tMax.
    int counter = 0;

    for(int i=0; i<=4*steps; i++){
      newValue = (unsigned long long int)rint(pow(2.,i/4.));
      if(newValue > oldValue && (newValue + sys->startValue) <= tMax){
        counter++;
        oldValue = newValue;
      }
    }

    int dumpN = counter;

    // Slot 0 is written unconditionally below, so reserve at least one
    // element even when no dump time fits (counter == 0); otherwise a
    // zero-sized allocation would be written out of bounds.
    size_t slots = (dumpN > 0) ? (size_t)dumpN : 1;

    sys->dumpTimes = (unsigned long long int *)malloc(slots*sizeof(unsigned long long int));
    if(sys->dumpTimes == NULL){
      fprintf(stderr,"\nNo Alloc dumpTimes hMemInit\n");
      return 1;
    }

    int k = 0;

    sys->dumpTimes[0] = sys->startValue + 1;
    for(int i=0; i<=4*steps; i++){
      newValue = (unsigned long long int)rint(pow(2.,i/4.));
      if((newValue + sys->startValue) > sys->dumpTimes[k] && (newValue + sys->startValue) <= tMax){
        k++;
        sys->dumpTimes[k] = newValue + sys->startValue;
      }
    }

    sys->dumpN = dumpN;
  }

  return 0;
}

// Allocates all host buffers of sys with cudaHostAlloc (cudaHostAllocDefault)
// without building any dump-time table.
// Buffer sizes, in MSC words: couplings NJBlk, spins NSBlk, coupling
// boundaries NJBnd, spin boundaries NSBnd.
// Returns 0 on success; on the first failing allocation MYCUDA_ERROR prints
// the CUDA error and returns 1.
static int hMemInit2(sysEA3D_t *sys){
  // One invocation per line keeps the __LINE__ printed by MYCUDA_ERROR
  // specific to the buffer that failed.
#define HOST_ALLOC_MSC(field, count) \
  MYCUDA_ERROR( cudaHostAlloc((void **) &sys->field, (count)*sizeof(MSC), cudaHostAllocDefault) )

  // bulk couplings
  HOST_ALLOC_MSC(h_Jpx, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jpy, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jpz, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jmx, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jmy, sys->NJBlk);
  HOST_ALLOC_MSC(h_Jmz, sys->NJBlk);

  // bulk spins
  HOST_ALLOC_MSC(h_reds0, sys->NSBlk);
  HOST_ALLOC_MSC(h_reds1, sys->NSBlk);
  HOST_ALLOC_MSC(h_reds2, sys->NSBlk);
  HOST_ALLOC_MSC(h_reds3, sys->NSBlk);

  HOST_ALLOC_MSC(h_blues0, sys->NSBlk);
  HOST_ALLOC_MSC(h_blues1, sys->NSBlk);
  HOST_ALLOC_MSC(h_blues2, sys->NSBlk);
  HOST_ALLOC_MSC(h_blues3, sys->NSBlk);

  // sliced boundaries: couplings (receive, then send)
  HOST_ALLOC_MSC(h_JmzBR, sys->NJBnd);
  HOST_ALLOC_MSC(h_JpyBR, sys->NJBnd);
  HOST_ALLOC_MSC(h_JmxBR, sys->NJBnd);

  HOST_ALLOC_MSC(h_JmzBS, sys->NJBnd);
  HOST_ALLOC_MSC(h_JpyBS, sys->NJBnd);
  HOST_ALLOC_MSC(h_JmxBS, sys->NJBnd);

  // sliced boundaries: red spins (receive, then send)
  HOST_ALLOC_MSC(h_reds0BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds1BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds2BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds3BR, sys->NSBnd);

  HOST_ALLOC_MSC(h_reds0BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds1BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds2BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_reds3BS, sys->NSBnd);

  // sliced boundaries: blue spins (receive, then send)
  HOST_ALLOC_MSC(h_blues0BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues1BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues2BR, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues3BR, sys->NSBnd);

  HOST_ALLOC_MSC(h_blues0BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues1BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues2BS, sys->NSBnd);
  HOST_ALLOC_MSC(h_blues3BS, sys->NSBnd);

#undef HOST_ALLOC_MSC
  return 0;
}

// Copies the eight bulk spin arrays (reds0..3, blues0..3), NSBlk MSC words
// each, from the host buffers h_* to the device buffers d_* with cudaMemcpy.
// Returns 0 on success, 1 (via MYCUDA_ERROR) on the first failing copy.
static int H2DSpins(sysEA3D_t *sys){
  const size_t nBytes = sys->NSBlk*sizeof(MSC);

#define SPINS_TO_DEVICE(name) \
  MYCUDA_ERROR( cudaMemcpy(sys->d_##name, sys->h_##name, nBytes, cudaMemcpyHostToDevice) )

  SPINS_TO_DEVICE(reds0);
  SPINS_TO_DEVICE(reds1);
  SPINS_TO_DEVICE(reds2);
  SPINS_TO_DEVICE(reds3);

  SPINS_TO_DEVICE(blues0);
  SPINS_TO_DEVICE(blues1);
  SPINS_TO_DEVICE(blues2);
  SPINS_TO_DEVICE(blues3);

#undef SPINS_TO_DEVICE
  return 0;
}

// Copies the six bulk coupling arrays (Jpx, Jpy, Jpz, Jmx, Jmy, Jmz), NJBlk
// MSC words each, from the host buffers h_* to the device buffers d_* with
// cudaMemcpy.
// Returns 0 on success, 1 (via MYCUDA_ERROR) on the first failing copy.
static int H2DJ(sysEA3D_t *sys){
  const size_t nBytes = sys->NJBlk*sizeof(MSC);

#define COUPLING_TO_DEVICE(name) \
  MYCUDA_ERROR( cudaMemcpy(sys->d_##name, sys->h_##name, nBytes, cudaMemcpyHostToDevice) )

  COUPLING_TO_DEVICE(Jpx);
  COUPLING_TO_DEVICE(Jpy);
  COUPLING_TO_DEVICE(Jpz);
  COUPLING_TO_DEVICE(Jmx);
  COUPLING_TO_DEVICE(Jmy);
  COUPLING_TO_DEVICE(Jmz);

#undef COUPLING_TO_DEVICE
  return 0;
}

// Uploads the whole system to the device: spins first (H2DSpins), then
// couplings (H2DJ). The couplings are not copied if the spin upload fails.
// Returns 0 when both succeed, 1 otherwise.
static int H2DSysCopy(sysEA3D_t *sys){
  // || short-circuits, so H2DJ runs only after a successful H2DSpins.
  return (H2DSpins(sys) || H2DJ(sys)) ? 1 : 0;
}

// Copies the eight bulk spin arrays (reds0..3, blues0..3), NSBlk MSC words
// each, from the device buffers d_* back to the host buffers h_* with
// cudaMemcpy.
// Returns 0 on success, 1 (via MYCUDA_ERROR) on the first failing copy.
static int D2HSpins(sysEA3D_t *sys){
  const size_t nBytes = sys->NSBlk*sizeof(MSC);

#define SPINS_TO_HOST(name) \
  MYCUDA_ERROR( cudaMemcpy(sys->h_##name, sys->d_##name, nBytes, cudaMemcpyDeviceToHost) )

  SPINS_TO_HOST(reds0);
  SPINS_TO_HOST(reds1);
  SPINS_TO_HOST(reds2);
  SPINS_TO_HOST(reds3);

  SPINS_TO_HOST(blues0);
  SPINS_TO_HOST(blues1);
  SPINS_TO_HOST(blues2);
  SPINS_TO_HOST(blues3);

#undef SPINS_TO_HOST
  return 0;
}

// Prints the command-line synopsis to stderr, using argv[0] as the program
// name.
static void usage(char **argv){
  const char *progName = argv[0];
  fprintf(stderr,
          "usage: %s <L> <T> <steps> <replicas> <sysN> <blockSize> <k> "
          "<kernelChoice> <gpu> <dump> <analysis> <ram> <recover> "
          "<recoverTime> <recoverMPIprocs>\n",
          progName);
}


// Draws a non-zero seed from /dev/urandom, masked with TWO31M1.
// Words whose masked value is zero are discarded and a new one is read.
// Returns the seed, or 0 on failure (the device cannot be opened, or reading
// keeps failing); callers treat 0 as an error.
static RNGT initSeed31M1(void){
  // Upper bound on failed reads, so a persistent I/O error cannot spin
  // forever while transient failures are still retried.
  enum { MAX_READ_FAILURES = 16 };

  RNGT seed = 0;
  int failures = 0;
  FILE *urandom = fopen("/dev/urandom", "rb");
  if(urandom == NULL){
    fprintf(stderr, "\nError opening /dev/urandom\n");
    return 0;
  }
  while(seed == 0){
    if(fread(&seed, sizeof(RNGT), 1, urandom) != 1){
      // Short read: discard whatever landed in seed.
      seed = 0;
      if(++failures >= MAX_READ_FAILURES){
        fprintf(stderr, "\nError reading /dev/urandom\n");
        break;
      }
      clearerr(urandom);
      continue;
    }
    seed = seed&TWO31M1;
  }
  fclose(urandom);
  return seed;
}

// Builds one MSC word out of MSCBITS random bits. For each bit the global
// generator state globalSeed is advanced with RANDTWO31M1BIS; the bit is 0
// when globalSeed/TWO31M1 (evaluated in float) exceeds 0.5 and 1 otherwise.
// Bits are shifted in from the least significant end, so the first bit
// drawn ends up most significant.
static MSC flatMSCbitDist31M1(void){
  MSC word = 0;
  int remaining = MSCBITS;

  while(remaining-- > 0){
    RANDTWO31M1BIS(globalSeed);
    const MSC bit = (((float)globalSeed/TWO31M1) > 0.5 ? 0 : 1);
    // The freshly shifted word has a zero low bit, which the XOR fills in.
    word = (word << 1) ^ bit;
  }
  return word;
}

// Fills the six host coupling arrays (NJBlk words each) with random MSC
// words from flatMSCbitDist31M1. The global generator is reseeded from
// initSeed31M1 whenever the index is a multiple of sys->hV.
// Returns 0 on success, 1 if a seed could not be obtained.
static int JInit(sysEA3D_t *sys){
  int site = 0;

  while(site < sys->NJBlk){
    const int reseed = ((site % (sys->hV)) == 0);
    if(reseed){
      globalSeed = initSeed31M1();
      if(globalSeed == 0) return 1;
    }

    // The draw order (px, py, pz, mx, my, mz) fixes the random stream.
    sys->h_Jpx[site] = flatMSCbitDist31M1();
    sys->h_Jpy[site] = flatMSCbitDist31M1();
    sys->h_Jpz[site] = flatMSCbitDist31M1();
    sys->h_Jmx[site] = flatMSCbitDist31M1();
    sys->h_Jmy[site] = flatMSCbitDist31M1();
    sys->h_Jmz[site] = flatMSCbitDist31M1();

    ++site;
  }

  return 0;
}

// Fills the eight host spin arrays (reds0..3, blues0..3; NSBlk words each)
// with random MSC words from flatMSCbitDist31M1. The global generator is
// reseeded from initSeed31M1 whenever the index is a multiple of sys->hV.
// Returns 0 on success, 1 if a seed could not be obtained.
static int SpinsInit(sysEA3D_t *sys){
  int site = 0;

  while(site < sys->NSBlk){
    const int reseed = ((site % (sys->hV)) == 0);
    if(reseed){
      globalSeed = initSeed31M1();
      if(globalSeed == 0) return 1;
    }

    // The draw order (reds 0..3, then blues 0..3) fixes the random stream.
    sys->h_reds0[site] = flatMSCbitDist31M1();
    sys->h_reds1[site] = flatMSCbitDist31M1();
    sys->h_reds2[site] = flatMSCbitDist31M1();
    sys->h_reds3[site] = flatMSCbitDist31M1();

    sys->h_blues0[site] = flatMSCbitDist31M1();
    sys->h_blues1[site] = flatMSCbitDist31M1();
    sys->h_blues2[site] = flatMSCbitDist31M1();
    sys->h_blues3[site] = flatMSCbitDist31M1();

    ++site;
  }

  return 0;
}

// Creates the output directories by running "mkdir" through system():
//  - sys->devShm when analysisF == 1;
//  - ./dummyRun<i> (single process) or ./dummyRun<i>_mpi<mpiid> (several
//    processes) for each system when dumpF == 1 and recoverF == 0;
//  - ./recover<startValue as 16 hex digits>/ and its dummyRun<i> children
//    when recoverF is 1 or 2.
// Returns 0 on success, 1 as soon as a command returns non-zero.
extern int dummyDirsMaker(sysEA3D_t *sys){
  char cmd[200];

  if(sys->analysisF == 1){
    snprintf(cmd, sizeof(cmd), "mkdir %s", sys->devShm);
    if(system(cmd) != 0) return 1;
  }

  // Fresh (non-recover) dump run: one directory per system; the name
  // carries the MPI rank only when more than one process is running.
  if(sys->dumpF == 1 && sys->recoverF == 0 && nprocs >= 1){
    for(int run = 0; run < sys->sysN; ++run){
      if(nprocs == 1){
        snprintf(cmd, sizeof(cmd), "mkdir ./dummyRun%d", run);
      }else{
        snprintf(cmd, sizeof(cmd), "mkdir ./dummyRun%d_mpi%d", run, mpiid);
      }
      if(system(cmd) != 0) return 1;
    }
  }

  const int recovering = (sys->recoverF == 1 || sys->recoverF == 2);
  if(recovering){
    snprintf(cmd, sizeof(cmd), "mkdir ./recover%016llX/", sys->startValue);
    if(system(cmd) != 0) return 1;

    for(int run = 0; run < sys->sysN; ++run){
      snprintf(cmd, sizeof(cmd), "mkdir ./recover%016llX/dummyRun%d", sys->startValue, run);
      if(system(cmd) != 0) return 1;
    }
  }

  return 0;
}

// Reads the couplings of all sys->sysN systems from the file "j_dump_start"
// in the current directory. For each system the file holds six consecutive
// arrays of V/2 MSC words in the order Jpx, Jpy, Jpz, Jmx, Jmy, Jmz; system
// i is stored at word offset i*(V/2) of each host array.
// Returns 0 on success, 1 if the file cannot be opened or the number of
// words read differs from NJBlk*6.
extern int JRead(sysEA3D_t *sys){
  FILE *jdump = fopen("j_dump_start", "rb");
  if(jdump == NULL){
    fprintf(stderr, "\nError opening j_dump_start\n");
    return 1;
  }
  int chkcp = 0;

  for(int i=0; i<sys->sysN; i++){
    // Each host array holds one V/2 slice per system. The six arrays are
    // consecutive only in the file, which fread walks sequentially, so the
    // memory offset must not carry the factor 6 (it would run past the
    // NJBlk-element buffers for every i >= 1).
    int offset = i*(sys->V/2);
    chkcp += fread(sys->h_Jpx + offset, sizeof(MSC), sys->V/2, jdump);
    chkcp += fread(sys->h_Jpy + offset, sizeof(MSC), sys->V/2, jdump);
    chkcp += fread(sys->h_Jpz + offset, sizeof(MSC), sys->V/2, jdump);
    chkcp += fread(sys->h_Jmx + offset, sizeof(MSC), sys->V/2, jdump);
    chkcp += fread(sys->h_Jmy + offset, sizeof(MSC), sys->V/2, jdump);
    chkcp += fread(sys->h_Jmz + offset, sizeof(MSC), sys->V/2, jdump);
  }

  fclose(jdump);
  if(chkcp != sys->NJBlk*6){
    fprintf(stderr, "\nError Copying couplings: wrong size\n");
    return 1;
  }

  return 0;
}

// Reads six arrays of V/2 MSC words (Jpx, Jpy, Jpz, Jmx, Jmy, Jmz) from the
// beginning of "j_dump_start" into the slot of system sysIndex, i.e. at word
// offset sysIndex*(V/2) of each host coupling array.
// Returns 0 on success, 1 if the file cannot be opened or fewer than
// 6*(V/2) words were read.
extern int JReadSys(sysEA3D_t *sys, int sysIndex){
  FILE *jdump = fopen("j_dump_start", "rb");
  if(!jdump){
    fprintf(stderr, "\nError opening j_dump_start\n");
    return 1;
  }

  const int base = sysIndex*(sys->V/2);
  int nRead = 0;

#define READ_COUPLING(field) \
  nRead += fread(sys->field + base, sizeof(MSC), sys->V/2, jdump)

  READ_COUPLING(h_Jpx);
  READ_COUPLING(h_Jpy);
  READ_COUPLING(h_Jpz);
  READ_COUPLING(h_Jmx);
  READ_COUPLING(h_Jmy);
  READ_COUPLING(h_Jmz);

#undef READ_COUPLING

  fclose(jdump);

  if(nRead != (sys->V/2)*6){
    fprintf(stderr, "\nError Copying couplings: wrong size\n");
    return 1;
  }

  return 0;
}

// Reads the spins of all sys->sysN systems from the file
// "spin_dump_L<L>T<int(beta*100)>time<t>" in the current directory. For each
// system the file holds eight consecutive arrays of V/2 MSC words in the
// order reds0, blues0, reds1, blues1, reds2, blues2, reds3, blues3; system i
// is stored at word offset i*(V/2) of each host array.
// Returns 0 on success, 1 if the file cannot be opened or the number of
// words read differs from NSBlk*2*4.
extern int SpinsRead(sysEA3D_t *sys, int t){
  char spinFileName[200];
  snprintf(spinFileName, sizeof(spinFileName), "spin_dump_L%dT%dtime%d",sys->L,(int)(sys->beta*100+.001), t);
  FILE *spindump = fopen(spinFileName, "rb");
  if(spindump == NULL){
    fprintf(stderr, "\nError opening %s\n", spinFileName);
    return 1;
  }

  int chkcp = 0;
  for(int i=0; i<sys->sysN; i++){
    // Each host array holds one V/2 slice per system. The eight arrays are
    // consecutive only in the file, which fread walks sequentially, so the
    // memory offset must not carry an extra factor (it would run past the
    // NSBlk-element buffers for every i >= 1).
    int offset = i*(sys->V/2);

    chkcp += fread(sys->h_reds0 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_blues0 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_reds1 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_blues1 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_reds2 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_blues2 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_reds3 + offset, sizeof(MSC), sys->V/2, spindump);
    chkcp += fread(sys->h_blues3 + offset, sizeof(MSC), sys->V/2, spindump);
  }
  fclose(spindump);

  if(chkcp != sys->NSBlk*2*4){
    fprintf(stderr, "\nError Copying spins: wrong size\n");
    return 1;
  }
  return 0;
}

// Reads eight arrays of V/2 MSC words (reds0, blues0, reds1, blues1, reds2,
// blues2, reds3, blues3) from the beginning of the file
// "spin_dump_L<L>T<int(beta*100)>time<t>" into the slot of system sysIndex,
// i.e. at word offset sysIndex*(V/2) of each host spin array.
// Returns 0 on success, 1 if the file cannot be opened or fewer than
// 8*(V/2) words were read.
extern int SpinsReadSys(sysEA3D_t *sys, int t, int sysIndex){
  char spinFileName[200];
  snprintf(spinFileName, sizeof(spinFileName), "spin_dump_L%dT%dtime%d",sys->L,(int)(sys->beta*100+.001), t);

  FILE *spindump = fopen(spinFileName, "rb");
  if(!spindump){
    fprintf(stderr, "\nError opening %s\n", spinFileName);
    return 1;
  }

  const int base = sysIndex*(sys->V/2);
  int nRead = 0;

#define READ_SPINS(field) \
  nRead += fread(sys->field + base, sizeof(MSC), sys->V/2, spindump)

  READ_SPINS(h_reds0);
  READ_SPINS(h_blues0);
  READ_SPINS(h_reds1);
  READ_SPINS(h_blues1);
  READ_SPINS(h_reds2);
  READ_SPINS(h_blues2);
  READ_SPINS(h_reds3);
  READ_SPINS(h_blues3);

#undef READ_SPINS

  fclose(spindump);

  if(nRead != (sys->V/2)*4*2){
    fprintf(stderr, "\nError Copying spins: wrong size\n");
    return 1;
  }
  return 0;
}

// Applies arrayTransformMSC to the hV-word slice of every system in each
// host spin array: the red arrays with the index table sys->realI_R, the
// blue arrays with sys->realI_B.
// Always returns 0.
extern int SpinsTransform(sysEA3D_t *sys){
  MSC *const redPlanes[4]  = { sys->h_reds0,  sys->h_reds1,  sys->h_reds2,  sys->h_reds3  };
  MSC *const bluePlanes[4] = { sys->h_blues0, sys->h_blues1, sys->h_blues2, sys->h_blues3 };

  for(int run = 0; run < sys->sysN; ++run){
    const int base = run*sys->hV;

    // Reds first, then blues, replica by replica.
    for(int r = 0; r < 4; ++r){
      arrayTransformMSC(redPlanes[r] + base, sys->realI_R, sys->hV);
    }
    for(int r = 0; r < 4; ++r){
      arrayTransformMSC(bluePlanes[r] + base, sys->realI_B, sys->hV);
    }
  }

  return 0;
}

// Applies arrayTransformMSC to the hV-word slice of every system in each
// host spin array: the red arrays with the index table sys->myI_R, the blue
// arrays with sys->myI_B.
// Always returns 0.
extern int SpinsTransformRead(sysEA3D_t *sys){
  MSC *const redPlanes[4]  = { sys->h_reds0,  sys->h_reds1,  sys->h_reds2,  sys->h_reds3  };
  MSC *const bluePlanes[4] = { sys->h_blues0, sys->h_blues1, sys->h_blues2, sys->h_blues3 };

  for(int run = 0; run < sys->sysN; ++run){
    const int base = run*sys->hV;

    // Reds first, then blues, replica by replica.
    for(int r = 0; r < 4; ++r){
      arrayTransformMSC(redPlanes[r] + base, sys->myI_R, sys->hV);
    }
    for(int r = 0; r < 4; ++r){
      arrayTransformMSC(bluePlanes[r] + base, sys->myI_B, sys->hV);
    }
  }

  return 0;
}

// Writes the spins of every system to its own file
// "spin_dump_L<L>T<int(beta*100)>time<t>", located in
//   ./dummyRun<i>_mpi<mpiid>/                 when nprocs > 1,
//   ./dummyRun<i>/                            when recoverF == 0,
//   ./recover<startValue, 16 hex>/dummyRun<i>/ when recoverF is 1 or 2.
// Each file receives eight arrays of V/2 MSC words in the order reds0,
// blues0, reds1, blues1, reds2, blues2, reds3, blues3, taken from word
// offset i*hV of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words written differs from NSBlk*2*replicas.
// NOTE(review): eight arrays are always written per system, so the final
// check presumably matches only when sys->replicas is 4 - confirm.
extern int SpinsDump(sysEA3D_t *sys, unsigned long long int t){
  char spinFileName[200];
  int written = 0;

#define WRITE_SPINS(field) \
  written += fwrite(sys->field + base, sizeof(MSC), sys->V/2, out)

  for(int run = 0; run < sys->sysN; ++run){
    const int base = run*(sys->hV);
    const int tempTag = (int)(sys->beta*100+.001);

    // The multi-process name takes precedence over the recover-flag names.
    if(nprocs > 1){
      snprintf(spinFileName, sizeof(spinFileName), "./dummyRun%d_mpi%d/spin_dump_L%dT%dtime%llu", run, mpiid, sys->L, tempTag, t);
    }else if(sys->recoverF == 0){
      snprintf(spinFileName, sizeof(spinFileName), "./dummyRun%d/spin_dump_L%dT%dtime%llu",
               run, sys->L, tempTag, t);
    }else if(sys->recoverF == 1 || sys->recoverF == 2){
      snprintf(spinFileName, sizeof(spinFileName), "./recover%016llX/dummyRun%d/spin_dump_L%dT%dtime%llu",
               sys->startValue, run, sys->L, tempTag, t);
    }

    FILE *out = fopen(spinFileName, "wb");
    if(!out){
      fprintf(stderr, "\nError opening pointer for file %s\n", spinFileName);
      return 1;
    }

    WRITE_SPINS(h_reds0);
    WRITE_SPINS(h_blues0);
    WRITE_SPINS(h_reds1);
    WRITE_SPINS(h_blues1);
    WRITE_SPINS(h_reds2);
    WRITE_SPINS(h_blues2);
    WRITE_SPINS(h_reds3);
    WRITE_SPINS(h_blues3);

    fflush(out);
    fclose(out);
  }

#undef WRITE_SPINS

  if(written != sys->NSBlk*2*sys->replicas){
    fprintf(stderr, "\nError writing spins SpinsDump\n");
    return 1;
  }
  return 0;
}

// Reads the spins of every system from
// "./run<i>/spin_dump_L<L>T<int(beta*100)>time<t>". Each file supplies eight
// arrays of V/2 MSC words in the order reds0, blues0, reds1, blues1, reds2,
// blues2, reds3, blues3, stored at word offset i*(V/2) of the host arrays.
// Only recoverF == 1 defines a file name; any other value is reported as an
// error instead of opening an uninitialized name.
// Returns 0 on success, 1 if recoverF is unsupported, a file cannot be
// opened, or the total number of words read differs from NSBlk*2*replicas.
extern int SpinsReadDir(sysEA3D_t *sys, unsigned long long int t){
  char spinFileName[200];

  int chkrd = 0;
  for(int i=0; i<sys->sysN; i++){
    int offset = i*(sys->V/2);

    if(sys->recoverF != 1){
      fprintf(stderr, "\nError SpinsReadDir: no input path defined for recover flag %d\n", (int)sys->recoverF);
      return 1;
    }
    snprintf(spinFileName, sizeof(spinFileName), "./run%d/spin_dump_L%dT%dtime%llu",
             i, sys->L,(int)(sys->beta*100+.001), t);

    FILE *in = fopen(spinFileName, "rb");
    if(in == NULL){
      fprintf(stderr, "\nError opening pointer for file %s\n", spinFileName);
      return 1;
    }

    chkrd += fread(sys->h_reds0 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_blues0 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_reds1 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_blues1 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_reds2 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_blues2 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_reds3 + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_blues3 + offset, sizeof(MSC), sys->V/2, in);

    // No fflush here: flushing an input stream is undefined in ISO C.
    fclose(in);

  }

  if(chkrd != sys->NSBlk*2*sys->replicas){
    // Report the count that was actually tested against.
    fprintf(stderr, "\nError reading spins SpinsReadDir %d %d\n", chkrd, (int)(sys->NSBlk*2*sys->replicas));
    return 1;
  }
  return 0;
}

// Applies arrayTransformMSC, with the index table sys->realI_R, to the
// hV-word slice of every system in each of the six host coupling arrays.
// Always returns 0.
extern int JTransform(sysEA3D_t *sys){
  MSC *const couplings[6] = { sys->h_Jpx, sys->h_Jpy, sys->h_Jpz,
                              sys->h_Jmx, sys->h_Jmy, sys->h_Jmz };

  for(int run = 0; run < sys->sysN; ++run){
    const int base = run*sys->hV;
    for(int dir = 0; dir < 6; ++dir){
      arrayTransformMSC(couplings[dir] + base, sys->realI_R, sys->hV);
    }
  }
  return 0;
}

// Applies arrayTransformMSC, with the index table sys->myI_R, to the
// hV-word slice of every system in each of the six host coupling arrays.
// Always returns 0.
extern int JTransformRead(sysEA3D_t *sys){
  MSC *const couplings[6] = { sys->h_Jpx, sys->h_Jpy, sys->h_Jpz,
                              sys->h_Jmx, sys->h_Jmy, sys->h_Jmz };

  for(int run = 0; run < sys->sysN; ++run){
    const int base = run*sys->hV;
    for(int dir = 0; dir < 6; ++dir){
      arrayTransformMSC(couplings[dir] + base, sys->myI_R, sys->hV);
    }
  }
  return 0;
}

// Writes the couplings of every system to its own file "j_dump_start",
// located in
//   ./dummyRun<i>_mpi<mpiid>/                 when nprocs > 1,
//   ./dummyRun<i>/                            when recoverF == 0,
//   ./recover<startValue, 16 hex>/dummyRun<i>/ when recoverF is 1 or 2.
// Each file receives six arrays of V/2 MSC words in the order Jpx, Jpy, Jpz,
// Jmx, Jmy, Jmz, taken from word offset i*(V/2) of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words written differs from NJBlk*6.
extern int JDump(sysEA3D_t *sys){
  char JFileName[200];
  int written = 0;

#define WRITE_COUPLING(field) \
  written += fwrite(sys->field + base, sizeof(MSC), sys->V/2, out)

  for(int run = 0; run < sys->sysN; ++run){
    const int base = run*(sys->V/2);

    // The multi-process name takes precedence over the recover-flag names.
    if(nprocs > 1){
      snprintf(JFileName, sizeof(JFileName), "./dummyRun%d_mpi%d/j_dump_start", run, mpiid);
    }else if(sys->recoverF == 0){
      snprintf(JFileName, sizeof(JFileName), "./dummyRun%d/j_dump_start", run);
    }else if(sys->recoverF == 1 || sys->recoverF == 2){
      snprintf(JFileName, sizeof(JFileName), "./recover%016llX/dummyRun%d/j_dump_start",
               sys->startValue, run);
    }

    FILE *out = fopen(JFileName, "wb");
    if(!out){
      fprintf(stderr, "\nError opening pointer for file %s\n", JFileName);
      return 1;
    }

    WRITE_COUPLING(h_Jpx);
    WRITE_COUPLING(h_Jpy);
    WRITE_COUPLING(h_Jpz);
    WRITE_COUPLING(h_Jmx);
    WRITE_COUPLING(h_Jmy);
    WRITE_COUPLING(h_Jmz);

    fflush(out);
    fclose(out);
  }

#undef WRITE_COUPLING

  if(written != sys->NJBlk*6){
    fprintf(stderr, "\nError writing j_dump_start\n");
    return 1;
  }

  return 0;
}

// Reads the couplings of every system from "./run<i>/j_dump_start". Each
// file supplies six arrays of V/2 MSC words in the order Jpx, Jpy, Jpz, Jmx,
// Jmy, Jmz, stored at word offset i*(V/2) of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words read differs from NJBlk*6.
extern int JReadDir(sysEA3D_t *sys){
  char JFileName[41];

  int chkrd = 0;
  for(int i=0; i<sys->sysN; i++){
    int offset = i*(sys->V/2);

    snprintf(JFileName, sizeof(JFileName), "./run%d/j_dump_start", i);
    FILE *in = fopen(JFileName, "rb");
    if(in == NULL){
      fprintf(stderr, "\nError opening pointer for file %s\n", JFileName);
      return 1;
    }

    chkrd += fread(sys->h_Jpx + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_Jpy + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_Jpz + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_Jmx + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_Jmy + offset, sizeof(MSC), sys->V/2, in);
    chkrd += fread(sys->h_Jmz + offset, sizeof(MSC), sys->V/2, in);

    // No fflush here: flushing an input stream is undefined in ISO C.
    fclose(in);

  }

  if(chkrd != sys->NJBlk*6){
    // This is a read path; the message used to say "writing".
    fprintf(stderr, "\nError reading j_dump_start\n");
    return 1;
  }

  return 0;
}

// Reads the couplings of every system from the per-process files
// "./run<i>_mpi<proc>/j_dump_start", proc = 0..recoverMPIprocs-1. Each file
// supplies six arrays of hV/recoverMPIprocs MSC words in the order Jpx, Jpy,
// Jpz, Jmx, Jmy, Jmz, stored at word offset
// i*hV + proc*hV/recoverMPIprocs of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words read differs from NJBlk*6.
extern int JReadDirMpi(sysEA3D_t *sys, int recoverMPIprocs){
  // Sized like the other file-name buffers in this file so that two
  // full-width %d fields cannot truncate the path.
  char JFileName[200];

  int chkrd = 0;
  for(int i=0; i<sys->sysN; i++){

    for(int proc=0; proc < recoverMPIprocs; proc++){
      snprintf(JFileName, sizeof(JFileName), "./run%d_mpi%d/j_dump_start", i, proc);
      FILE *in = fopen(JFileName, "rb");
      if(in == NULL){
        fprintf(stderr, "\nError opening pointer for file %s\n", JFileName);
        return 1;
      }

      int offset = i*sys->hV + proc*sys->hV/recoverMPIprocs;
      int readDim = sys->hV/recoverMPIprocs;

      chkrd += fread(sys->h_Jpx + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_Jpy + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_Jpz + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_Jmx + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_Jmy + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_Jmz + offset, sizeof(MSC), readDim, in);

      // No fflush here: flushing an input stream is undefined in ISO C.
      fclose(in);
    }

  }

  if(chkrd != sys->NJBlk*6){
    // This is a read path; the message used to say "writing".
    fprintf(stderr, "\nError reading j_dump_start\n");
    return 1;
  }

  return 0;
}

// Reads the spins of every system from the per-process files
// "./run<i>_mpi<proc>/spin_dump_L<L>T<int(beta*100)>time<t>",
// proc = 0..recoverMPIprocs-1. Each file supplies eight arrays of
// hV/recoverMPIprocs MSC words in the order reds0, blues0, reds1, blues1,
// reds2, blues2, reds3, blues3, stored at word offset
// i*hV + proc*hV/recoverMPIprocs of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words read differs from NSBlk*2*replicas.
extern int SpinsReadDirMpi(sysEA3D_t *sys, unsigned long long int t, int recoverMPIprocs){
  char spinFileName[200];

  int chkrd = 0;
  for(int i=0; i<sys->sysN; i++){

    for(int proc=0; proc < recoverMPIprocs; proc++){

      snprintf(spinFileName, sizeof(spinFileName), "./run%d_mpi%d/spin_dump_L%dT%dtime%llu",
               i, proc, sys->L,(int)(sys->beta*100+.001), t);

      FILE *in = fopen(spinFileName, "rb");
      if(in == NULL){
        fprintf(stderr, "\nError opening pointer for file %s\n", spinFileName);
        return 1;
      }

      int offset = i*sys->hV + proc*sys->hV/recoverMPIprocs;
      int readDim = sys->hV/recoverMPIprocs;

      chkrd += fread(sys->h_reds0 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_blues0 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_reds1 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_blues1 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_reds2 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_blues2 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_reds3 + offset, sizeof(MSC), readDim, in);
      chkrd += fread(sys->h_blues3 + offset, sizeof(MSC), readDim, in);

      // No fflush here: flushing an input stream is undefined in ISO C.
      fclose(in);
    }

  }

  if(chkrd != sys->NSBlk*2*sys->replicas){
    // Name this function and report the count that was actually tested.
    fprintf(stderr, "\nError reading spins SpinsReadDirMpi %d %d\n", chkrd, (int)(sys->NSBlk*2*sys->replicas));
    return 1;
  }
  return 0;
}

// Writes the spins of every system, split into recoverMPIprocs pieces, to
// "./recover<trec, 16 hex>/dummyRun<i>/spin_dump_L<L>T<int(beta*100)>time<t>_mpi<proc>".
// Each file receives eight arrays of hV/recoverMPIprocs MSC words in the
// order reds0, blues0, reds1, blues1, reds2, blues2, reds3, blues3, taken
// from word offset i*hV + proc*hV/recoverMPIprocs of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words written differs from NSBlk*2*replicas.
extern int SpinsDumpDirMpi(sysEA3D_t *sys, unsigned long long int trec, unsigned long long int t, int recoverMPIprocs){
  char spinFileName[200];

  int chkwr = 0;
  for(int i=0; i<sys->sysN; i++){

    for(int proc=0; proc < recoverMPIprocs; proc++){

      snprintf(spinFileName, sizeof(spinFileName), "./recover%016llX/dummyRun%d/spin_dump_L%dT%dtime%llu_mpi%d",
               trec, i, sys->L, (int)(sys->beta*100+.001), t, proc);

      FILE *out = fopen(spinFileName, "wb");
      if(out == NULL){
        fprintf(stderr, "\nError opening pointer for file %s\n", spinFileName);
        return 1;
      }

      int offset = i*sys->hV + proc*sys->hV/recoverMPIprocs;
      int writeDim = sys->hV/recoverMPIprocs;

      chkwr += fwrite(sys->h_reds0 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_blues0 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_reds1 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_blues1 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_reds2 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_blues2 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_reds3 + offset, sizeof(MSC), writeDim, out);
      chkwr += fwrite(sys->h_blues3 + offset, sizeof(MSC), writeDim, out);

      fflush(out);
      fclose(out);
    }

  }

  if(chkwr != sys->NSBlk*2*sys->replicas){
    // This is a write path; the message used to claim a read failure in
    // SpinsReadDir and printed a count that was not the one tested.
    fprintf(stderr, "\nError writing spins SpinsDumpDirMpi %d %d\n", chkwr, (int)(sys->NSBlk*2*sys->replicas));
    return 1;
  }
  return 0;
}

// Writes the couplings of every system, split into recoverMPIprocs pieces,
// to "./recover<t, 16 hex>/dummyRun<i>/j_dump_start_mpi<proc>".
// Each file receives six arrays of hV/recoverMPIprocs MSC words in the order
// Jpx, Jpy, Jpz, Jmx, Jmy, Jmz, taken from word offset
// i*hV + proc*hV/recoverMPIprocs of the host arrays.
// Returns 0 on success, 1 if a file cannot be opened or the total number of
// words written differs from NJBlk*6.
extern int JDumpDirMpi(sysEA3D_t *sys, unsigned long long int t, int recoverMPIprocs){
  char JFileName[200];
  int written = 0;

#define WRITE_COUPLING_PART(field) \
  written += fwrite(sys->field + base, sizeof(MSC), chunk, out)

  for(int run = 0; run < sys->sysN; ++run){
    int proc = 0;

    while(proc < recoverMPIprocs){
      snprintf(JFileName, sizeof(JFileName), "./recover%016llX/dummyRun%d/j_dump_start_mpi%d",
               t, run, proc);

      FILE *out = fopen(JFileName, "wb");
      if(!out){
        fprintf(stderr, "\nError opening pointer for file %s\n", JFileName);
        return 1;
      }

      // Piece `proc` of system `run`: start word and length.
      const int base = run*sys->hV + proc*sys->hV/recoverMPIprocs;
      const int chunk = sys->hV/recoverMPIprocs;

      WRITE_COUPLING_PART(h_Jpx);
      WRITE_COUPLING_PART(h_Jpy);
      WRITE_COUPLING_PART(h_Jpz);
      WRITE_COUPLING_PART(h_Jmx);
      WRITE_COUPLING_PART(h_Jmy);
      WRITE_COUPLING_PART(h_Jmz);

      fflush(out);
      fclose(out);

      ++proc;
    }
  }

#undef WRITE_COUPLING_PART

  if(written != sys->NJBlk*6){
    fprintf(stderr, "\nError writing j_dump_start_mpi\n");
    return 1;
  }

  return 0;
}


// Fills the host boundary "send" buffers from the bulk arrays, exchanges them
// through BlueMpi/RedMpi and uploads the received boundaries to the device.
//
// For every system the send buffers receive sys->A elements:
//   - blues : the last  sys->A elements of the system's blue spins
//   - reds  : the first sys->A elements of the system's red spins
//   - Jmz, Jpy, Jmx : the first sys->A elements of the respective couplings
// The couplings travel through the "red" channel of interexch.
//
// Returns 0 on success, 1 if a cudaMemcpy fails (via MYCUDA_ERROR).
static int slicedBoudaryInit(sysEA3D_t *sys){

  for(int n = 0; n < sys->sysN; ++n){

    const int bulk = n*sys->hV;                 // start of system n in the bulk arrays
    const int bnd  = n*sys->A;                  // start of system n in the boundary buffers
    const int tail = bulk + sys->hV - sys->A;   // start of the last slice of system n

    for(int s = 0; s < sys->A; ++s){
      const int dst = s + bnd;

      sys->h_blues0BS[dst] = sys->h_blues0[s + tail];
      sys->h_blues1BS[dst] = sys->h_blues1[s + tail];
      sys->h_blues2BS[dst] = sys->h_blues2[s + tail];
      sys->h_blues3BS[dst] = sys->h_blues3[s + tail];

      sys->h_reds0BS[dst] = sys->h_reds0[s + bulk];
      sys->h_reds1BS[dst] = sys->h_reds1[s + bulk];
      sys->h_reds2BS[dst] = sys->h_reds2[s + bulk];
      sys->h_reds3BS[dst] = sys->h_reds3[s + bulk];

      sys->h_JmzBS[dst] = sys->h_Jmz[s + bulk];
      sys->h_JpyBS[dst] = sys->h_Jpy[s + bulk];
      sys->h_JmxBS[dst] = sys->h_Jmx[s + bulk];
    }
  }

  // Every exchange below moves one boundary buffer of this size.
  interexch->nbyte=sys->sysN*sys->A*sizeof(MSC);

  // Blue spins.
  interexch->bluesend=sys->h_blues0BS;
  interexch->bluerecv=sys->h_blues0BR;
  BlueMpi(interexch);
  interexch->bluesend=sys->h_blues1BS;
  interexch->bluerecv=sys->h_blues1BR;
  BlueMpi(interexch);
  interexch->bluesend=sys->h_blues2BS;
  interexch->bluerecv=sys->h_blues2BR;
  BlueMpi(interexch);
  interexch->bluesend=sys->h_blues3BS;
  interexch->bluerecv=sys->h_blues3BR;
  BlueMpi(interexch);

  // Red spins.
  interexch->redsend=sys->h_reds0BS;
  interexch->redrecv=sys->h_reds0BR;
  RedMpi(interexch);
  interexch->redsend=sys->h_reds1BS;
  interexch->redrecv=sys->h_reds1BR;
  RedMpi(interexch);
  interexch->redsend=sys->h_reds2BS;
  interexch->redrecv=sys->h_reds2BR;
  RedMpi(interexch);
  interexch->redsend=sys->h_reds3BS;
  interexch->redrecv=sys->h_reds3BR;
  RedMpi(interexch);

  // Couplings, sent through the red channel.
  interexch->redsend=sys->h_JmzBS;
  interexch->redrecv=sys->h_JmzBR;
  RedMpi(interexch);
  interexch->redsend=sys->h_JpyBS;
  interexch->redrecv=sys->h_JpyBR;
  RedMpi(interexch);
  interexch->redsend=sys->h_JmxBS;
  interexch->redrecv=sys->h_JmxBR;
  RedMpi(interexch);

  // Upload what was received.
  const size_t spinBytes = sys->NSBnd*sizeof(MSC);
  const size_t jBytes = sys->NJBnd*sizeof(MSC);

  MYCUDA_ERROR( cudaMemcpy(sys->d_reds0BR, sys->h_reds0BR, spinBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_reds1BR, sys->h_reds1BR, spinBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_reds2BR, sys->h_reds2BR, spinBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_reds3BR, sys->h_reds3BR, spinBytes, cudaMemcpyHostToDevice) );

  MYCUDA_ERROR( cudaMemcpy(sys->d_blues0BR, sys->h_blues0BR, spinBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_blues1BR, sys->h_blues1BR, spinBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_blues2BR, sys->h_blues2BR, spinBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_blues3BR, sys->h_blues3BR, spinBytes, cudaMemcpyHostToDevice) );

  MYCUDA_ERROR( cudaMemcpy(sys->d_JmzBR, sys->h_JmzBR, jBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_JpyBR, sys->h_JpyBR, jBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_JmxBR, sys->h_JmxBR, jBytes, cudaMemcpyHostToDevice) );

  return 0;
}

#if 0
// Host-side refresh of the red boundary (compiled out by the enclosing #if 0).
// After D2HSpins(sys), the first sys->A red spins of every system are packed
// into the h_reds*BS buffers, which are then uploaded straight into the
// d_reds*BR device buffers (no MPI exchange is involved here).
// Returns 0 on success, 1 if a cudaMemcpy fails (via MYCUDA_ERROR).
static int slicedRedBoundaryCopy(sysEA3D_t *sys){

  D2HSpins(sys);

  for(int n = 0; n < sys->sysN; ++n){
    const int bulk = n*sys->hV;
    const int bnd = n*sys->A;

    for(int s = 0; s < sys->A; ++s){
      const int dst = s + bnd;
      const int src = s + bulk;

      sys->h_reds0BS[dst] = sys->h_reds0[src];
      sys->h_reds1BS[dst] = sys->h_reds1[src];
      sys->h_reds2BS[dst] = sys->h_reds2[src];
      sys->h_reds3BS[dst] = sys->h_reds3[src];
    }
  }

  const size_t bndBytes = sys->NSBnd*sizeof(MSC);

  MYCUDA_ERROR( cudaMemcpy(sys->d_reds0BR, sys->h_reds0BS, bndBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_reds1BR, sys->h_reds1BS, bndBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_reds2BR, sys->h_reds2BS, bndBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_reds3BR, sys->h_reds3BS, bndBytes, cudaMemcpyHostToDevice) );

  return 0;
}

// Host-side refresh of the blue boundary (compiled out by the enclosing #if 0).
// After D2HSpins(sys), the last sys->A blue spins of every system are packed
// into the h_blues*BS buffers, which are then uploaded straight into the
// d_blues*BR device buffers (no MPI exchange is involved here).
// Returns 0 on success, 1 if a cudaMemcpy fails (via MYCUDA_ERROR).
static int slicedBlueBoundaryCopy(sysEA3D_t *sys){

  D2HSpins(sys);

  for(int n = 0; n < sys->sysN; ++n){
    const int bnd = n*sys->A;
    const int tail = n*sys->hV + sys->hV - sys->A;   // first site of the last slice

    for(int s = 0; s < sys->A; ++s){
      const int dst = s + bnd;
      const int src = s + tail;

      sys->h_blues0BS[dst] = sys->h_blues0[src];
      sys->h_blues1BS[dst] = sys->h_blues1[src];
      sys->h_blues2BS[dst] = sys->h_blues2[src];
      sys->h_blues3BS[dst] = sys->h_blues3[src];
    }
  }

  const size_t bndBytes = sys->NSBnd*sizeof(MSC);

  MYCUDA_ERROR( cudaMemcpy(sys->d_blues0BR, sys->h_blues0BS, bndBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_blues1BR, sys->h_blues1BS, bndBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_blues2BR, sys->h_blues2BS, bndBytes, cudaMemcpyHostToDevice) );
  MYCUDA_ERROR( cudaMemcpy(sys->d_blues3BR, sys->h_blues3BS, bndBytes, cudaMemcpyHostToDevice) );

  return 0;
}
#endif

// Packs the red boundary of every system into contiguous buffers.
// Launch layout: blockIdx.y selects the system; blockIdx.x*blockDim.x +
// threadIdx.x selects one of the first d_A sites of that system (threads past
// d_A do nothing).
// s0..s3: destination buffers, d_A entries per system; entry n of system y is
//         read from texture_R0..texture_R3 at index n + y*d_hV.
__global__ void slicedRedsBoundaryCopy_Kernel(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3){
  const int site = blockIdx.x*blockDim.x + threadIdx.x;
  if(site >= d_A) return;

  const int srcBase = blockIdx.y*d_hV;
  const int dstBase = blockIdx.y*d_A;
  const int src = site + srcBase;
  const int dst = site + dstBase;

  s0[dst] = tex1Dfetch(texture_R0, src);
  s1[dst] = tex1Dfetch(texture_R1, src);
  s2[dst] = tex1Dfetch(texture_R2, src);
  s3[dst] = tex1Dfetch(texture_R3, src);
}

// Packs the blue boundary of every system into contiguous buffers.
// Launch layout: blockIdx.y selects the system; blockIdx.x*blockDim.x +
// threadIdx.x selects one of the last d_A sites of that system (threads past
// d_A do nothing).
// s0..s3: destination buffers, d_A entries per system; entry n of system y is
//         read from texture_B0..texture_B3 at index n + y*d_hV + d_hV - d_A.
__global__ void slicedBluesBoundaryCopy_Kernel(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3){
  const int site = blockIdx.x*blockDim.x + threadIdx.x;
  if(site >= d_A) return;

  const int srcBase = blockIdx.y*d_hV + d_hV - d_A;
  const int dstBase = blockIdx.y*d_A;
  const int src = site + srcBase;
  const int dst = site + dstBase;

  s0[dst] = tex1Dfetch(texture_B0, src);
  s1[dst] = tex1Dfetch(texture_B1, src);
  s2[dst] = tex1Dfetch(texture_B2, src);
  s3[dst] = tex1Dfetch(texture_B3, src);
}


#define BNDTHRESHOLD 256
// Refreshes the blue boundary: packs it on the device, copies it to the host,
// exchanges it through BlueMpi and uploads the received data.
//
// Steps, all issued on stream[0]:
//   1. slicedBluesBoundaryCopy_Kernel fills d_blues{0..3}BS;
//   2. the four send buffers are copied to h_blues{0..3}BS;
//   3. each buffer is exchanged with BlueMpi into h_blues{0..3}BR;
//   4. the received buffers are copied to d_blues{0..3}BR.
//
// sys     system descriptor (L, sysN, A, NSBnd and the boundary buffers)
// stream  array of CUDA streams; only stream[0] is used
//
// Returns 0 on success, 1 on any CUDA error (launch errors included).
static int slicedBoundary_BlueSendReceive(sysEA3D_t *sys, cudaStream_t *stream){

  // One thread per boundary site, at most BNDTHRESHOLD threads per block.
  const int area = sys->L*sys->L;
  int nt, nb;
  if(area > BNDTHRESHOLD) {
     nt = BNDTHRESHOLD;
     nb = (area + (BNDTHRESHOLD-1))/BNDTHRESHOLD;
  } else {
     nt = area;
     nb = 1;
  }

  dim3 blockCopyBuffer(nt, 1, 1);
  dim3 gridCopyBuffer(nb, sys->sysN, 1);

  slicedBluesBoundaryCopy_Kernel<<< gridCopyBuffer, blockCopyBuffer, 0, stream[0]>>>(sys->d_blues0BS, sys->d_blues1BS, sys->d_blues2BS, sys->d_blues3BS);
  // A kernel launch does not return a status: an invalid configuration is only
  // visible through cudaGetLastError(), which MYCUDA_LAST queries.
  MYCUDA_LAST("slicedBluesBoundaryCopy_Kernel");

  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues0BS, sys->d_blues0BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues1BS, sys->d_blues1BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues2BS, sys->d_blues2BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues3BS, sys->d_blues3BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );

  // The host buffers must be complete before they are handed to BlueMpi.
  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  interexch->nbyte=sys->sysN*sys->A*sizeof(MSC);
  interexch->bluerecv=sys->h_blues0BR;
  interexch->bluesend=sys->h_blues0BS;
  BlueMpi(interexch);
  interexch->bluerecv=sys->h_blues1BR;
  interexch->bluesend=sys->h_blues1BS;
  BlueMpi(interexch);
  interexch->bluerecv=sys->h_blues2BR;
  interexch->bluesend=sys->h_blues2BS;
  BlueMpi(interexch);
  interexch->bluerecv=sys->h_blues3BR;
  interexch->bluesend=sys->h_blues3BS;
  BlueMpi(interexch);

  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues0BR, sys->h_blues0BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues1BR, sys->h_blues1BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues2BR, sys->h_blues2BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues3BR, sys->h_blues3BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );

  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  return 0;
}

// Refreshes the red boundary: packs it on the device, copies it to the host,
// exchanges it through RedMpi and uploads the received data.
//
// Steps, all issued on stream[0]:
//   1. slicedRedsBoundaryCopy_Kernel fills d_reds{0..3}BS;
//   2. the four send buffers are copied to h_reds{0..3}BS;
//   3. each buffer is exchanged with RedMpi into h_reds{0..3}BR;
//   4. the received buffers are copied to d_reds{0..3}BR.
//
// sys     system descriptor (L, sysN, A, NSBnd and the boundary buffers)
// stream  array of CUDA streams; only stream[0] is used
//
// Returns 0 on success, 1 on any CUDA error (launch errors included).
static int slicedBoundary_RedSendReceive(sysEA3D_t *sys, cudaStream_t *stream){

  // One thread per boundary site, at most BNDTHRESHOLD threads per block.
  const int area = sys->L*sys->L;
  int nt, nb;
  if(area > BNDTHRESHOLD) {
     nt = BNDTHRESHOLD;
     nb = (area + (BNDTHRESHOLD-1))/BNDTHRESHOLD;
  } else {
     nt = area;
     nb = 1;
  }

  dim3 blockCopyBuffer(nt, 1, 1);
  dim3 gridCopyBuffer(nb, sys->sysN, 1);

  //printf("SR: [{%d, %d, %d}, {%d, %d, %d}]\n",
  //blockCopyBuffer.x, blockCopyBuffer.y, blockCopyBuffer.z,
  //gridCopyBuffer.x, gridCopyBuffer.y, gridCopyBuffer.z);

  slicedRedsBoundaryCopy_Kernel<<< gridCopyBuffer, blockCopyBuffer, 0, stream[0]>>>(sys->d_reds0BS, sys->d_reds1BS, sys->d_reds2BS, sys->d_reds3BS);
  // A kernel launch does not return a status: an invalid configuration is only
  // visible through cudaGetLastError(), which MYCUDA_LAST queries.
  MYCUDA_LAST("slicedRedsBoundaryCopy_Kernel");

  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds0BS, sys->d_reds0BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds1BS, sys->d_reds1BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds2BS, sys->d_reds2BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds3BS, sys->d_reds3BS, sys->NSBnd*sizeof(MSC), cudaMemcpyDeviceToHost, stream[0]) );

  // The host buffers must be complete before they are handed to RedMpi.
  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  interexch->nbyte=sys->sysN*sys->A*sizeof(MSC);
  interexch->redrecv=sys->h_reds0BR;
  interexch->redsend=sys->h_reds0BS;
  RedMpi(interexch);
  interexch->redrecv=sys->h_reds1BR;
  interexch->redsend=sys->h_reds1BS;
  RedMpi(interexch);
  interexch->redrecv=sys->h_reds2BR;
  interexch->redsend=sys->h_reds2BS;
  RedMpi(interexch);
  interexch->redrecv=sys->h_reds3BR;
  interexch->redsend=sys->h_reds3BS;
  RedMpi(interexch);

  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds0BR, sys->h_reds0BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds1BR, sys->h_reds1BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds2BR, sys->h_reds2BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds3BR, sys->h_reds3BR, sys->NSBnd*sizeof(MSC), cudaMemcpyHostToDevice, stream[0]) );

  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  return 0;
}

// Exchanges the blue boundary assuming d_blues{0..3}BS is already filled on
// the device: no packing kernel is launched here.
// The send buffers are copied to the host on stream[0], exchanged one at a
// time with BlueMpi, and the received buffers are copied back to
// d_blues{0..3}BR on the same stream.
// Returns 0 on success, 1 on any CUDA error (via MYCUDA_ERROR).
static int slicedBoundary_BlueSendReceiveNew(sysEA3D_t *sys, cudaStream_t *stream){

  const size_t bndBytes = sys->NSBnd*sizeof(MSC);

  // Device -> host: send buffers.
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues0BS, sys->d_blues0BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues1BS, sys->d_blues1BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues2BS, sys->d_blues2BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_blues3BS, sys->d_blues3BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );

  // The host copies must have landed before BlueMpi reads them.
  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  interexch->nbyte=sys->sysN*sys->A*sizeof(MSC);

  interexch->bluesend=sys->h_blues0BS;
  interexch->bluerecv=sys->h_blues0BR;
  BlueMpi(interexch);

  interexch->bluesend=sys->h_blues1BS;
  interexch->bluerecv=sys->h_blues1BR;
  BlueMpi(interexch);

  interexch->bluesend=sys->h_blues2BS;
  interexch->bluerecv=sys->h_blues2BR;
  BlueMpi(interexch);

  interexch->bluesend=sys->h_blues3BS;
  interexch->bluerecv=sys->h_blues3BR;
  BlueMpi(interexch);

  // Host -> device: received buffers.
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues0BR, sys->h_blues0BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues1BR, sys->h_blues1BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues2BR, sys->h_blues2BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_blues3BR, sys->h_blues3BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );

  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  return 0;
}

// Exchanges the red boundary assuming d_reds{0..3}BS is already filled on the
// device: no packing kernel is launched here.
// The send buffers are copied to the host on stream[0], exchanged one at a
// time with RedMpi, and the received buffers are copied back to
// d_reds{0..3}BR on the same stream.
// Returns 0 on success, 1 on any CUDA error (via MYCUDA_ERROR).
static int slicedBoundary_RedSendReceiveNew(sysEA3D_t *sys, cudaStream_t *stream){

  const size_t bndBytes = sys->NSBnd*sizeof(MSC);

  // Device -> host: send buffers.
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds0BS, sys->d_reds0BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds1BS, sys->d_reds1BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds2BS, sys->d_reds2BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->h_reds3BS, sys->d_reds3BS, bndBytes, cudaMemcpyDeviceToHost, stream[0]) );

  // The host copies must have landed before RedMpi reads them.
  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  interexch->nbyte=sys->sysN*sys->A*sizeof(MSC);

  interexch->redsend=sys->h_reds0BS;
  interexch->redrecv=sys->h_reds0BR;
  RedMpi(interexch);

  interexch->redsend=sys->h_reds1BS;
  interexch->redrecv=sys->h_reds1BR;
  RedMpi(interexch);

  interexch->redsend=sys->h_reds2BS;
  interexch->redrecv=sys->h_reds2BR;
  RedMpi(interexch);

  interexch->redsend=sys->h_reds3BS;
  interexch->redrecv=sys->h_reds3BR;
  RedMpi(interexch);

  // Host -> device: received buffers.
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds0BR, sys->h_reds0BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds1BR, sys->h_reds1BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds2BR, sys->h_reds2BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );
  MYCUDA_ERROR( cudaMemcpyAsync(sys->d_reds3BR, sys->h_reds3BR, bndBytes, cudaMemcpyHostToDevice, stream[0]) );

  MYCUDA_ERROR( cudaStreamSynchronize(stream[0]) );

  return 0;
}

// Allocates a system descriptor, fills it from the command line and runs the
// whole initialisation sequence.
//
// Exactly 15 arguments are expected after the program name; they are stored,
// in order, in: L, beta, steps, replicas, sysN, blockSize, k, kernelChoice,
// cardN, dumpF, analysisF, ram, recoverF, startValue, recoverMPIprocs.
//
// Sequence performed after parsing:
//   - all buffer pointers are reset to NULL and devShm is built;
//   - the sizes V, A, hV, hA, hL, NJBnd, NSBnd, NJBlk, NSBlk are derived;
//   - with a single process (nprocs == 1) the index maps are allocated and
//     filled by mapsDefine;
//   - masksInit, MCWeightsInit, dMemInit and hMemInit are run;
//   - recoverF == 0: couplings and spins are created (JInit, SpinsInit);
//     recoverF == 1 / 2 (single process only): they are read back with
//     SpinsReadDir/JReadDir or SpinsReadDirMpi/JReadDirMpi;
//     in all three cases H2DSysCopy follows;
//   - optional dump directories, sliced-boundary setup (kernelChoice == 5),
//     spin/coupling transforms and initial dumps.
//
// Returns the initialised descriptor, or NULL on any failure (a message is
// printed on stderr; a wrong argument count only prints the usage text).
//
// NOTE(review): failures reported by the sub-initialisers still return NULL
// without releasing `ptr` or the buffers allocated so far; no teardown
// routine is visible from here, so that behaviour is left untouched.
extern sysEA3D_t *sysInit(int argc, char **argv){
  if(argc != 16){ usage(argv); return NULL; }
  sysEA3D_t *ptr = (sysEA3D_t *)malloc(sizeof(sysEA3D_t));
  if(ptr == NULL){
    fprintf(stderr, "\nNo Alloc ptr in sysInit\n");
    return NULL;
  }

  // Every field must actually be converted: the struct comes from malloc, so
  // a failed conversion would otherwise leave an indeterminate value behind.
  int parsed = 0;
  parsed += (sscanf(argv[1], "%d", &ptr->L) == 1);
  parsed += (sscanf(argv[2], "%f", &ptr->beta) == 1);
  parsed += (sscanf(argv[3], "%d", &ptr->steps) == 1);
  parsed += (sscanf(argv[4], "%d", &ptr->replicas) == 1);
  parsed += (sscanf(argv[5], "%d", &ptr->sysN) == 1);
  parsed += (sscanf(argv[6], "%d", &ptr->blockSize) == 1);
  parsed += (sscanf(argv[7], "%d", &ptr->k) == 1);
  parsed += (sscanf(argv[8], "%d", &ptr->kernelChoice) == 1);
  parsed += (sscanf(argv[9], "%d", &ptr->cardN) == 1);
  parsed += (sscanf(argv[10], "%d", &ptr->dumpF) == 1);
  parsed += (sscanf(argv[11], "%d", &ptr->analysisF) == 1);
  parsed += (sscanf(argv[12], "%d", &ptr->ram) == 1);
  parsed += (sscanf(argv[13], "%d", &ptr->recoverF) == 1);
  parsed += (sscanf(argv[14], "%llu", &ptr->startValue) == 1);
  parsed += (sscanf(argv[15], "%d", &ptr->recoverMPIprocs) == 1);
  if(parsed != 15){
    fprintf(stderr, "\nError parsing command line arguments in sysInit\n");
    usage(argv);
    free(ptr);
    return NULL;
  }

  ptr->h_reds0 = ptr->h_reds1 = ptr->h_reds2 = ptr->h_reds3 = NULL;
  ptr->h_blues0 = ptr->h_blues1 = ptr->h_blues2 = ptr->h_blues3 = NULL;
  ptr->d_reds0 = ptr->d_reds1 = ptr->d_reds2 = ptr->d_reds3 = NULL;
  ptr->d_blues0 = ptr->d_blues1 = ptr->d_blues2 = ptr->d_blues3 = NULL;

  ptr->h_reds0BS = ptr->h_reds1BS = ptr->h_reds2BS = ptr->h_reds3BS = NULL;
  ptr->h_reds0BR = ptr->h_reds1BR = ptr->h_reds2BR = ptr->h_reds3BR = NULL;
  ptr->h_blues0BS = ptr->h_blues1BS = ptr->h_blues2BS = ptr->h_blues3BS = NULL;
  ptr->h_blues0BR = ptr->h_blues1BR = ptr->h_blues2BR = ptr->h_blues3BR = NULL;

  ptr->d_reds0BS = ptr->d_reds1BS = ptr->d_reds2BS = ptr->d_reds3BS = NULL;
  ptr->d_reds0BR = ptr->d_reds1BR = ptr->d_reds2BR = ptr->d_reds3BR = NULL;
  ptr->d_blues0BS = ptr->d_blues1BS = ptr->d_blues2BS = ptr->d_blues3BS = NULL;
  ptr->d_blues0BR = ptr->d_blues1BR = ptr->d_blues2BR = ptr->d_blues3BR = NULL;

  ptr->h_Jpx = ptr->h_Jpy = ptr->h_Jpz = NULL;
  ptr->h_Jmx = ptr->h_Jmy = ptr->h_Jmz = NULL;
  ptr->d_Jpx = ptr->d_Jpy = ptr->d_Jpz = NULL;
  ptr->d_Jmx = ptr->d_Jmy = ptr->d_Jmz = NULL;

  ptr->h_JmxBS = ptr->h_JpyBS = ptr->h_JmzBS = NULL;
  ptr->h_JmxBR = ptr->h_JpyBR = ptr->h_JmzBR = NULL;

  ptr->d_JmxBS = ptr->d_JpyBS = ptr->d_JmzBS = NULL;
  ptr->d_JmxBR = ptr->d_JpyBR = ptr->d_JmzBR = NULL;

  ptr->realI_R = ptr->realI_B = NULL;
  ptr->myI_R = ptr->myI_B = NULL;
  ptr->myI_B2R = ptr->myI_R2B = NULL;

  // GPU selection
#if 0
  int numberOfGPUs = 0;
  cudaGetDeviceCount(&numberOfGPUs);
  if(ptr->cardN >= numberOfGPUs){
    fprintf(stderr, "gpu %d >= gpuN %d\n", ptr->cardN, numberOfGPUs);
    return NULL;
  }
  ptr->gpuNumber = numberOfGPUs;
#endif

  // ram option: the working directory lives under /dev/shm only when both
  // ram and analysisF are set, otherwise under the current directory.
  if(ptr->ram == 1 && ptr->analysisF == 1){
    snprintf(ptr->devShm, sizeof(ptr->devShm), "/dev/shm/dev%dL%dB%.3f/",
             ptr->cardN, ptr->L, ptr->beta);
  }else{
    snprintf(ptr->devShm, sizeof(ptr->devShm), "./dev%dL%dB%.3f/",
             ptr->cardN, ptr->L, ptr->beta);
  }

  // recover: a fresh run always starts from 0, whatever was passed.
  if(ptr->recoverF == 0) ptr->startValue = 0;

  ptr->V = ptr->L*ptr->L*ptr->L/nprocs;
  ptr->A = ptr->L*ptr->L;

  ptr->hV = ptr->V/2;
  ptr->hA = ptr->A/2;
  ptr->hL = ptr->L/2;

  printf("sysInit nprocs: %d\n", nprocs);

  if(nprocs == 1){
    ptr->realI_R = (int *)malloc(ptr->hV*sizeof(int));
    ptr->realI_B = (int *)malloc(ptr->hV*sizeof(int));
    ptr->myI_R = (int *)malloc(ptr->hV*sizeof(int));
    ptr->myI_B = (int *)malloc(ptr->hV*sizeof(int));
    ptr->myI_B2R = (int *)malloc(ptr->hV*sizeof(int));
    ptr->myI_R2B = (int *)malloc(ptr->hV*sizeof(int));

    // mapsDefine receives all six maps: none of them may be missing.
    if(ptr->realI_R == NULL || ptr->realI_B == NULL ||
       ptr->myI_R == NULL || ptr->myI_B == NULL ||
       ptr->myI_B2R == NULL || ptr->myI_R2B == NULL){
      fprintf(stderr, "\nNo Alloc index maps in sysInit\n");
      free(ptr->realI_R);
      free(ptr->realI_B);
      free(ptr->myI_R);
      free(ptr->myI_B);
      free(ptr->myI_B2R);
      free(ptr->myI_R2B);
      free(ptr);
      return NULL;
    }

    mapsDefine(ptr->realI_R, ptr->realI_B, ptr->myI_R, ptr->myI_B, ptr->myI_B2R, ptr->myI_R2B, ptr->hV, ptr->L);
  }

  ptr->NJBnd = ptr->sysN*ptr->A;
  ptr->NSBnd = ptr->sysN*ptr->A;

  ptr->NJBlk = ptr->sysN*ptr->hV;
  ptr->NSBlk = ptr->sysN*ptr->hV;

//  cudaSetDevice(ptr->cardN);
//  cudaDeviceProp prop;
//  cudaGetDeviceProperties(&prop, ptr->cardN);
// printf("GPU %d: %s\n", ptr->cardN, prop.name);

  if(masksInit(ptr)){
    fprintf(stderr,"\nError in masksInit\n");
    return NULL;
  }
  if(MCWeightsInit(ptr)){
    fprintf(stderr,"\nError in MCWeightsInit\n");
    return NULL;
  }
  if(dMemInit(ptr)){
    fprintf(stderr,"\nError in dMemInit\n");
    return NULL;
  }
  if(hMemInit(ptr)){
    fprintf(stderr,"\nError in hMemInit\n");
    return NULL;
  }

  // Fresh start: generate couplings and spins, then upload them.
  if(ptr->recoverF == 0){
    if(JInit(ptr)){
      fprintf(stderr,"\nError in JInit\n");
      return NULL;
    }

    if(SpinsInit(ptr)){
      fprintf(stderr,"\nError in SpinsInit\n");
      return NULL;
    }
    if(H2DSysCopy(ptr)){
      fprintf(stderr,"\nError in H2DSysCopy\n");
      return NULL;
    }
  }

  // Recover from a single-process dump.
  if(ptr->recoverF == 1 && nprocs == 1){
    if(SpinsReadDir(ptr, ptr->startValue)){
      fprintf(stderr,"\nError in SpinsReadDir\n");
      return NULL;
    }

    if(JReadDir(ptr)){
      fprintf(stderr,"\nError in JRead\n");
      return NULL;
    }

    if(ptr->kernelChoice == 1 || ptr->kernelChoice == 2 || ptr->kernelChoice == 5){
      SpinsTransformRead(ptr);
      JTransformRead(ptr);
    }

    if(H2DSysCopy(ptr)){
      fprintf(stderr,"\nError in H2DSysCopy\n");
      return NULL;
    }
  }

  // Recover from per-process (MPI) dumps.
  if(ptr->recoverF == 2 && nprocs == 1){
    if(SpinsReadDirMpi(ptr, ptr->startValue, ptr->recoverMPIprocs)){
      fprintf(stderr,"\nError in SpinsReadDirMpi\n");
      return NULL;
    }

    if(JReadDirMpi(ptr, ptr->recoverMPIprocs)){
      fprintf(stderr,"\nError in JReadMpi\n");
      return NULL;
    }

    if(H2DSysCopy(ptr)){
      fprintf(stderr,"\nError in H2DSysCopy\n");
      return NULL;
    }
  }

  if(ptr->dumpF == 1 || ptr->ram == 1 ){
    if(dummyDirsMaker(ptr)){
      fprintf(stderr,"\nError in dummyDirsMaker\n");
      return NULL;
    }

  }

  if(ptr->kernelChoice == 5){
    if(slicedBoudaryInit(ptr)){
      fprintf(stderr,"\nError in slicedBoundaryInit\n");
      return NULL;
    }
  }

  if((ptr->kernelChoice == 1 || ptr->kernelChoice == 2 || ptr->kernelChoice == 5) && nprocs == 1 && ptr->recoverF != 2){
    SpinsTransform(ptr);
    JTransform(ptr);
  }

  if(ptr->dumpF == 1){
    if(SpinsDump(ptr, ptr->startValue)){
      fprintf(stderr,"\nError in SpinsDump\n");
      return NULL;
    }

    if(JDump(ptr)){
      fprintf(stderr,"\nError in JDump\n");
      return NULL;
    }

    if(ptr->recoverF == 2){
      if(SpinsDumpDirMpi(ptr, ptr->startValue, ptr->startValue, ptr->recoverMPIprocs)){
        fprintf(stderr,"\nError in SpinsDumpDirMpi\n");
        return NULL;
      }
      if(JDumpDirMpi(ptr, ptr->startValue, ptr->recoverMPIprocs)){
        fprintf(stderr,"\nError in JDumpDirMpi\n");
        return NULL;
      }
    }

  }

  return ptr;
}

// BEGINNING KERNELS


//////////////////////////////////////////////////////////////////////
// KERNEL BITWISE
//////////////////////////////////////////////////////////////////////

// MC step over the red sites, with neighbour indices built from shifts and
// masks instead of divisions.
// Launch layout as read from the code: blockIdx.y selects the system (offset
// blockIdx.y*d_hV into the spin arrays); blockIdx.x*blockDim.x + threadIdx.x
// is the first site kk of the thread, which then visits k = kk,
// kk + d_loopStride, ... while k < d_hV.
// s0..s3: the four arrays handed to UPDATE_RED, paired with the blue textures
//         texture_B0..texture_B3.
// rand:   RNG states; entry kk + blockIdx.y*d_loopStride is loaded into `seed`
//         before the loop and stored back after it.
// NOTE(review): UPDATE_RED is defined outside this view; it presumably reads
// the locals i, spx, smx, spy, smy, spz, smz and advances `seed` - confirm
// before renaming anything here.
__global__ void MCStepReds_Bitwise(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int off = blockIdx.y*d_hV;
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  const int rngIndex = kk + blockIdx.y*d_loopStride;
  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_hV; k += d_loopStride){

    // Index of site k inside the arrays of system blockIdx.y.
    int i = k + off;

    // iz and iy are extracted from k with the shifts d_sdbLsm1 and d_sdbLm1;
    // diff is 1 when their parities differ, 0 otherwise.
    unsigned int iz=k>>d_sdbLsm1;
    unsigned int iy=(k - (iz<<d_sdbLsm1))>>d_sdbLm1;
    int diff=(iz&1)^(iy&1);

    // NOTE(review): for diff in {0,1} one of the two shift counts (-diff,
    // diff-1) is 0 and the other is negative. This looks like a branchless
    // select between k and its x-neighbour that relies on an out-of-range
    // shift producing 0 on the GPU; negative shift counts are undefined
    // behaviour in C/C++ - confirm.
    int spx=(k >> (-diff)) + (((k-(k&d_maskL)) + ((k+1)&d_maskL)) >> (diff-1)) + off;
    int smx=(((k-(k&d_maskL)) + ((k-1)&d_maskL)) >> (-diff)) + (k>>(diff-1)) + off;
    // y neighbours: the bits selected by d_maskL2 are replaced by those of k +/- d_Ldb2.
    int spy=(k-(k&d_maskL2))+((k+d_Ldb2)&d_maskL2) + off;
    int smy=(k-(k&d_maskL2))+((k-d_Ldb2)&d_maskL2) + off;
    // z neighbours: k +/- d_Lsdb2 masked with d_maskL3.
    int spz=((k+d_Lsdb2) & d_maskL3) + off;
    int smz=((k-d_Lsdb2) & d_maskL3) + off;

    UPDATE_RED(s0, texture_B0);
    UPDATE_RED(s1, texture_B1);
    UPDATE_RED(s2, texture_B2);
    UPDATE_RED(s3, texture_B3);
  }
  // Persist the RNG state for the next launch.
  rand[rngIndex] = seed;

  return;
}

// MC step over the blue sites, with neighbour indices built from shifts and
// masks instead of divisions.
// Launch layout as read from the code: blockIdx.y selects the system (offset
// blockIdx.y*d_hV into the spin arrays); blockIdx.x*blockDim.x + threadIdx.x
// is the first site kk of the thread, which then visits k = kk,
// kk + d_loopStride, ... while k < d_hV.
// s0..s3: the four arrays handed to UPDATE_BLUE, paired with the red textures
//         texture_R0..texture_R3.
// rand:   RNG states; entry kk + blockIdx.y*d_loopStride is loaded into `seed`
//         before the loop and stored back after it.
// NOTE(review): UPDATE_BLUE is defined outside this view; it presumably reads
// the locals i, spx, smx, spy, smy, spz, smz and advances `seed` - confirm
// before renaming anything here.
__global__ void MCStepBlues_Bitwise(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int off = blockIdx.y*d_hV;
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  const int rngIndex = kk + blockIdx.y*d_loopStride;
  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_hV; k += d_loopStride){

    // Index of site k inside the arrays of system blockIdx.y.
    int i = k + off;

    // iz and iy are extracted from k with the shifts d_sdbLsm1 and d_sdbLm1;
    // diff is 1 when their parities differ, 0 otherwise.
    unsigned int iz=k>>d_sdbLsm1;
    unsigned int iy=(k - (iz<<d_sdbLsm1))>>d_sdbLm1;
    int diff=(iz&1)^(iy&1);

    // Same construction as in the red kernel with the two shift counts
    // (-diff, diff-1) swapped.
    // NOTE(review): relies on an out-of-range (negative) shift count producing
    // 0 on the GPU to act as a branchless select; this is undefined behaviour
    // in C/C++ - confirm.
    int spx=(k >> (diff-1)) + (((k-(k&d_maskL)) + ((k+1)&d_maskL)) >> (-diff)) + off;
    int smx=(((k-(k&d_maskL)) + ((k-1)&d_maskL)) >> (diff-1)) + (k>>(-diff)) + off;
    // y neighbours: the bits selected by d_maskL2 are replaced by those of k +/- d_Ldb2.
    int spy=(k-(k&d_maskL2))+((k+d_Ldb2)&d_maskL2) + off;
    int smy=(k-(k&d_maskL2))+((k-d_Ldb2)&d_maskL2) + off;
    // z neighbours: k +/- d_Lsdb2 masked with d_maskL3.
    int spz=((k+d_Lsdb2) & d_maskL3) + off;
    int smz=((k-d_Lsdb2) & d_maskL3) + off;

    UPDATE_BLUE(s0, texture_R0);
    UPDATE_BLUE(s1, texture_R1);
    UPDATE_BLUE(s2, texture_R2);
    UPDATE_BLUE(s3, texture_R3);
  }
  // Persist the RNG state for the next launch.
  rand[rngIndex] = seed;

  return;
}

//////////////////////////////////////////////////////////////////////
// KERNEL STANDARD
//////////////////////////////////////////////////////////////////////


// MC step over the red sites using explicit (x, y, z) coordinates.
// Launch layout as read from the code: blockIdx.y selects the system (offset
// blockIdx.y*d_hV); each thread starts at kk = blockIdx.x*blockDim.x +
// threadIdx.x and visits k = kk, kk + d_loopStride, ... while k < d_hV.
// s0..s3: the four arrays handed to UPDATE_RED, paired with the blue textures
//         texture_B0..texture_B3.
// rand:   RNG states; entry kk + blockIdx.y*d_loopStride is loaded into `seed`
//         before the loop and stored back after it.
// NOTE(review): UPDATE_RED is defined outside this view; it presumably reads
// the locals i, spx, smx, spy, smy, spz, smz and advances `seed` - confirm.
__global__ void MCStepReds_Standard(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int off = blockIdx.y*d_hV;
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  const int rngIndex = kk + blockIdx.y*d_loopStride;
  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_hV; k += d_loopStride){

    // Index of site k inside the arrays of system blockIdx.y.
    int i = k + off;

    // Coordinates of site k: x runs over d_hL values, y over d_L, z = k/d_hA.
    int x = k%d_hL; int y = (k/d_hL)%d_L; int z = k/d_hA;
    // Parity of the row, which decides the x offsets below.
    int par = (y^z)&1;

    // SP(a, m) gives 0 when a >= m (wrap at the upper edge), SM(a, m) adds m
    // when a < 0 (wrap at the lower edge); otherwise both return a.
    int spx = i - x + SP(x + 1 - (par^1), d_hL);
    int smx = i - x + SM(x - 1 + par, d_hL);
    int spy = i + (SP(y + 1, d_L) - y)*d_hL;
    int smy = i + (SM(y - 1, d_L) - y)*d_hL;
    int spz = i + (SP(z + 1, d_L) - z)*d_hA;
    int smz = i + (SM(z - 1, d_L) - z)*d_hA;

    UPDATE_RED(s0, texture_B0);
    UPDATE_RED(s1, texture_B1);
    UPDATE_RED(s2, texture_B2);
    UPDATE_RED(s3, texture_B3);
  }

  // Persist the RNG state for the next launch.
  rand[rngIndex] = seed;

  return;
}

// MC step over the blue sites using explicit (x, y, z) coordinates.
// Launch layout as read from the code: blockIdx.y selects the system (offset
// blockIdx.y*d_hV); each thread starts at kk = blockIdx.x*blockDim.x +
// threadIdx.x and visits k = kk, kk + d_loopStride, ... while k < d_hV.
// s0..s3: the four arrays handed to UPDATE_BLUE, paired with the red textures
//         texture_R0..texture_R3.
// rand:   RNG states; entry kk + blockIdx.y*d_loopStride is loaded into `seed`
//         before the loop and stored back after it.
// NOTE(review): UPDATE_BLUE is defined outside this view; it presumably reads
// the locals i, spx, smx, spy, smy, spz, smz and advances `seed` - confirm.
__global__ void MCStepBlues_Standard(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int off = blockIdx.y*d_hV;
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  const int rngIndex = kk + blockIdx.y*d_loopStride;
  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_hV; k += d_loopStride){

    // Index of site k inside the arrays of system blockIdx.y.
    int i = k + off;

    // Coordinates of site k: x runs over d_hL values, y over d_L, z = k/d_hA.
    int x = k%d_hL; int y = (k/d_hL)%d_L; int z = k/d_hA;
    // Parity of the row; the x offsets use it with the opposite sense of the
    // red kernel (par and par^1 are exchanged).
    int par = (y^z)&1;

    // SP(a, m) gives 0 when a >= m (wrap at the upper edge), SM(a, m) adds m
    // when a < 0 (wrap at the lower edge); otherwise both return a.
    int spx = i - x + SP((x + 1 - par), d_hL);
    int smx = i - x + SM((x - 1 + (par^1)), d_hL);
    int spy = i + (SP(y + 1, d_L) - y)*d_hL;
    int smy = i + (SM(y - 1, d_L) - y)*d_hL;
    int spz = i + (SP(z + 1, d_L) - z)*d_hA;
    int smz = i + (SM(z - 1, d_L) - z)*d_hA;

    UPDATE_BLUE(s0, texture_R0);
    UPDATE_BLUE(s1, texture_R1);
    UPDATE_BLUE(s2, texture_R2);
    UPDATE_BLUE(s3, texture_R3);
  }

  // Persist the RNG state for the next launch.
  rand[rngIndex] = seed;

  return;
}

// MC step over the red sites with one thread per site (no loop): the site
// coordinates come directly from the launch geometry.
// As read from the code: threadIdx.x is the x coordinate, y = threadIdx.y +
// blockIdx.x*blockDim.y, blockIdx.y is used as the z coordinate and
// blockIdx.z selects the system (offset blockIdx.z*d_hV).
// s0..s3: the four arrays handed to UPDATE_RED, paired with the blue textures
//         texture_B0..texture_B3.
// rand:   RNG states, indexed by the site index i itself.
// NOTE(review): no bounds check is performed, so the launch is presumably
// sized to match the lattice exactly - confirm against the caller.
// NOTE(review): UPDATE_RED is defined outside this view; it presumably reads
// the locals i, spx, smx, spy, smy, spz, smz and advances `seed` - confirm.
__global__ void MCStepReds_StandardGrid(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  // Flat index of the site handled by this thread.
  const int i = threadIdx.x + threadIdx.y*blockDim.x + blockIdx.x*d_block_xStrideS + blockIdx.y*d_block_yStrideS + blockIdx.z*d_hV;
  const int y = threadIdx.y + blockIdx.x*blockDim.y;

  RNGT seed = rand[i];
  // Parity of the row, which decides the x offsets below.
  int par = (y^blockIdx.y)&1;

  // SP(a, m) gives 0 when a >= m; SM(a, m) adds m when a < 0; SMM(a, m) is
  // a - 1, plus m when a < 1 (a "minus one with wrap" that never needs a
  // negative argument, which matters for the unsigned thread/block indices).
  int spx = i - threadIdx.x + SP(threadIdx.x + 1 - (par^1), d_hL);
  int smx = i - threadIdx.x + SMM(threadIdx.x + par, d_hL);
  int spy = i + (SP(y + 1, d_L) - y)*d_hL;
  int smy = i + (SM(y - 1, d_L) - y)*d_hL;
  int spz = i + (SP(blockIdx.y + 1, d_L) - blockIdx.y)*d_hA;
  int smz = i + (SMM(blockIdx.y, d_L) - blockIdx.y)*d_hA;

  UPDATE_RED(s0, texture_B0);
  UPDATE_RED(s1, texture_B1);
  UPDATE_RED(s2, texture_B2);
  UPDATE_RED(s3, texture_B3);

  // Persist the RNG state for the next launch.
  rand[i] = seed;

  return;
}

// MC step over the blue sites with one thread per site (no loop): the site
// coordinates come directly from the launch geometry.
// As read from the code: threadIdx.x is the x coordinate, y = threadIdx.y +
// blockIdx.x*blockDim.y, blockIdx.y is used as the z coordinate and
// blockIdx.z selects the system (offset blockIdx.z*d_hV).
// s0..s3: the four arrays handed to UPDATE_BLUE, paired with the red textures
//         texture_R0..texture_R3.
// rand:   RNG states, indexed by the site index i itself.
// NOTE(review): no bounds check is performed, so the launch is presumably
// sized to match the lattice exactly - confirm against the caller.
// NOTE(review): UPDATE_BLUE is defined outside this view; it presumably reads
// the locals i, spx, smx, spy, smy, spz, smz and advances `seed` - confirm.
__global__ void MCStepBlues_StandardGrid(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  // Flat index of the site handled by this thread.
  const int i = threadIdx.x + threadIdx.y*blockDim.x + blockIdx.x*d_block_xStrideS + blockIdx.y*d_block_yStrideS + blockIdx.z*d_hV;
  const int y = threadIdx.y + blockIdx.x*blockDim.y;

  RNGT seed = rand[i];

  // Parity of the row; the x offsets use it with the opposite sense of the
  // red kernel (par and par^1 are exchanged).
  int par = (y^blockIdx.y)&1;

  // SP(a, m) gives 0 when a >= m; SM(a, m) adds m when a < 0; SMM(a, m) is
  // a - 1, plus m when a < 1.
  int spx = i - threadIdx.x + SP(threadIdx.x + 1 - par, d_hL);
  int smx = i - threadIdx.x + SMM(threadIdx.x + (par^1), d_hL);
  int spy = i + (SP(y + 1, d_L) - y)*d_hL;
  int smy = i + (SM(y - 1, d_L) - y)*d_hL;
  int spz = i + (SP(blockIdx.y + 1, d_L) - blockIdx.y)*d_hA;
  int smz = i + (SMM(blockIdx.y, d_L) - blockIdx.y)*d_hA;

  UPDATE_BLUE(s0, texture_R0);
  UPDATE_BLUE(s1, texture_R1);
  UPDATE_BLUE(s2, texture_R2);
  UPDATE_BLUE(s3, texture_R3);

  // Persist the RNG state for the next launch.
  rand[i] = seed;

  return;
}


//////////////////////////////////////////////////////////////////////
// KERNEL SLICED
//////////////////////////////////////////////////////////////////////

// Red-sublattice sweep for the sliced layout, looped launch.
// Each thread starts at its global x index and visits the sites
// first, first + d_loopStride, ... below d_hV inside the system selected by
// blockIdx.y. Its RNG state lives at rand[first + blockIdx.y*d_loopStride]
// and is stored back once the loop is over.
// The locals i, y, seed and the sm*/sp* indices keep their names because the
// UPDATE_RED_SLICED macro (defined elsewhere in the file) is invoked with only
// the spin array and the texture, and is expected to read the rest by name.
__global__ void MCStepReds_Sliced(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int first = threadIdx.x + blockDim.x*blockIdx.x;
  const int sysBase = d_hV*blockIdx.y;
  const int rngSlot = d_loopStride*blockIdx.y + first;

  RNGT seed = rand[rngSlot];

  for(int site = first; site < d_hV; site += d_loopStride){

    // coordinates of the site inside one system
    const int col = site%d_L;
    const int plane = site/d_A;
    int y = (site/d_L)%d_L;

    // flat index of the site across all systems
    int i = sysBase + site;

    // neighbour indices, wrapped with SM (adds m below 0) and SP (0 at m)
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smz = i + (SM(plane - 1, d_hL) - plane)*d_A;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int spx = smy - col + SP(col + 1, d_L);
    int smx = spy - col + SM(col - 1, d_L);

    UPDATE_RED_SLICED(s0, texture_B0);
    UPDATE_RED_SLICED(s1, texture_B1);
    UPDATE_RED_SLICED(s2, texture_B2);
    UPDATE_RED_SLICED(s3, texture_B3);
  }

  // persist the RNG state for the next launch
  rand[rngSlot] = seed;
}

// Red-sublattice update restricted to the first d_A sites of each system
// (k runs over [0, d_A)), "compare" variant. The -z neighbour index (smz) is
// taken from j = k + blockIdx.y*d_A instead of being computed inside the
// system volume.
//
//   s0..s3 : spin arrays passed to UPDATE_RED_SLICED_BND together with the
//            bulk textures texture_B0..B3 and the textures texture_B0B..B3B
//   rand   : RNG states; rand[kk + blockIdx.y*d_rngStrideBnd] is loaded into
//            seed and written back at the end
//
// NOTE(review): UPDATE_RED_SLICED_BND is defined outside this section; which
// of the locals below it reads by name (i, j, seed, sm*/sp*, ...) and how the
// *B textures are indexed (presumably with j) must be checked there.
__global__ void MCStepReds_SlicedBnd_compare(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int off = blockIdx.y*d_hV;      // start of this system in the spin arrays
  const int bndOff = blockIdx.y*d_A;    // start of this system when the stride is d_A
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  // care is needed with d_loopStride; kept this way for now to allow the bitwise comparison
  const int rngIndex = kk + blockIdx.y*d_rngStrideBnd;

  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_A; k += d_loopStrideBnd){

    int i = k + off;
    int j = k + bndOff;

    int x = k%d_L;
    int y = (k/d_L)%d_L;

    // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
    int smz = j;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smx = spy - x + SM(x - 1, d_L);
    int spx = smy - x + SP(x + 1, d_L);

    UPDATE_RED_SLICED_BND(s0, texture_B0, texture_B0B);
    UPDATE_RED_SLICED_BND(s1, texture_B1, texture_B1B);
    UPDATE_RED_SLICED_BND(s2, texture_B2, texture_B2B);
    UPDATE_RED_SLICED_BND(s3, texture_B3, texture_B3B);
  }

  // persist the RNG state for the next launch
  rand[rngIndex] = seed;

  return;
}

// Same traversal as MCStepReds_SlicedBnd_compare (first d_A sites of each
// system, smz taken from j = k + blockIdx.y*d_A), but the update macro also
// receives a second set of arrays s0s..s3s.
//
//   s0..s3   : spin arrays
//   rand     : RNG states; rand[kk + blockIdx.y*d_rngStrideBnd] is loaded into
//              seed and written back at the end
//   s0s..s3s : extra arrays handed to UPDATE_RED_SLICED_BND_CP
//
// NOTE(review): UPDATE_RED_SLICED_BND_CP is defined outside this section;
// presumably it also copies the updated word into s?s (the "Cp" suffix) -
// confirm in the macro, together with the locals it reads by name.
__global__ void MCStepReds_SlicedBnd_compareCp(__restrict MSC *s0, __restrict MSC *s1, 
					       __restrict MSC *s2, __restrict MSC *s3, 
					       __restrict RNGT *rand,
					       __restrict MSC *s0s, __restrict MSC *s1s, 
					       __restrict MSC *s2s, __restrict MSC *s3s){

  const int off = blockIdx.y*d_hV;      // start of this system in the spin arrays
  const int bndOff = blockIdx.y*d_A;    // start of this system when the stride is d_A
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  // care is needed with d_loopStride; kept this way for now to allow the bitwise comparison
  const int rngIndex = kk + blockIdx.y*d_rngStrideBnd;

  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_A; k += d_loopStrideBnd){

    int i = k + off;
    int j = k + bndOff;

    int x = k%d_L;
    int y = (k/d_L)%d_L;

    // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
    int smz = j;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smx = spy - x + SM(x - 1, d_L);
    int spx = smy - x + SP(x + 1, d_L);

    UPDATE_RED_SLICED_BND_CP(s0, s0s, texture_B0, texture_B0B);
    UPDATE_RED_SLICED_BND_CP(s1, s1s, texture_B1, texture_B1B);
    UPDATE_RED_SLICED_BND_CP(s2, s2s, texture_B2, texture_B2B);
    UPDATE_RED_SLICED_BND_CP(s3, s3s, texture_B3, texture_B3B);
  }

  // persist the RNG state for the next launch
  rand[rngIndex] = seed;

  return;
}

// Red-sublattice sweep over the sites d_A .. d_hV-1 of each system (the part
// not covered by the first d_A sites), "compare" variant: the RNG slot is
// derived from the starting site index, which already includes the d_A shift.
// For these sites the -z neighbour is simply i - d_A; the general form is
// i + (SM(z - 1, d_hL) - z)*d_A with z = k/d_A.
// i, y, seed and the sm*/sp* indices keep their names because the
// UPDATE_RED_SLICED macro (defined elsewhere) is expected to read them by name.
__global__ void MCStepReds_SlicedBlk_compare(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int first = threadIdx.x + blockDim.x*blockIdx.x + d_A;
  const int sysBase = d_hV*blockIdx.y;
  const int rngSlot = d_rngStrideBlk*blockIdx.y + first;

  RNGT seed = rand[rngSlot];

  for(int site = first; site < d_hV; site += d_loopStrideBlk){

    const int col = site%d_L;
    int y = (site/d_L)%d_L;

    // flat index of the site across all systems
    int i = sysBase + site;

    // neighbour indices, wrapped with SM (adds m below 0) and SP (0 at m)
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smz = i - d_A;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int spx = smy - col + SP(col + 1, d_L);
    int smx = spy - col + SM(col - 1, d_L);

    UPDATE_RED_SLICED(s0, texture_B0);
    UPDATE_RED_SLICED(s1, texture_B1);
    UPDATE_RED_SLICED(s2, texture_B2);
    UPDATE_RED_SLICED(s3, texture_B3);
  }

  // persist the RNG state for the next launch
  rand[rngSlot] = seed;
}

// Red-sublattice update restricted to the first d_A sites of each system
// (k runs over [0, d_A)). The -z neighbour index (smz) is taken from
// j = k + blockIdx.y*d_A instead of being computed inside the system volume.
// The statements are the same as in MCStepReds_SlicedBnd_compare.
//
//   s0..s3 : spin arrays passed to UPDATE_RED_SLICED_BND together with the
//            bulk textures texture_B0..B3 and the textures texture_B0B..B3B
//   rand   : RNG states; rand[kk + blockIdx.y*d_rngStrideBnd] is loaded into
//            seed and written back at the end
//
// NOTE(review): UPDATE_RED_SLICED_BND is defined outside this section; which
// of the locals below it reads by name (i, j, seed, sm*/sp*, ...) and how the
// *B textures are indexed (presumably with j) must be checked there.
__global__ void MCStepReds_SlicedBnd(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int off = blockIdx.y*d_hV;      // start of this system in the spin arrays
  const int bndOff = blockIdx.y*d_A;    // start of this system when the stride is d_A
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  const int rngIndex = kk + blockIdx.y*d_rngStrideBnd;

  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_A; k += d_loopStrideBnd){

    int i = k + off;
    int j = k + bndOff;

    int x = k%d_L;
    int y = (k/d_L)%d_L;

    // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
    int smz = j;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smx = spy - x + SM(x - 1, d_L);
    int spx = smy - x + SP(x + 1, d_L);

    UPDATE_RED_SLICED_BND(s0, texture_B0, texture_B0B);
    UPDATE_RED_SLICED_BND(s1, texture_B1, texture_B1B);
    UPDATE_RED_SLICED_BND(s2, texture_B2, texture_B2B);
    UPDATE_RED_SLICED_BND(s3, texture_B3, texture_B3B);
  }

  // persist the RNG state for the next launch
  rand[rngIndex] = seed;

  return;
}

// Same traversal as MCStepReds_SlicedBnd (first d_A sites of each system,
// smz taken from j = k + blockIdx.y*d_A), but the update macro also receives
// a second set of arrays s0s..s3s.
//
//   s0..s3   : spin arrays
//   rand     : RNG states; rand[kk + blockIdx.y*d_rngStrideBnd] is loaded into
//              seed and written back at the end
//   s0s..s3s : extra arrays handed to UPDATE_RED_SLICED_BND_CP
//
// NOTE(review): UPDATE_RED_SLICED_BND_CP is defined outside this section;
// presumably it also copies the updated word into s?s (the "Cp" suffix) -
// confirm in the macro, together with the locals it reads by name.
__global__ void MCStepReds_SlicedBndCp(__restrict MSC *s0, __restrict MSC *s1,
				       __restrict MSC *s2, __restrict MSC *s3, 
				       __restrict RNGT *rand,
				       __restrict MSC *s0s, __restrict MSC *s1s,
				       __restrict MSC *s2s, __restrict MSC *s3s){

  const int off = blockIdx.y*d_hV;      // start of this system in the spin arrays
  const int bndOff = blockIdx.y*d_A;    // start of this system when the stride is d_A
  const int kk = blockIdx.x*blockDim.x + threadIdx.x;
  const int rngIndex = kk + blockIdx.y*d_rngStrideBnd;

  RNGT seed = rand[rngIndex];

  for(int k = kk; k < d_A; k += d_loopStrideBnd){

    int i = k + off;
    int j = k + bndOff;

    int x = k%d_L;
    int y = (k/d_L)%d_L;

    // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
    int smz = j;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smx = spy - x + SM(x - 1, d_L);
    int spx = smy - x + SP(x + 1, d_L);

    UPDATE_RED_SLICED_BND_CP(s0, s0s, texture_B0, texture_B0B);
    UPDATE_RED_SLICED_BND_CP(s1, s1s, texture_B1, texture_B1B);
    UPDATE_RED_SLICED_BND_CP(s2, s2s, texture_B2, texture_B2B);
    UPDATE_RED_SLICED_BND_CP(s3, s3s, texture_B3, texture_B3B);
  }

  // persist the RNG state for the next launch
  rand[rngIndex] = seed;

  return;
}

// Red-sublattice sweep over the sites d_A .. d_hV-1 of each system (the part
// not covered by the first d_A sites). Unlike the "_compare" variant, the RNG
// slot is built from the plain global x thread index, without the d_A shift
// that is applied to the starting site.
// For these sites the -z neighbour is simply i - d_A; the general form is
// i + (SM(z - 1, d_hL) - z)*d_A with z = k/d_A.
// i, y, seed and the sm*/sp* indices keep their names because the
// UPDATE_RED_SLICED macro (defined elsewhere) is expected to read them by name.
__global__ void MCStepReds_SlicedBlk(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

  const int lane = threadIdx.x + blockDim.x*blockIdx.x;
  const int first = lane + d_A;
  const int sysBase = d_hV*blockIdx.y;
  const int rngSlot = d_rngStrideBlk*blockIdx.y + lane;

  RNGT seed = rand[rngSlot];

  for(int site = first; site < d_hV; site += d_loopStrideBlk){

    const int col = site%d_L;
    int y = (site/d_L)%d_L;

    // flat index of the site across all systems
    int i = sysBase + site;

    // neighbour indices, wrapped with SM (adds m below 0) and SP (0 at m)
    int smy = i + (SM(y - 1, d_L) - y)*d_L;
    int smz = i - d_A;
    int spy = smz + (SP(y + 1, d_L) - y)*d_L;
    int spx = smy - col + SP(col + 1, d_L);
    int smx = spy - col + SM(col - 1, d_L);

    UPDATE_RED_SLICED(s0, texture_B0);
    UPDATE_RED_SLICED(s1, texture_B1);
    UPDATE_RED_SLICED(s2, texture_B2);
    UPDATE_RED_SLICED(s3, texture_B3);
  }

  // persist the RNG state for the next launch
  rand[rngSlot] = seed;
}


// Blue-sublattice sweep for the sliced layout, looped launch.
// Each thread starts at its global x index and visits the sites
// first, first + d_loopStride, ... below d_hV inside the system selected by
// blockIdx.y. Its RNG state lives at rand[first + blockIdx.y*d_loopStride]
// and is stored back once the loop is over.
// i, y, seed and the sm*/sp* indices keep their names because the
// UPDATE_BLUE_SLICED macro (defined elsewhere) is expected to read them by name.
__global__ void MCStepBlues_Sliced(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   const int first = threadIdx.x + blockDim.x*blockIdx.x;
   const int sysBase = d_hV*blockIdx.y;
   const int rngSlot = d_loopStride*blockIdx.y + first;

   RNGT seed = rand[rngSlot];

   for(int site = first; site < d_hV; site += d_loopStride){

     // coordinates of the site inside one system
     const int col = site%d_L;
     const int plane = site/d_A;
     int y = (site/d_L)%d_L;

     // flat index of the site across all systems
     int i = sysBase + site;

     // neighbour indices, wrapped with SP (0 at m) and SM (adds m below 0)
     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int spz = i + (SP(plane + 1, d_hL) - plane)*d_A;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - col + SP(col + 1, d_L);
     int smx = spy - col + SM(col - 1, d_L);

     UPDATE_BLUE_SLICED(s0, texture_R0);
     UPDATE_BLUE_SLICED(s1, texture_R1);
     UPDATE_BLUE_SLICED(s2, texture_R2);
     UPDATE_BLUE_SLICED(s3, texture_R3);
   }

   // persist the RNG state for the next launch
   rand[rngSlot] = seed;
 }


// Blue-sublattice update restricted to the last d_A sites of each system
// (k runs over [d_hV - d_A, d_hV)), "compare" variant. The +z neighbour index
// (spz) is taken from j = k + bndOff, which maps k back to the range
// [blockIdx.y*d_A, (blockIdx.y + 1)*d_A).
// In this variant the RNG slot is derived from kk, so it includes the
// d_hV - d_A shift of the starting site.
//
//   s0..s3 : spin arrays passed to UPDATE_BLUE_SLICED_BND together with the
//            bulk textures texture_R0..R3 and the textures texture_R0B..R3B
//   rand   : RNG states; rand[rngIndex] is loaded into seed and written back
//            at the end
//
// NOTE(review): UPDATE_BLUE_SLICED_BND is defined outside this section; which
// of the locals below it reads by name (i, j, seed, sm*/sp*, ...) and how the
// *B textures are indexed (presumably with j) must be checked there.
__global__ void MCStepBlues_SlicedBnd_compare(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   const int off = blockIdx.y*d_hV;                      // start of this system in the spin arrays
   const int bndOff = blockIdx.y*d_A - (d_hV - d_A);     // shifts k so that j starts at blockIdx.y*d_A
   const int kk = d_hV - d_A + blockIdx.x*blockDim.x + threadIdx.x;
   const int rngIndex = kk + blockIdx.y*d_rngStrideBnd;
   RNGT seed = rand[rngIndex];

   for(int k = kk; k < d_hV; k += d_loopStrideBnd){

     int i = k + off;
     int j = k + bndOff;

     int x = k%d_L;
     int y = (k/d_L)%d_L;

     // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
     int spz = j;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - x + SP(x + 1, d_L);

     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int smx = spy - x + SM(x - 1, d_L);

     UPDATE_BLUE_SLICED_BND(s0, texture_R0, texture_R0B);
     UPDATE_BLUE_SLICED_BND(s1, texture_R1, texture_R1B);
     UPDATE_BLUE_SLICED_BND(s2, texture_R2, texture_R2B);
     UPDATE_BLUE_SLICED_BND(s3, texture_R3, texture_R3B);
   }

   // persist the RNG state for the next launch
   rand[rngIndex] = seed;

   return;
}

// Same traversal as MCStepBlues_SlicedBnd_compare (last d_A sites of each
// system, spz taken from j = k + bndOff, RNG slot derived from kk), but the
// update macro also receives a second set of arrays s0s..s3s.
//
//   s0..s3   : spin arrays
//   rand     : RNG states; rand[rngIndex] is loaded into seed and written
//              back at the end
//   s0s..s3s : extra arrays handed to UPDATE_BLUE_SLICED_BND_CP
//
// NOTE(review): UPDATE_BLUE_SLICED_BND_CP is defined outside this section;
// presumably it also copies the updated word into s?s (the "Cp" suffix) -
// confirm in the macro, together with the locals it reads by name.
__global__ void MCStepBlues_SlicedBnd_compareCp(__restrict MSC *s0, __restrict MSC *s1, 
						__restrict MSC *s2, __restrict MSC *s3, 
						__restrict RNGT *rand,
						__restrict MSC *s0s, __restrict MSC *s1s, 
						__restrict MSC *s2s, __restrict MSC *s3s){

   const int off = blockIdx.y*d_hV;                      // start of this system in the spin arrays
   const int bndOff = blockIdx.y*d_A - (d_hV - d_A);     // shifts k so that j starts at blockIdx.y*d_A
   const int kk = d_hV - d_A + blockIdx.x*blockDim.x + threadIdx.x;
   const int rngIndex = kk + blockIdx.y*d_rngStrideBnd;
   RNGT seed = rand[rngIndex];

   for(int k = kk; k < d_hV; k += d_loopStrideBnd){

     int i = k + off;
     int j = k + bndOff;

     int x = k%d_L;
     int y = (k/d_L)%d_L;

     // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
     int spz = j;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - x + SP(x + 1, d_L);

     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int smx = spy - x + SM(x - 1, d_L);

     UPDATE_BLUE_SLICED_BND_CP(s0, s0s, texture_R0, texture_R0B);
     UPDATE_BLUE_SLICED_BND_CP(s1, s1s, texture_R1, texture_R1B);
     UPDATE_BLUE_SLICED_BND_CP(s2, s2s, texture_R2, texture_R2B);
     UPDATE_BLUE_SLICED_BND_CP(s3, s3s, texture_R3, texture_R3B);
   }

   // persist the RNG state for the next launch
   rand[rngIndex] = seed;

   return;
}

// Blue-sublattice sweep over the sites 0 .. d_hV-d_A-1 of each system (all
// but the last d_A sites), "compare" variant.
// For these sites the +z neighbour is simply i + d_A; the general form is
// i + (SP(z + 1, d_hL) - z)*d_A with z = k/d_A.
// i, y, seed and the sm*/sp* indices keep their names because the
// UPDATE_BLUE_SLICED macro (defined elsewhere) is expected to read them by name.
__global__ void MCStepBlues_SlicedBlk_compare(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   const int first = threadIdx.x + blockDim.x*blockIdx.x;
   const int sysBase = d_hV*blockIdx.y;
   const int rngSlot = d_rngStrideBlk*blockIdx.y + first;
   const int limit = d_hV - d_A;

   RNGT seed = rand[rngSlot];

   for(int site = first; site < limit; site += d_loopStrideBlk){

     const int col = site%d_L;
     int y = (site/d_L)%d_L;

     // flat index of the site across all systems
     int i = sysBase + site;

     // neighbour indices, wrapped with SP (0 at m) and SM (adds m below 0)
     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int spz = i + d_A;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - col + SP(col + 1, d_L);
     int smx = spy - col + SM(col - 1, d_L);

     UPDATE_BLUE_SLICED(s0, texture_R0);
     UPDATE_BLUE_SLICED(s1, texture_R1);
     UPDATE_BLUE_SLICED(s2, texture_R2);
     UPDATE_BLUE_SLICED(s3, texture_R3);
   }

   // persist the RNG state for the next launch
   rand[rngSlot] = seed;
 }

// Blue-sublattice update restricted to the last d_A sites of each system
// (k runs over [d_hV - d_A, d_hV)). The +z neighbour index (spz) is taken
// from j = k + bndOff, which maps k back to the range
// [blockIdx.y*d_A, (blockIdx.y + 1)*d_A).
// Unlike the "_compare" variant, the RNG slot is built from the plain global
// x thread index, without the d_hV - d_A shift applied to the starting site.
//
//   s0..s3 : spin arrays passed to UPDATE_BLUE_SLICED_BND together with the
//            bulk textures texture_R0..R3 and the textures texture_R0B..R3B
//   rand   : RNG states; rand[rngIndex] is loaded into seed and written back
//            at the end
//
// NOTE(review): UPDATE_BLUE_SLICED_BND is defined outside this section; which
// of the locals below it reads by name (i, j, seed, sm*/sp*, ...) and how the
// *B textures are indexed (presumably with j) must be checked there.
__global__ void MCStepBlues_SlicedBnd(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   const int off = blockIdx.y*d_hV;                      // start of this system in the spin arrays
   const int bndOff = blockIdx.y*d_A - (d_hV - d_A);     // shifts k so that j starts at blockIdx.y*d_A
   int rngIndex = blockIdx.x*blockDim.x + threadIdx.x;   // global x thread index, reused below
   const int kk = rngIndex + d_hV - d_A;                 // first site visited by this thread
   rngIndex += blockIdx.y*d_rngStrideBnd;                // final RNG slot for this thread

   RNGT seed = rand[rngIndex];

   for(int k = kk; k < d_hV; k += d_loopStrideBnd){

     int i = k + off;
     int j = k + bndOff;

     int x = k%d_L;
     int y = (k/d_L)%d_L;

     // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
     int spz = j;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - x + SP(x + 1, d_L);

     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int smx = spy - x + SM(x - 1, d_L);

     UPDATE_BLUE_SLICED_BND(s0, texture_R0, texture_R0B);
     UPDATE_BLUE_SLICED_BND(s1, texture_R1, texture_R1B);
     UPDATE_BLUE_SLICED_BND(s2, texture_R2, texture_R2B);
     UPDATE_BLUE_SLICED_BND(s3, texture_R3, texture_R3B);
   }

   // persist the RNG state for the next launch
   rand[rngIndex] = seed;

   return;
}

// Same traversal as MCStepBlues_SlicedBnd (last d_A sites of each system,
// spz taken from j = k + bndOff, RNG slot built from the plain global x
// thread index), but the update macro also receives a second set of arrays
// s0s..s3s.
//
//   s0..s3   : spin arrays
//   rand     : RNG states; rand[rngIndex] is loaded into seed and written
//              back at the end
//   s0s..s3s : extra arrays handed to UPDATE_BLUE_SLICED_BND_CP
//
// NOTE(review): UPDATE_BLUE_SLICED_BND_CP is defined outside this section;
// presumably it also copies the updated word into s?s (the "Cp" suffix) -
// confirm in the macro, together with the locals it reads by name.
__global__ void MCStepBlues_SlicedBndCp(__restrict MSC *s0, __restrict MSC *s1,
					__restrict MSC *s2, __restrict MSC *s3, 
					__restrict RNGT *rand,
					__restrict MSC *s0s, __restrict MSC *s1s,
					__restrict MSC *s2s, __restrict MSC *s3s){

   const int off = blockIdx.y*d_hV;                      // start of this system in the spin arrays
   const int bndOff = blockIdx.y*d_A - (d_hV - d_A);     // shifts k so that j starts at blockIdx.y*d_A
   int rngIndex = blockIdx.x*blockDim.x + threadIdx.x;   // global x thread index, reused below
   const int kk = rngIndex + d_hV - d_A;                 // first site visited by this thread
   rngIndex += blockIdx.y*d_rngStrideBnd;                // final RNG slot for this thread

   RNGT seed = rand[rngIndex];

   for(int k = kk; k < d_hV; k += d_loopStrideBnd){

     int i = k + off;
     int j = k + bndOff;

     int x = k%d_L;
     int y = (k/d_L)%d_L;

     // neighbour indices; SP wraps to 0 at d_L, SM adds d_L below 0
     int spz = j;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - x + SP(x + 1, d_L);

     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int smx = spy - x + SM(x - 1, d_L);

     UPDATE_BLUE_SLICED_BND_CP(s0, s0s, texture_R0, texture_R0B);
     UPDATE_BLUE_SLICED_BND_CP(s1, s1s, texture_R1, texture_R1B);
     UPDATE_BLUE_SLICED_BND_CP(s2, s2s, texture_R2, texture_R2B);
     UPDATE_BLUE_SLICED_BND_CP(s3, s3s, texture_R3, texture_R3B);
   }

   // persist the RNG state for the next launch
   rand[rngIndex] = seed;

   return;
}

// Blue-sublattice sweep over the sites 0 .. d_hV-d_A-1 of each system (all
// but the last d_A sites).
// For these sites the +z neighbour is simply i + d_A; the general form is
// i + (SP(z + 1, d_hL) - z)*d_A with z = k/d_A.
// i, y, seed and the sm*/sp* indices keep their names because the
// UPDATE_BLUE_SLICED macro (defined elsewhere) is expected to read them by name.
__global__ void MCStepBlues_SlicedBlk(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   const int first = threadIdx.x + blockDim.x*blockIdx.x;
   const int sysBase = d_hV*blockIdx.y;
   const int rngSlot = d_rngStrideBlk*blockIdx.y + first;
   const int limit = d_hV - d_A;

   RNGT seed = rand[rngSlot];

   for(int site = first; site < limit; site += d_loopStrideBlk){

     const int col = site%d_L;
     int y = (site/d_L)%d_L;

     // flat index of the site across all systems
     int i = sysBase + site;

     // neighbour indices, wrapped with SP (0 at m) and SM (adds m below 0)
     int spy = i + (SP(y + 1, d_L) - y)*d_L;
     int spz = i + d_A;
     int smy = spz + (SM(y - 1, d_L) - y)*d_L;
     int spx = smy - col + SP(col + 1, d_L);
     int smx = spy - col + SM(col - 1, d_L);

     UPDATE_BLUE_SLICED(s0, texture_R0);
     UPDATE_BLUE_SLICED(s1, texture_R1);
     UPDATE_BLUE_SLICED(s2, texture_R2);
     UPDATE_BLUE_SLICED(s3, texture_R3);
   }

   // persist the RNG state for the next launch
   rand[rngSlot] = seed;
}

 // Red-sublattice update for the sliced layout, "grid" launch: every thread
 // handles exactly one site (no loop) and uses the RNG state stored at the
 // same flat index. x comes from threadIdx.x, y from threadIdx.y/blockIdx.x
 // and the z slice from blockIdx.y.
 //
 //   s0..s3 : spin arrays passed to UPDATE_RED_SLICED with texture_B0..B3
 //   rand   : per-thread RNG states; rand[i] is loaded into seed before the
 //            updates and the advanced value is written back afterwards
 //
 // NOTE(review): UPDATE_RED_SLICED is defined outside this section and
 // presumably reads i, seed and the sm*/sp* indices by name - confirm before
 // renaming any local.
 __global__ void MCStepReds_SlicedGrid(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   // flat index: position inside the block + block strides + blockIdx.z*d_hV
   const int i = threadIdx.x + threadIdx.y*blockDim.x + blockIdx.x*d_block_xStride + blockIdx.y*d_block_yStride + blockIdx.z*d_hV;
   // y coordinate rebuilt from the thread/block layout along y
   const int y = threadIdx.y + blockIdx.x*blockDim.y;

   RNGT seed = rand[i];

   // Neighbour indices with wrap-around: SMM(a,m) is a-1 plus m when a < 1,
   // SP(a,m) yields 0 when a >= m.
   int smz = i + (SMM(blockIdx.y, d_hL) - blockIdx.y)*d_A;
   int spy = smz + (SP(y + 1, d_L) - y)*d_L;
   int smy = i + (SMM(y, d_L) - y)*d_L;
   int smx = spy - threadIdx.x + SMM(threadIdx.x, d_L);
   int spx = smy - threadIdx.x + SP(threadIdx.x + 1, d_L);

   UPDATE_RED_SLICED(s0, texture_B0);
   UPDATE_RED_SLICED(s1, texture_B1);
   UPDATE_RED_SLICED(s2, texture_B2);
   UPDATE_RED_SLICED(s3, texture_B3);

   // persist the RNG state for the next launch
   rand[i] = seed;

   return;
 }

 // Blue-sublattice update for the sliced layout, "grid" launch: every thread
 // handles exactly one site (no loop) and uses the RNG state stored at the
 // same flat index. x comes from threadIdx.x, y from threadIdx.y/blockIdx.x
 // and the z slice from blockIdx.y.
 //
 //   s0..s3 : spin arrays passed to UPDATE_BLUE_SLICED with texture_R0..R3
 //   rand   : per-thread RNG states; rand[i] is loaded into seed before the
 //            updates and the advanced value is written back afterwards
 //
 // NOTE(review): UPDATE_BLUE_SLICED is defined outside this section and
 // presumably reads i, seed and the sm*/sp* indices by name - confirm before
 // renaming any local.
 __global__ void MCStepBlues_SlicedGrid(__restrict MSC *s0, __restrict MSC *s1, __restrict MSC *s2, __restrict MSC *s3, __restrict RNGT *rand){

   // flat index: position inside the block + block strides + blockIdx.z*d_hV
   const int i = threadIdx.x + threadIdx.y*blockDim.x + blockIdx.x*d_block_xStride + blockIdx.y*d_block_yStride + blockIdx.z*d_hV;
   // y coordinate rebuilt from the thread/block layout along y
   const int y = threadIdx.y + blockIdx.x*blockDim.y;

   RNGT seed = rand[i];

   // Neighbour indices with wrap-around: SP(a,m) yields 0 when a >= m,
   // SMM(a,m) is a-1 plus m when a < 1.
   int spz = i + (SP(blockIdx.y + 1, d_hL) - blockIdx.y)*d_A;
   int spy = i + (SP(y + 1, d_L) - y)*d_L;
   int smy = spz + (SMM(y, d_L) - y)*d_L;
   int smx = spy - threadIdx.x + SMM(threadIdx.x, d_L);
   int spx = smy - threadIdx.x + SP(threadIdx.x + 1, d_L);

   UPDATE_BLUE_SLICED(s0, texture_R0);
   UPDATE_BLUE_SLICED(s1, texture_R1);
   UPDATE_BLUE_SLICED(s2, texture_R2);
   UPDATE_BLUE_SLICED(s3, texture_R3);

   // persist the RNG state for the next launch
   rand[i] = seed;

   return;
 }

 // END KERNELS

 // Test kernel: applies the PR32 step `steps` times on the array ira.
 // Each thread uses its own slot (global x index + blockIdx.y*d_hV) as an
 // offset added to the four running positions ip, ip1, ip2, ip3; after every
 // step the positions advance by d_threads modulo d_iraLen.
 __global__ void testParisiRapuanoKernel(__restrict RNGT *ira, int steps,
                                         int ip, int ip1, int ip2, int ip3){
   const int slot = (threadIdx.x + blockDim.x*blockIdx.x) + d_hV*blockIdx.y;

   for(int done = 0; done != steps && done < steps; ++done){
     PR32(ira, ip + slot, ip1 + slot, ip2 + slot, ip3 + slot);

     // advance the four positions together
     ip  = (d_threads + ip )%d_iraLen;
     ip1 = (d_threads + ip1)%d_iraLen;
     ip2 = (d_threads + ip2)%d_iraLen;
     ip3 = (d_threads + ip3)%d_iraLen;
   }
 }

 // Test kernel: applies the PR32 step `steps` times on the array ira, with
 // the four positions ip, ip1, ip2, ip3 kept as small counters. Each counter
 // is scaled by d_threads and offset by the thread slot (global x index +
 // blockIdx.y*d_hV) to address ira, then incremented and masked with
 // PRWHEELMASK.
 __global__ void testParisiRapuanoKernelMod(__restrict RNGT *ira, int steps,
                                         int ip, int ip1, int ip2, int ip3){
   const int slot = (threadIdx.x + blockDim.x*blockIdx.x) + d_hV*blockIdx.y;

   for(int done = 0; done != steps && done < steps; ++done){
     PR32(ira, ip*d_threads + slot, ip1*d_threads + slot, ip2*d_threads + slot, ip3*d_threads + slot);

     // step the four counters together
     ip  = (1 + ip )&PRWHEELMASK;
     ip1 = (1 + ip1)&PRWHEELMASK;
     ip2 = (1 + ip2)&PRWHEELMASK;
     ip3 = (1 + ip3)&PRWHEELMASK;
   }
 }

 // Test kernel: advances the RNG state of every thread `steps` times with
 // GPURAND. The state is read from rng[slot], where slot is the global x
 // index plus blockIdx.y*d_hV, and written back to the same slot afterwards.
 __global__ void testMinStdKernel(__restrict RNGT *rng, int steps){
   const int slot = (threadIdx.x + blockDim.x*blockIdx.x) + d_hV*blockIdx.y;

   RNGT seed = rng[slot];

   for(int done = 0; done != steps && done < steps; ++done){
     GPURAND(seed);
   }

   rng[slot] = seed;
 }

 // Host-side consistency test for the Parisi-Rapuano generator.
 // Runs testParisiRapuanoKernelMod on the device and PR32EvolveMod on the host
 // for the same number of steps starting from the same array, then compares
 // the two results element by element; every mismatch is printed and the
 // program waits for a key press.
 //
 //   sys : simulation descriptor; blockSize, V and sysN fix the launch
 //         configuration (one thread per half-volume site and per system)
 //
 // Returns 0 when the comparison loop has run (mismatches are only printed),
 // 1 on allocation failure or on any CUDA error.
 extern int testParisiRapuano(sysEA3D_t *sys){
   dim3 block(sys->blockSize,1,1);
   dim3 grid(sys->V/2/block.x, sys->sysN, 1);
   int threads = block.x*block.y*block.z*grid.x*grid.y*grid.z;
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_threads, &threads, sizeof(int)) );
   int iraLen = threads*PRWHEEL;
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_iraLen, &iraLen, sizeof(int)) );

   int steps = 2<<20;  // 2^21 generator steps

   PR_RNGArray_t *ira = PR_RNGArrayInit(iraLen, 0);
   if(ira == NULL){
     fprintf(stderr, "\nNo Alloc ira in %s at line %d\n", __FILE__, __LINE__);
     return 1;
   }

   // host copy of the CPU result, compared against the device result below
   RNGT *iraHost = (RNGT *)malloc(iraLen*sizeof(RNGT));
   if(iraHost == NULL){
     // NOTE(review): ira is not released here (nor on the other return paths)
     // because no release function for PR_RNGArray_t is visible in this
     // section - add the call if one exists.
     fprintf(stderr, "\nNo Alloc iraHost in %s at line %d\n", __FILE__, __LINE__);
     return 1;
   }

   //int ip = 61*threads, ip1 = (61-24)*threads, ip2 = (61-55)*threads, ip3 = 0;
   int ip00 = 61, ip01 = (61-24), ip02 = (61-55), ip03 = 0;

   /*printf("GPU start\n");
   testParisiRapuanoKernel<<<grid, block>>>(ira->d_rngSeeds, steps, ip, ip1, ip2, ip3);
   MYCUDA_ERROR( cudaDeviceSynchronize() );*/

   printf("GPU start\n");
   testParisiRapuanoKernelMod<<<grid, block>>>(ira->d_rngSeeds, steps, ip00, ip01, ip02, ip03);

   // Launch-configuration errors only show up through cudaGetLastError();
   // execution errors show up at the synchronisation. Both are handled here
   // (instead of through MYCUDA_ERROR) so that iraHost can be freed first.
   cudaError_t error_id = cudaGetLastError();
   if(error_id == cudaSuccess){
     error_id = cudaDeviceSynchronize();
   }
   if(error_id != cudaSuccess){
     fprintf(stderr, "\nError %s at line %d of %s\n",
             cudaGetErrorString(error_id), __LINE__, __FILE__);
     free(iraHost);
     return 1;
   }

   printf("CPU start\n");
   PR32EvolveMod(ira, steps, ip00, ip01, ip02, ip03);

   // save the CPU result before the device copy overwrites h_rngSeeds
   for(int i=0; i<iraLen; i++) iraHost[i] = ira->h_rngSeeds[i];
   D2HPR_RNGArray(ira);
   for(int i=0; i<iraLen; i++){
     if(iraHost[i] != ira->h_rngSeeds[i]){
       printf("%d %d %d\n", iraHost[i], ira->h_rngSeeds[i], i);
       getchar();
     }
   }

   free(iraHost);

   return 0;
 }



 extern int SimulationFullRecoverTestCongruential(sysEA3D_t *sys){
   
   dim3 block(sys->blockSize,1,1);
   dim3 grid(sys->V/sys->k/block.x, sys->sysN, 1);

   int threads = block.x*block.y*block.z*grid.x*grid.y*grid.z;
   int rngOffset = grid.x*block.x;
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_threads, &threads, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_rngOffset, &rngOffset, sizeof(int)) );

   printf("block.x: %d, grid.x: %d, threads: %d, spins: %d\n",
          block.x, grid.x, threads, sys->hV/(block.x*grid.x));

   CRNGArray_t *rng = CRNGArrayInit(threads);

   if(rng == NULL){
     fprintf(stderr, "\nNo Alloc rng Simulation\n");
     return 1;
   }

   if(sys->dumpF == 1 && sys->recoverF != 1 && sys->recoverF != 2){
     // RNG Sequence transform
     if(sys->kernelChoice == 1 || sys->kernelChoice == 2 || sys->kernelChoice == 5 && nprocs == 1){
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->realI_R, sys->hV);
       }
     }

     if(DumpCRNGArray(rng, 0, nprocs, mpiid)){
       fprintf(stderr, "\nError DumpCRNGArray Simulation\n");
       return 1;
     }
   }

   if(sys->recoverF == 1){
     if(ReadCRNGArray(rng, 0)){
       fprintf(stderr, "\nError ReadCRNGArray Simulation\n");
       return 1;
     }

     // RNG Sequence transform
     if(sys->kernelChoice == 1 || sys->kernelChoice == 2 || sys->kernelChoice == 5 && nprocs == 1){
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_R, sys->hV);
       }
     }

     H2DCRNGArray(rng);

     testMinStdKernel<<<grid, block>>>(rng->d_rngSeeds, 8*sys->startValue);
     MYCUDA_ERROR( cudaDeviceSynchronize() );

     D2HCRNGArray(rng);

     if(sys->kernelChoice == 1 || sys->kernelChoice == 2 || sys->kernelChoice == 5 && nprocs == 1){
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->realI_R, sys->hV);
       }
     }

     if(DumpCRNGArray(rng, (unsigned)-1, nprocs, mpiid)){
       fprintf(stderr, "\nError DumpCRNGArray Simulation\n");
       return 1;
     }
   }

   if(sys->recoverF == 2){
     if(ReadCRNGArrayMpiRecover(rng, 0, sys->sysN, sys->recoverMPIprocs)){
       fprintf(stderr, "\nError ReadCRNGArray Simulation\n");
       return 1;
     }

     H2DCRNGArray(rng);

     testMinStdKernel<<<grid, block>>>(rng->d_rngSeeds, 8*sys->startValue);
     MYCUDA_ERROR( cudaDeviceSynchronize() );

     D2HCRNGArray(rng);

     if(DumpCRNGArrayMpiRecover(rng, (unsigned)-1, sys->sysN, sys->recoverMPIprocs)){
       fprintf(stderr, "\nError DumpCRNGArray Simulation\n");
       return 1;
     }

     if(DumpCRNGArray(rng, (unsigned)-1, nprocs, mpiid)){
       fprintf(stderr, "\nError DumpCRNGArray Simulation\n");
       return 1;
     }
   }

   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Bitwise, cudaFuncCachePreferL1) );
   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Bitwise, cudaFuncCachePreferL1) );

   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Standard, cudaFuncCachePreferL1) );
   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Standard, cudaFuncCachePreferL1) );

   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_StandardGrid, cudaFuncCachePreferL1) );
   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_StandardGrid, cudaFuncCachePreferL1) );

   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Sliced, cudaFuncCachePreferL1) );
   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Sliced, cudaFuncCachePreferL1) );

   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_SlicedGrid, cudaFuncCachePreferL1) );
   MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_SlicedGrid, cudaFuncCachePreferL1) );

   int dumpIndex = 0;
   int maxStep = sys->dumpTimes[sys->dumpN - 1];

   dim3 blockG(sys->L, sys->k, 1);
   dim3 gridG(sys->A/(blockG.x*blockG.y), sys->L/2, sys->sysN);

   //int sys->hA = A/2;
   dim3 blockS(sys->L/2, sys->k, 1);
   dim3 gridS(sys->hA/(blockS.x*blockS.y), sys->L, sys->sysN);

   int threadsG = blockG.x*blockG.y*blockG.z*gridG.x*gridG.y*gridG.z;
   int threadsS = blockS.x*blockS.y*blockS.z*gridS.x*gridS.y*gridS.z;

   printf("%d %d %d\n", threadsG, threadsS, gridS.x);

   int h_block_xStride = blockG.x*blockG.y;
   int h_block_yStride = gridG.x*h_block_xStride;
   int h_block_xStrideS = blockS.x*blockS.y;
   int h_block_yStrideS = gridS.x*h_block_xStrideS;

   int h_loopStride = block.x*grid.x;

   MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_xStride, &h_block_xStride, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_yStride, &h_block_yStride, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_xStrideS, &h_block_xStrideS, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_yStrideS, &h_block_yStrideS, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStride, &h_loopStride, sizeof(int)) );

   // MULTI-GPU

   dim3 blockBlk(sys->blockSize,1,1);
   dim3 gridBlk((sys->hV - sys->A)/blockBlk.x, sys->sysN, 1);

   dim3 blockBnd(sys->blockSize,1,1);
   dim3 gridBnd(sys->A/blockBnd.x, sys->sysN, 1);

   printf("gridBnd.x: %d, gridBlk.x: %d\n", gridBnd.x, gridBlk.x);

   //int h_loopStrideBlk = blockBlk.x*gridBlk.x;
   //int h_loopStrideBnd = blockBnd.x*gridBnd.x;
   // this is needed for bitwise comparison
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStrideBlk, &h_loopStride, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStrideBnd, &h_loopStride, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_rngStrideBlk, &h_loopStride, sizeof(int)) );
   MYCUDA_ERROR( cudaMemcpyToSymbol(d_rngStrideBnd, &h_loopStride, sizeof(int)) );

   // STREAMS INIT

   cudaStream_t *stream;
   int nstreams=2;

   stream = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
   if(stream == NULL){
     fprintf(stderr, "\nNo Alloc *stream in %s at line %d\n", __FILE__, __LINE__);
     return 1;
   }

   MYCUDA_ERROR( cudaStreamCreate(&(stream[0])));
   MYCUDA_ERROR( cudaStreamCreate(&(stream[1])));

   // BEGIN TIME STEPS

   for(int i=sys->startValue + 1; i<=maxStep; i++){

     if(sys->kernelChoice == 0){
       MCStepReds_Bitwise<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                             sys->d_reds2, sys->d_reds3,
                                             rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       MCStepBlues_Bitwise<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                              sys->d_blues2, sys->d_blues3,
                                              rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );
     }

     if(sys->kernelChoice == 1){
       MCStepReds_Sliced<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                            sys->d_reds2, sys->d_reds3,
                                            rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       // RNG sequence transformation
       D2HCRNGArray(rng);
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_R2B, sys->hV);
       }
       H2DCRNGArray(rng);

       MCStepBlues_Sliced<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                             sys->d_blues2, sys->d_blues3,
                                             rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       // RNG sequence transformation
       D2HCRNGArray(rng);
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_B2R, sys->hV);
       }
       H2DCRNGArray(rng);
     }

     if(sys->kernelChoice == 2){
       MCStepReds_SlicedGrid<<< gridG, blockG >>>(sys->d_reds0, sys->d_reds1,
                                                  sys->d_reds2, sys->d_reds3,
                                                  rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       // RNG sequence transformation
       D2HCRNGArray(rng);
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_R2B, sys->hV);
       }
       H2DCRNGArray(rng);

       MCStepBlues_SlicedGrid<<< gridG, blockG >>>(sys->d_blues0, sys->d_blues1,
                                                   sys->d_blues2, sys->d_blues3,
                                                   rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       // RNG sequence transformation
       D2HCRNGArray(rng);
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_B2R, sys->hV);
       }
       H2DCRNGArray(rng);
     }

     if(sys->kernelChoice == 3){
       MCStepReds_Standard<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                              sys->d_reds2, sys->d_reds3,
                                              rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       MCStepBlues_Standard<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                               sys->d_blues2, sys->d_blues3,
                                               rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );
     }

     if(sys->kernelChoice == 4){
       MCStepReds_StandardGrid<<< gridS, blockS >>>(sys->d_reds0, sys->d_reds1,
                                                    sys->d_reds2, sys->d_reds3,
                                                    rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );

       MCStepBlues_StandardGrid<<< gridS, blockS >>>(sys->d_blues0, sys->d_blues1,
                                                     sys->d_blues2, sys->d_blues3,
                                                     rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );
     }

     if(sys->kernelChoice == 5 && nprocs == 1){
       MCStepReds_SlicedBnd<<< gridBnd, blockBnd, 0, stream[0] >>>(sys->d_reds0, sys->d_reds1,
                                                                   sys->d_reds2, sys->d_reds3,
                                                                   rng->d_rngSeeds);
       //MYCUDA_ERROR( cudaDeviceSynchronize() );

       MCStepReds_SlicedBlk<<< gridBlk, blockBlk, 0, stream[1] >>>(sys->d_reds0, sys->d_reds1,
                                                                   sys->d_reds2, sys->d_reds3,
                                                                   rng->d_rngSeeds);

       //MYCUDA_ERROR( cudaDeviceSynchronize() );

       //slicedRedBoundaryCopy(sys);
       slicedBoundary_RedSendReceive(sys, stream);
       // RNG sequence transformation
       MYCUDA_ERROR( cudaStreamSynchronize(stream[1]) );

       D2HCRNGArray(rng);
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_R2B, sys->hV);
       }
       H2DCRNGArray(rng);

       /*MCStepBlues_Sliced<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                             sys->d_blues2, sys->d_blues3,
                                             rng->d_rngSeeds);
                                             MYCUDA_ERROR( cudaDeviceSynchronize() );*/


       MCStepBlues_SlicedBnd<<< gridBnd, blockBnd, 0, stream[0] >>>(sys->d_blues0, sys->d_blues1,
                                                                   sys->d_blues2, sys->d_blues3,
                                                                   rng->d_rngSeeds);
       // MYCUDA_ERROR( cudaDeviceSynchronize() );

       MCStepBlues_SlicedBlk<<< gridBlk, blockBlk, 0, stream[1] >>>(sys->d_blues0, sys->d_blues1,
                                                                    sys->d_blues2, sys->d_blues3,
                                                                    rng->d_rngSeeds);

       //MYCUDA_ERROR( cudaDeviceSynchronize() );

       //       slicedBlueBoundaryCopy(sys);
       slicedBoundary_BlueSendReceive(sys, stream);
       // RNG sequence transformation
       MYCUDA_ERROR( cudaStreamSynchronize(stream[1]) );
       D2HCRNGArray(rng);
       for(int i=0; i<sys->sysN; i++){
         int offset = i*sys->hV;
         arrayTransformRNGT(rng->h_rngSeeds + offset, sys->myI_B2R, sys->hV);
       }
       H2DCRNGArray(rng);
     }

     if(sys->kernelChoice == 5 && nprocs > 1){
       MCStepReds_SlicedBnd_compareCp<<< gridBnd, blockBnd, 0, stream[0] >>>(sys->d_reds0, sys->d_reds1,
									     sys->d_reds2, sys->d_reds3,
									     rng->d_rngSeeds,
									     sys->d_reds0BS, sys->d_reds1BS,
									     sys->d_reds2BS, sys->d_reds3BS);

       MCStepReds_SlicedBlk_compare<<< gridBlk, blockBlk, 0, stream[1] >>>(sys->d_reds0, sys->d_reds1,
                                                                   sys->d_reds2, sys->d_reds3,
                                                                   rng->d_rngSeeds);

       slicedBoundary_RedSendReceiveNew(sys, stream);

       MYCUDA_LAST("BlkBnd_red");
       MYCUDA_ERROR( cudaStreamSynchronize(stream[1]) );


       
       MCStepBlues_SlicedBnd_compareCp<<< gridBnd, blockBnd, 0, stream[0] >>>(sys->d_blues0, sys->d_blues1,
									      sys->d_blues2, sys->d_blues3,
									      rng->d_rngSeeds,
									      sys->d_blues0BS, sys->d_blues1BS,
									      sys->d_blues2BS, sys->d_blues3BS);

       MCStepBlues_SlicedBlk_compare<<< gridBlk, blockBlk, 0, stream[1] >>>(sys->d_blues0, sys->d_blues1,
                                                                    sys->d_blues2, sys->d_blues3,
                                                                    rng->d_rngSeeds);
       MYCUDA_LAST("BlkBnd_blue");
       slicedBoundary_BlueSendReceiveNew(sys, stream);

       MYCUDA_ERROR( cudaStreamSynchronize(stream[1]) );
     }

     if(sys->kernelChoice == 6){
       MCStepReds_Sliced<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                            sys->d_reds2, sys->d_reds3,
                                            rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );


       MCStepBlues_Sliced<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
					     sys->d_blues2, sys->d_blues3,
                                             rng->d_rngSeeds);
       MYCUDA_ERROR( cudaDeviceSynchronize() );
     }

    if(sys->dumpTimes[dumpIndex] == i){
      if(D2HSpins(sys)){
        fprintf(stderr,"\nError in D2HSpins time %d in Simulation\n", i);
        return 1;
      }

      if(sys->kernelChoice == 1 || sys->kernelChoice == 2 || sys->kernelChoice == 5 && nprocs == 1){
        SpinsTransform(sys);
      }

      if(sys->dumpF){
        if(SpinsDump(sys, i)){
          fprintf(stderr,"\nError in SpinsDump time %d Simulation\n", i);
          return 1;
        }

	if(sys->recoverF == 2){
	  if(SpinsDumpDirMpi(sys, sys->startValue, i, sys->recoverMPIprocs)){
	    fprintf(stderr,"\nError in SpinsDumpDirMpi\n");
	    return 1;
	  }
	}
      }

      dumpIndex++;
    }
  }



  D2HCRNGArray(rng);
  if(sys->kernelChoice == 1 || sys->kernelChoice == 2 || sys->kernelChoice == 5 && nprocs == 1){
    for(int i=0; i<sys->sysN; i++){
      int offset = i*sys->hV;
      arrayTransformRNGT(rng->h_rngSeeds + offset, sys->realI_R, sys->hV);
    }
  }

  if(sys->recoverF != 1){
    if(DumpCRNGArray(rng, maxStep, nprocs, mpiid)){
      fprintf(stderr, "\nError DumpCRNGArray Simulation\n");
      return 1;
    }
  }

  if(sys->recoverF == 1){
    if(DumpCRNGArray(rng, maxStep + 1, nprocs, mpiid)){
      fprintf(stderr, "\nError DumpCRNGArray Simulation\n");
      return 1;
    }
  }

  if(sys->recoverF == 2){
    if(DumpCRNGArrayMpiRecover(rng, maxStep + 1, sys->sysN, sys->recoverMPIprocs)){
      fprintf(stderr, "\nError DumpCRNGArrayMpiRecover Simulation\n");
      return 1;
    }
  }

  return 0;
}

// Runs `call`; when it reports failure (non-zero) prints "\nError in <name>\n"
// on stderr and returns 1 from the enclosing function.
#define BENCHMARK_TRY(call, name)\
  {if(call){\
  fprintf(stderr, "\nError in " name "\n");\
  return 1;\
  }\
  }

// Launches `kernel` `steps` times between two CUDA events, synchronising the
// device after every launch, then adds the per-step elapsed time and its
// square to `sum` and `sum2`. Launch errors are queried through MYCUDA_LAST
// once the timing is over, so the check stays outside the measured interval.
// Any CUDA failure returns 1 from the enclosing function.
#define BENCHMARK_TIME_KERNEL(kernel, grid, block, s0, s1, s2, s3, seeds, steps, startEv, stopEv, sum, sum2)\
  {float elapsedMs_ = 0.f;\
  MYCUDA_ERROR( cudaEventRecord(startEv, 0) );\
  for(int step_ = 0; step_ < (steps); step_++){\
    kernel<<< grid, block >>>(s0, s1, s2, s3, seeds);\
    MYCUDA_ERROR( cudaDeviceSynchronize() );\
  }\
  MYCUDA_ERROR( cudaEventRecord(stopEv, 0) );\
  MYCUDA_ERROR( cudaEventSynchronize(stopEv) );\
  MYCUDA_LAST(#kernel);\
  MYCUDA_ERROR( cudaEventElapsedTime(&elapsedMs_, startEv, stopEv) );\
  elapsedMs_ /= (steps);\
  (sum) += elapsedMs_;\
  (sum2) += elapsedMs_*elapsedMs_;\
  }

// Standard error of the mean of `samples` measurements, given their mean and
// the mean of their squares. Returns 0 when fewer than two samples exist and
// clamps a variance that rounding made slightly negative.
static float benchmark_stdError(float mean, float mean2, int samples){
  if(samples < 2) return 0.f;
  float variance = mean2 - mean*mean;
  if(variance < 0.f) variance = 0.f;
  return (float)sqrt(variance/(samples - 1.));
}

// Appends one result line to the file "GPU<cardN>_<kernelTag>": the integer
// `columns` followed by red mean, red error, blue mean and blue error.
// The sums are per-step times in milliseconds (cudaEventElapsedTime units)
// accumulated over `samples` samples; values are written multiplied by 1e-3.
// Returns 0 on success, 1 if the file cannot be opened or closed.
static int benchmark_report(int cardN, const char *kernelTag,
                            const int *columns, int nColumns, int samples,
                            float redSum, float redSum2,
                            float blueSum, float blueSum2){
  const float redMean = redSum/samples, redMean2 = redSum2/samples;
  const float blueMean = blueSum/samples, blueMean2 = blueSum2/samples;
  const float errorRed = benchmark_stdError(redMean, redMean2, samples);
  const float errorBlue = benchmark_stdError(blueMean, blueMean2, samples);

  char fileName[1024];
  snprintf(fileName, sizeof(fileName), "GPU%d_%s", cardN, kernelTag);
  FILE *outputFile = fopen(fileName, "a+");
  if(outputFile == NULL){
    fprintf(stderr, "\nError opening %s in %s at line %d\n", fileName, __FILE__, __LINE__);
    return 1;
  }
  for(int c = 0; c < nColumns; c++) fprintf(outputFile, "%d ", columns[c]);
  fprintf(outputFile, "%21.15e %21.15e %21.15e %21.15e\n",
          redMean*1.0e-3, errorRed*1.0e-3, blueMean*1.0e-3, errorBlue*1.0e-3);
  fflush(outputFile);
  if(fclose(outputFile) != 0){
    fprintf(stderr, "\nError closing %s in %s at line %d\n", fileName, __FILE__, __LINE__);
    return 1;
  }
  return 0;
}

// Single-GPU timing benchmark of the red/blue Monte Carlo step kernels.
//
// Command line: <L> <gpu> <sysN> <block> <samples> <steps>
//   L       first lattice size (0 -> 8); sizes L, L+2, ... up to 256 are scanned
//   gpu     CUDA device index
//   sysN    first number of systems (0 -> 1), doubled until the memory estimate
//           MaxSysN is exceeded
//   block   first block size of the no-grid kernels (0 -> derived from V/2 and
//           the per-block thread limit); used only for the first (L, sysN) pair
//   samples number of timing samples (0 -> 100)
//   steps   kernel launches per sample (0 -> 50)
//
// For every configuration one line is appended to GPU<gpu>_bitwiseKernel,
// GPU<gpu>_standardKernel, GPU<gpu>_slicedKernel, GPU<gpu>_standardGridKernel
// and GPU<gpu>_slicedGridKernel in the current directory.
//
// Returns 0 on success and 1 on any error. Error paths return immediately
// without releasing what was allocated so far.
extern int benchmark(int argc, char **argv){
  if(argc != 7){
    fprintf(stderr, "usage: %s <L> <gpu> <sysN> <block> <samples> <steps>\n\n", argv[0]);
    fprintf(stderr, "Systems' GPUs are:\n");
    int numberOfGPUs = 0;
    MYCUDA_ERROR( cudaGetDeviceCount(&numberOfGPUs) );
    for(int gpu = 0; gpu < numberOfGPUs; gpu++){
      cudaDeviceProp prop;
      MYCUDA_ERROR( cudaGetDeviceProperties(&prop, gpu) );
      printf("GPU %d: %s\n", gpu, prop.name);
    }
    return 1;
  }

  // Parse into initialised locals and reject anything that is not an integer.
  int initL = 0, cardN = 0, initSysN = 0, initBlock = 0, initSamples = 0, initSteps = 0;
  if(sscanf(argv[1], "%d", &initL) != 1 ||
     sscanf(argv[2], "%d", &cardN) != 1 ||
     sscanf(argv[3], "%d", &initSysN) != 1 ||
     sscanf(argv[4], "%d", &initBlock) != 1 ||
     sscanf(argv[5], "%d", &initSamples) != 1 ||
     sscanf(argv[6], "%d", &initSteps) != 1){
    fprintf(stderr, "\nError: all the arguments must be integers\n");
    return 1;
  }

  if(initL == 0) initL = 8;
  if(initSamples == 0) initSamples = 100;
  if(initSteps == 0) initSteps = 50;

  // L < 2 gives hV == 0 and a division by zero in the MaxSysN estimate;
  // a negative sysN would never leave the sysN loop in a meaningful way.
  if(initL < 2 || initSysN < 0 || initBlock < 0 || initSamples < 1 || initSteps < 1){
    fprintf(stderr, "\nError: need L >= 2, sysN >= 0, block >= 0, samples >= 1 and steps >= 1\n");
    return 1;
  }

  // calloc so that every field of the descriptor starts from a defined value.
  sysEA3D_t *sys = (sysEA3D_t *)calloc(1, sizeof(sysEA3D_t));
  if(sys == NULL){
    fprintf(stderr, "\nNo Alloc *sys in %s at line %d\n", __FILE__, __LINE__);
    return 1;
  }
  sys->cardN = cardN;
  sys->beta = 1.101;

  int numberOfGPUs = 0;
  MYCUDA_ERROR( cudaGetDeviceCount(&numberOfGPUs) );
  if(sys->cardN < 0){
    fprintf(stderr, "gpu %d < 0\n", sys->cardN);
    free(sys);
    return 1;
  }
  if(sys->cardN >= numberOfGPUs){
    fprintf(stderr, "gpu %d >= gpuN %d\n", sys->cardN, numberOfGPUs);
    free(sys);
    return 1;
  }

  MYCUDA_ERROR( cudaSetDevice(sys->cardN) );
  cudaDeviceProp prop;
  MYCUDA_ERROR( cudaGetDeviceProperties(&prop, sys->cardN) );
  printf("GPU %d: %s Compute Capability: %d.%d\n", sys->cardN, prop.name, prop.major, prop.minor);

  cudaEvent_t startEvent, stopEvent;
  MYCUDA_ERROR( cudaEventCreate(&startEvent) );
  MYCUDA_ERROR( cudaEventCreate(&stopEvent) );

  // Keep the historical limits for compute capability 2.x and 3.x; any other
  // device uses the limit it reports instead of 0, which disabled every launch.
  int maxThreadsPerBlock = prop.maxThreadsPerBlock;
  if(prop.major == 2) maxThreadsPerBlock = 512;
  if(prop.major == 3) maxThreadsPerBlock = 1024;

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Bitwise, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Bitwise, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Standard, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Standard, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Sliced, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Sliced, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_StandardGrid, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_StandardGrid, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_SlicedGrid, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_SlicedGrid, cudaFuncCachePreferL1) );

  for(int L = initL; L <= 256; L += 2){

    int hV = L*L*L/2;
    size_t NJ_Size = 6*hV*sizeof(MSC), NS_Size = 4*2*hV*sizeof(MSC), RNG_Size = hV*sizeof(RNGT);
    size_t devFree = 0, devTotal = 0;
    MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
    // 0.8 is the fraction of the free device memory the benchmark may use.
    int MaxSysN = (int)(0.8*(devFree/(NJ_Size + NS_Size + RNG_Size)));
    printf("\n\nL = %d, Free memory on device: %lu, MaxSysN: %d\n",
           L, (unsigned long)devFree, MaxSysN);

    if(initSysN == 0) initSysN = 1;

    for(int sysN = initSysN; sysN <= MaxSysN; sysN *= 2){
      sys->sysN = sysN;
      sys->L = L;
      sys->V = sys->L*sys->L*sys->L/nprocs;
      sys->A = sys->L*sys->L;

      sys->hV = sys->V/2;
      sys->hA = sys->A/2;
      sys->hL = sys->L/2;

      sys->NJBnd = sys->sysN*sys->A;
      sys->NSBnd = sys->sysN*sys->A;

      sys->NJBlk = sys->sysN*sys->hV;
      sys->NSBlk = sys->sysN*sys->hV;

      BENCHMARK_TRY( masksInit(sys), "masksInit" );
      BENCHMARK_TRY( MCWeightsInit(sys), "MCWeightsInit" );
      BENCHMARK_TRY( dMemInit(sys), "dMemInit" );
      BENCHMARK_TRY( hMemInit2(sys), "hMemInit" );
      BENCHMARK_TRY( H2DSysCopy(sys), "H2DSysCopy" );

      MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
      printf("\n\n---\tL = %d, sysN = %d, free memory after allocation: %lu\t---\n\n",
             L, sysN, (unsigned long)devFree);

      ////////////////////////////////////////
      // NO-GRID KERNELS
      ///////////////////////////////////////

      int blockInit = 0;
      if(initBlock == 0){
        // Halve V/2 until it fits in one block.
        blockInit = sys->V/2;
        while(blockInit > maxThreadsPerBlock) blockInit /= 2;
      }else blockInit = initBlock;

      // Scan the block sizes from blockInit downwards through the divisors of
      // V/2, stopping at the first one below 32.
      for(sys->blockSize = blockInit; sys->blockSize >= 32; ){
        const int kMax = sys->V/sys->blockSize/2;

        // k (spins per thread) runs over the divisors of kMax.
        for(int k = 1; k <= kMax; k++){
          if(kMax%k != 0) continue;

          dim3 block(sys->blockSize, 1, 1);
          dim3 grid(sys->V/block.x/(2*k), sys->sysN, 1);
          int h_loopStride = block.x*grid.x;
          MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStride, &h_loopStride, sizeof(int)) );

          printf("Measuring Times No-Grid L:%d block.x:%d grid.x:%d spins per thread:%d\n",
                 L, block.x, grid.x, k);

          int threads = block.x*block.y*block.z*grid.x*grid.y*grid.z;
          MYCUDA_ERROR( cudaMemcpyToSymbol(d_threads, &threads, sizeof(int)) );

          CRNGArray_t *rng = CRNGArrayInit(threads);
          if(rng == NULL){
            fprintf(stderr, "\nError in CRNGArrayInit\n");
            return 1;
          }

          const int columns[5] = {L, sysN, k, (int)block.x, (int)grid.x};
          float redSum = 0.f, redSum2 = 0.f, blueSum = 0.f, blueSum2 = 0.f;

          // KERNEL: Bitwise; run only when L equals 1 << findLog2(L)
          if( ((1U<<findLog2(sys->L))^sys->L) == 0 ){
            for(int sample = 0; sample < initSamples; sample++){
              BENCHMARK_TIME_KERNEL(MCStepReds_Bitwise, grid, block,
                                    sys->d_reds0, sys->d_reds1, sys->d_reds2, sys->d_reds3,
                                    rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                    redSum, redSum2);
              BENCHMARK_TIME_KERNEL(MCStepBlues_Bitwise, grid, block,
                                    sys->d_blues0, sys->d_blues1, sys->d_blues2, sys->d_blues3,
                                    rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                    blueSum, blueSum2);
            }
            if(benchmark_report(sys->cardN, "bitwiseKernel", columns, 5, initSamples,
                                redSum, redSum2, blueSum, blueSum2)) return 1;
          }

          // KERNEL: Standard
          redSum = redSum2 = blueSum = blueSum2 = 0.f;
          for(int sample = 0; sample < initSamples; sample++){
            BENCHMARK_TIME_KERNEL(MCStepReds_Standard, grid, block,
                                  sys->d_reds0, sys->d_reds1, sys->d_reds2, sys->d_reds3,
                                  rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                  redSum, redSum2);
            BENCHMARK_TIME_KERNEL(MCStepBlues_Standard, grid, block,
                                  sys->d_blues0, sys->d_blues1, sys->d_blues2, sys->d_blues3,
                                  rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                  blueSum, blueSum2);
          }
          if(benchmark_report(sys->cardN, "standardKernel", columns, 5, initSamples,
                              redSum, redSum2, blueSum, blueSum2)) return 1;

          // KERNEL: Sliced
          redSum = redSum2 = blueSum = blueSum2 = 0.f;
          for(int sample = 0; sample < initSamples; sample++){
            BENCHMARK_TIME_KERNEL(MCStepReds_Sliced, grid, block,
                                  sys->d_reds0, sys->d_reds1, sys->d_reds2, sys->d_reds3,
                                  rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                  redSum, redSum2);
            BENCHMARK_TIME_KERNEL(MCStepBlues_Sliced, grid, block,
                                  sys->d_blues0, sys->d_blues1, sys->d_blues2, sys->d_blues3,
                                  rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                  blueSum, blueSum2);
          }
          if(benchmark_report(sys->cardN, "slicedKernel", columns, 5, initSamples,
                              redSum, redSum2, blueSum, blueSum2)) return 1;

          // NOTE(review): CRNGArrayReset presumably releases what CRNGArrayInit
          // set up inside the descriptor - confirm against its definition.
          CRNGArrayReset(rng);
          free(rng);
        }

        // Next smaller divisor of V/2 (blockSize >= 32 here, so this stops at 1 at the latest).
        do{sys->blockSize--;}while((sys->V/2)%sys->blockSize != 0);
      }

      // The user-supplied block size applies to the first (L, sysN) pair only.
      initBlock = 0;

      ////////////////////////////////////////
      // GRID KERNELS
      ///////////////////////////////////////

      const int A = sys->L*sys->L;
      const int hA = A/2;

      int count3 = 0, test3 = 1;

      // sys->k (lines per block) runs over the divisors of L until the sliced
      // block L*k no longer fits in maxThreadsPerBlock or k exceeds L.
      for(sys->k = 1; test3; ){

        if(count3 > 0) do{sys->k++;}while(sys->L%sys->k != 0 && sys->k <= sys->L);
        count3++;

        dim3 blockS(sys->L/2, sys->k, 1);
        dim3 gridS(hA/(blockS.x*blockS.y), sys->L, sys->sysN);
        dim3 blockG(sys->L, sys->k, 1);
        dim3 gridG(A/(blockG.x*blockG.y), sys->L/2, sys->sysN);

        const int threads = blockS.x*blockS.y*blockS.z*gridS.x*gridS.y*gridS.z;
        CRNGArray_t *rng = NULL;

        if(threads > 0){
          rng = CRNGArrayInit(threads);
          if(rng == NULL){
            fprintf(stderr, "\nError in CRNGArrayInit\n");
            return 1;
          }
        }

        const int threadsPerBlockS = sys->k*sys->L/2;
        const int threadsPerBlockG = sys->k*sys->L;
        const int columnsS[6] = {L, sysN, sys->k, (int)blockS.x, (int)blockS.y, (int)gridS.x};
        const int columnsG[6] = {L, sysN, sys->k, (int)blockG.x, (int)blockG.y, (int)gridG.x};

        if(threadsPerBlockG <= maxThreadsPerBlock && sys->k <= sys->L && threadsPerBlockG >= 32){
          printf("Measuring Times Grid    L:%d lines:%d\n",
                 L, sys->k);
        }

        if(rng != NULL && threadsPerBlockS <= maxThreadsPerBlock &&
           sys->k <= sys->L && threadsPerBlockS >= 32){
          // KERNEL: Standard
          float redSum = 0.f, redSum2 = 0.f, blueSum = 0.f, blueSum2 = 0.f;

          int h_block_xStrideS = blockS.x*blockS.y;
          int h_block_yStrideS = gridS.x*h_block_xStrideS;

          MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_xStrideS, &h_block_xStrideS, sizeof(int)) );
          MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_yStrideS, &h_block_yStrideS, sizeof(int)) );

          for(int sample = 0; sample < initSamples; sample++){
            BENCHMARK_TIME_KERNEL(MCStepReds_StandardGrid, gridS, blockS,
                                  sys->d_reds0, sys->d_reds1, sys->d_reds2, sys->d_reds3,
                                  rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                  redSum, redSum2);
            BENCHMARK_TIME_KERNEL(MCStepBlues_StandardGrid, gridS, blockS,
                                  sys->d_blues0, sys->d_blues1, sys->d_blues2, sys->d_blues3,
                                  rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                  blueSum, blueSum2);
          }
          if(benchmark_report(sys->cardN, "standardGridKernel", columnsS, 6, initSamples,
                              redSum, redSum2, blueSum, blueSum2)) return 1;
        }

        if(threadsPerBlockG <= maxThreadsPerBlock && sys->k <= sys->L){
          if(rng != NULL && threadsPerBlockG >= 32){
            // KERNEL: Sliced
            float redSum = 0.f, redSum2 = 0.f, blueSum = 0.f, blueSum2 = 0.f;

            int h_block_xStride = blockG.x*blockG.y;
            int h_block_yStride = gridG.x*h_block_xStride;

            MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_xStride, &h_block_xStride, sizeof(int)) );
            MYCUDA_ERROR( cudaMemcpyToSymbol(d_block_yStride, &h_block_yStride, sizeof(int)) );

            for(int sample = 0; sample < initSamples; sample++){
              BENCHMARK_TIME_KERNEL(MCStepReds_SlicedGrid, gridG, blockG,
                                    sys->d_reds0, sys->d_reds1, sys->d_reds2, sys->d_reds3,
                                    rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                    redSum, redSum2);
              BENCHMARK_TIME_KERNEL(MCStepBlues_SlicedGrid, gridG, blockG,
                                    sys->d_blues0, sys->d_blues1, sys->d_blues2, sys->d_blues3,
                                    rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                    blueSum, blueSum2);
            }
            if(benchmark_report(sys->cardN, "slicedGridKernel", columnsG, 6, initSamples,
                                redSum, redSum2, blueSum, blueSum2)) return 1;
          }
        } else test3 = 0;

        if(rng != NULL){
          CRNGArrayReset(rng);
          free(rng);
        }
      }

      // CLEANING UP
      //CRNGArrayFree(&rng);
      BENCHMARK_TRY( dMemFree(sys), "dMemFree" );
      BENCHMARK_TRY( hMemFree(sys), "hMemFree" );
    }

    // The user-supplied sysN applies to the first L only; later sizes restart from 1.
    initSysN = 0;
  }
  // allocation
  // deallocation
  // linearisation in the number of systems, limited by the memory

  MYCUDA_ERROR( cudaEventDestroy(startEvent) );
  MYCUDA_ERROR( cudaEventDestroy(stopEvent) );
  free(sys);
  return 0;
}

// Runs `call`; when it reports failure (non-zero) prints "\nError in <name>\n"
// on stderr and returns 1 from the enclosing function.
#define BENCHMARKSLICED_TRY(call, name)\
  {if(call){\
  fprintf(stderr, "\nError in " name "\n");\
  return 1;\
  }\
  }

// Launches `kernel` `steps` times between two CUDA events, synchronising the
// device after every launch, then adds the per-step elapsed time and its
// square to `sum` and `sum2`. Launch errors are queried through MYCUDA_LAST
// once the timing is over, so the check stays outside the measured interval.
// Any CUDA failure returns 1 from the enclosing function.
#define BENCHMARKSLICED_TIME_KERNEL(kernel, grid, block, s0, s1, s2, s3, seeds, steps, startEv, stopEv, sum, sum2)\
  {float elapsedMs_ = 0.f;\
  MYCUDA_ERROR( cudaEventRecord(startEv, 0) );\
  for(int step_ = 0; step_ < (steps); step_++){\
    kernel<<< grid, block >>>(s0, s1, s2, s3, seeds);\
    MYCUDA_ERROR( cudaDeviceSynchronize() );\
  }\
  MYCUDA_ERROR( cudaEventRecord(stopEv, 0) );\
  MYCUDA_ERROR( cudaEventSynchronize(stopEv) );\
  MYCUDA_LAST(#kernel);\
  MYCUDA_ERROR( cudaEventElapsedTime(&elapsedMs_, startEv, stopEv) );\
  elapsedMs_ /= (steps);\
  (sum) += elapsedMs_;\
  (sum2) += elapsedMs_*elapsedMs_;\
  }

// Standard error of the mean of `samples` measurements, given their mean and
// the mean of their squares. Returns 0 when fewer than two samples exist and
// clamps a variance that rounding made slightly negative.
static float benchmarkSliced_stdError(float mean, float mean2, int samples){
  if(samples < 2) return 0.f;
  float variance = mean2 - mean*mean;
  if(variance < 0.f) variance = 0.f;
  return (float)sqrt(variance/(samples - 1.));
}

// Appends one result line to the file "GPU<cardN>_slicedKernel": the integer
// `columns` followed by red mean, red error, blue mean and blue error.
// The sums are per-step times in milliseconds (cudaEventElapsedTime units)
// accumulated over `samples` samples; values are written multiplied by 1e-3.
// Returns 0 on success, 1 if the file cannot be opened or closed.
static int benchmarkSliced_report(int cardN, const int *columns, int nColumns, int samples,
                                  float redSum, float redSum2,
                                  float blueSum, float blueSum2){
  const float redMean = redSum/samples, redMean2 = redSum2/samples;
  const float blueMean = blueSum/samples, blueMean2 = blueSum2/samples;
  const float errorRed = benchmarkSliced_stdError(redMean, redMean2, samples);
  const float errorBlue = benchmarkSliced_stdError(blueMean, blueMean2, samples);

  char fileName[1024];
  snprintf(fileName, sizeof(fileName), "GPU%d_slicedKernel", cardN);
  FILE *outputFile = fopen(fileName, "a+");
  if(outputFile == NULL){
    fprintf(stderr, "\nError opening %s in %s at line %d\n", fileName, __FILE__, __LINE__);
    return 1;
  }
  for(int c = 0; c < nColumns; c++) fprintf(outputFile, "%d ", columns[c]);
  fprintf(outputFile, "%21.15e %21.15e %21.15e %21.15e\n",
          redMean*1.0e-3, errorRed*1.0e-3, blueMean*1.0e-3, errorBlue*1.0e-3);
  fflush(outputFile);
  if(fclose(outputFile) != 0){
    fprintf(stderr, "\nError closing %s in %s at line %d\n", fileName, __FILE__, __LINE__);
    return 1;
  }
  return 0;
}

// Single-GPU timing benchmark restricted to the no-grid sliced kernels
// (MCStepReds_Sliced / MCStepBlues_Sliced), one lattice size, one block size
// and one spin per thread.
//
// Command line: <L> <gpu> <sysN> <block> <samples> <steps>
//   L       lattice size (0 -> 8)
//   gpu     CUDA device index
//   sysN    first number of systems (0 -> 1), doubled until the memory estimate
//           MaxSysN is exceeded
//   block   block size (0 -> derived from V/2 and the per-block thread limit);
//           used only for the first sysN value
//   samples number of timing samples (0 -> 100)
//   steps   kernel launches per sample (0 -> 50)
//
// One line per sysN is appended to GPU<gpu>_slicedKernel in the current
// directory.
//
// Returns 0 on success and 1 on any error. Error paths return immediately
// without releasing what was allocated so far.
extern int benchmarkSliced(int argc, char **argv){
  if(argc != 7){
    fprintf(stderr, "usage: %s <L> <gpu> <sysN> <block> <samples> <steps>\n\n", argv[0]);
    fprintf(stderr, "Systems' GPUs are:\n");
    int numberOfGPUs = 0;
    MYCUDA_ERROR( cudaGetDeviceCount(&numberOfGPUs) );
    for(int gpu = 0; gpu < numberOfGPUs; gpu++){
      cudaDeviceProp prop;
      MYCUDA_ERROR( cudaGetDeviceProperties(&prop, gpu) );
      printf("GPU %d: %s\n", gpu, prop.name);
    }
    return 1;
  }

  // Parse into initialised locals and reject anything that is not an integer.
  int initL = 0, cardN = 0, initSysN = 0, initBlock = 0, initSamples = 0, initSteps = 0;
  if(sscanf(argv[1], "%d", &initL) != 1 ||
     sscanf(argv[2], "%d", &cardN) != 1 ||
     sscanf(argv[3], "%d", &initSysN) != 1 ||
     sscanf(argv[4], "%d", &initBlock) != 1 ||
     sscanf(argv[5], "%d", &initSamples) != 1 ||
     sscanf(argv[6], "%d", &initSteps) != 1){
    fprintf(stderr, "\nError: all the arguments must be integers\n");
    return 1;
  }

  if(initL == 0) initL = 8;
  if(initSamples == 0) initSamples = 100;
  if(initSteps == 0) initSteps = 50;

  // L < 2 gives hV == 0 and a division by zero in the MaxSysN estimate;
  // a negative sysN would never leave the sysN loop in a meaningful way.
  if(initL < 2 || initSysN < 0 || initBlock < 0 || initSamples < 1 || initSteps < 1){
    fprintf(stderr, "\nError: need L >= 2, sysN >= 0, block >= 0, samples >= 1 and steps >= 1\n");
    return 1;
  }

  // calloc so that every field of the descriptor starts from a defined value.
  sysEA3D_t *sys = (sysEA3D_t *)calloc(1, sizeof(sysEA3D_t));
  if(sys == NULL){
    fprintf(stderr, "\nNo Alloc *sys in %s at line %d\n", __FILE__, __LINE__);
    return 1;
  }
  sys->cardN = cardN;
  sys->beta = 1.101;

  int numberOfGPUs = 0;
  MYCUDA_ERROR( cudaGetDeviceCount(&numberOfGPUs) );
  if(sys->cardN < 0){
    fprintf(stderr, "gpu %d < 0\n", sys->cardN);
    free(sys);
    return 1;
  }
  if(sys->cardN >= numberOfGPUs){
    fprintf(stderr, "gpu %d >= gpuN %d\n", sys->cardN, numberOfGPUs);
    free(sys);
    return 1;
  }

  MYCUDA_ERROR( cudaSetDevice(sys->cardN) );
  cudaDeviceProp prop;
  MYCUDA_ERROR( cudaGetDeviceProperties(&prop, sys->cardN) );
  printf("GPU %d: %s Compute Capability: %d.%d\n", sys->cardN, prop.name, prop.major, prop.minor);

  cudaEvent_t startEvent, stopEvent;
  MYCUDA_ERROR( cudaEventCreate(&startEvent) );
  MYCUDA_ERROR( cudaEventCreate(&stopEvent) );

  // Keep the historical limits for compute capability 2.x and 3.x; any other
  // device uses the limit it reports instead of 0, which led to a zero block
  // size and a division by zero further down.
  int maxThreadsPerBlock = prop.maxThreadsPerBlock;
  if(prop.major == 2) maxThreadsPerBlock = 512;
  if(prop.major == 3) maxThreadsPerBlock = 1024;

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Sliced, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Sliced, cudaFuncCachePreferL1) );

  // Single pass: only L == initL is measured.
  for(int L = initL; L <= initL; L += 2){

    int hV = L*L*L/2;
    size_t NJ_Size = 6*hV*sizeof(MSC), NS_Size = 4*2*hV*sizeof(MSC), RNG_Size = hV*sizeof(RNGT);
    size_t devFree = 0, devTotal = 0;
    MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
    // 0.8 is the fraction of the free device memory the benchmark may use.
    int MaxSysN = (int)(0.8*(devFree/(NJ_Size + NS_Size + RNG_Size)));
    printf("\n\nL = %d, Free memory on device: %lu, MaxSysN: %d\n",
           L, (unsigned long)devFree, MaxSysN);

    if(initSysN == 0) initSysN = 1;

    for(int sysN = initSysN; sysN <= MaxSysN; sysN *= 2){
      sys->sysN = sysN;
      sys->L = L;
      sys->V = sys->L*sys->L*sys->L/nprocs;
      sys->A = sys->L*sys->L;

      sys->hV = sys->V/2;
      sys->hA = sys->A/2;
      sys->hL = sys->L/2;

      sys->NJBnd = sys->sysN*sys->A;
      sys->NSBnd = sys->sysN*sys->A;

      sys->NJBlk = sys->sysN*sys->hV;
      sys->NSBlk = sys->sysN*sys->hV;

      BENCHMARKSLICED_TRY( masksInit(sys), "masksInit" );
      BENCHMARKSLICED_TRY( MCWeightsInit(sys), "MCWeightsInit" );
      BENCHMARKSLICED_TRY( dMemInit(sys), "dMemInit" );
      BENCHMARKSLICED_TRY( hMemInit2(sys), "hMemInit" );
      BENCHMARKSLICED_TRY( H2DSysCopy(sys), "H2DSysCopy" );

      MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
      printf("\n\n---\tL = %d, sysN = %d, free memory after allocation: %lu\t---\n\n",
             L, sysN, (unsigned long)devFree);

      ////////////////////////////////////////
      // NO-GRID KERNELS
      ///////////////////////////////////////

      int blockInit = 0;
      if(initBlock == 0){
        // Halve V/2 until it fits in one block.
        blockInit = sys->V/2;
        while(blockInit > maxThreadsPerBlock) blockInit /= 2;
      }else blockInit = initBlock;

      // A block size below 1 cannot be launched and would divide by zero below.
      if(blockInit < 1){
        fprintf(stderr, "\nError: block size %d is not positive\n", blockInit);
        return 1;
      }

      // Only blockInit itself is measured: the body runs once, then blockSize
      // moves to the next smaller divisor of V/2 (or 0) and the loop ends.
      for(sys->blockSize = blockInit; sys->blockSize >= blockInit; ){
        const int k = 1;  // spins per thread, fixed to one in this benchmark

        dim3 block(sys->blockSize, 1, 1);
        dim3 grid(sys->V/block.x/(2*k), sys->sysN, 1);
        int h_loopStride = block.x*grid.x;
        MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStride, &h_loopStride, sizeof(int)) );

        printf("Measuring Times No-Grid L:%d block.x:%d grid.x:%d spins per thread:%d\n",
               L, block.x, grid.x, k);

        int threads = block.x*block.y*block.z*grid.x*grid.y*grid.z;
        MYCUDA_ERROR( cudaMemcpyToSymbol(d_threads, &threads, sizeof(int)) );

        CRNGArray_t *rng = CRNGArrayInit(threads);
        if(rng == NULL){
          fprintf(stderr, "\nError in CRNGArrayInit\n");
          return 1;
        }

        // KERNEL: Sliced
        const int columns[5] = {L, sysN, k, (int)block.x, (int)grid.x};
        float redSum = 0.f, redSum2 = 0.f, blueSum = 0.f, blueSum2 = 0.f;

        for(int sample = 0; sample < initSamples; sample++){
          BENCHMARKSLICED_TIME_KERNEL(MCStepReds_Sliced, grid, block,
                                      sys->d_reds0, sys->d_reds1, sys->d_reds2, sys->d_reds3,
                                      rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                      redSum, redSum2);
          BENCHMARKSLICED_TIME_KERNEL(MCStepBlues_Sliced, grid, block,
                                      sys->d_blues0, sys->d_blues1, sys->d_blues2, sys->d_blues3,
                                      rng->d_rngSeeds, initSteps, startEvent, stopEvent,
                                      blueSum, blueSum2);
        }
        if(benchmarkSliced_report(sys->cardN, columns, 5, initSamples,
                                  redSum, redSum2, blueSum, blueSum2)) return 1;

        // NOTE(review): CRNGArrayReset presumably releases what CRNGArrayInit
        // set up inside the descriptor - confirm against its definition.
        CRNGArrayReset(rng);
        free(rng);

        // Stop at 0 instead of evaluating (V/2) % 0 when blockInit is 1.
        do{sys->blockSize--;}while(sys->blockSize > 0 && (sys->V/2)%sys->blockSize != 0);
      }

      // CLEANING UP
      //CRNGArrayFree(&rng);
      BENCHMARKSLICED_TRY( dMemFree(sys), "dMemFree" );
      BENCHMARKSLICED_TRY( hMemFree(sys), "hMemFree" );
    }

    initSysN = 0;
  }
  // allocation
  // deallocation
  // linearisation in the number of systems, limited by the memory

  MYCUDA_ERROR( cudaEventDestroy(startEvent) );
  MYCUDA_ERROR( cudaEventDestroy(stopEvent) );
  free(sys);
  return 0;
}

extern int benchmarkMGPU(int argc, char **argv){
  if(argc != 10){
    fprintf(stderr, "usage: %s <L> <sysN> <blockBlk> <blockBnd> <samples> <steps> <memory> <spinsPerThread> <maxL>\n\n", argv[0]);
    fprintf(stderr, "Systems' GPUs are:\n");
    int numberOfGPUs = 0;
    MYCUDA_ERROR( cudaGetDeviceCount(&numberOfGPUs) );
    for(int gpu = 0; gpu < numberOfGPUs; gpu++){
      cudaDeviceProp prop;
      MYCUDA_ERROR( cudaGetDeviceProperties(&prop, gpu) );
      printf("GPU %d: %s\n", gpu, prop.name);
    }
    return 1;
  }

  char fileName[1024];
  //struct timespec start, stop;

  sysEA3D_t *sys = (sysEA3D_t *)malloc(sizeof(sysEA3D_t));
  int initL, initSysN, initSamples, initSteps, usrBlockBlk, usrBlockBnd, spinsPerThread, maxL;
  float memory = 0.;
  sscanf(argv[1], "%d", &initL);
  sscanf(argv[2], "%d", &initSysN);
  sscanf(argv[3], "%d", &usrBlockBlk);
  sscanf(argv[4], "%d", &usrBlockBnd);
  sscanf(argv[5], "%d", &initSamples);
  sscanf(argv[6], "%d", &initSteps);
  sscanf(argv[7], "%f", &memory);
  sscanf(argv[8], "%d", &spinsPerThread);
  sscanf(argv[9], "%d", &maxL);

  if(maxL == 0) maxL = 256;
  if(initL == 0) initL = 8;
  if(initSamples == 0) initSamples = 100;
  if(initSteps == 0) initSteps = 50;

  sys->h_reds0 = sys->h_reds1 = sys->h_reds2 = sys->h_reds3 = NULL;
  sys->h_blues0 = sys->h_blues1 = sys->h_blues2 = sys->h_blues3 = NULL;
  sys->d_reds0 = sys->d_reds1 = sys->d_reds2 = sys->d_reds3 = NULL;
  sys->d_blues0 = sys->d_blues1 = sys->d_blues2 = sys->d_blues3 = NULL;

  sys->h_reds0BS = sys->h_reds1BS = sys->h_reds2BS = sys->h_reds3BS = NULL;
  sys->h_reds0BR = sys->h_reds1BR = sys->h_reds2BR = sys->h_reds3BR = NULL;
  sys->h_blues0BS = sys->h_blues1BS = sys->h_blues2BS = sys->h_blues3BS = NULL;
  sys->h_blues0BR = sys->h_blues1BR = sys->h_blues2BR = sys->h_blues3BR = NULL;

  sys->d_reds0BS = sys->d_reds1BS = sys->d_reds2BS = sys->d_reds3BS = NULL;
  sys->d_reds0BR = sys->d_reds1BR = sys->d_reds2BR = sys->d_reds3BR = NULL;
  sys->d_blues0BS = sys->d_blues1BS = sys->d_blues2BS = sys->d_blues3BS = NULL;
  sys->d_blues0BR = sys->d_blues1BR = sys->d_blues2BR = sys->d_blues3BR = NULL;

  sys->h_Jpx = sys->h_Jpy = sys->h_Jpz = NULL;
  sys->h_Jmx = sys->h_Jmy = sys->h_Jmz = NULL;
  sys->d_Jpx = sys->d_Jpy = sys->d_Jpz = NULL;
  sys->d_Jmx = sys->d_Jmy = sys->d_Jmz = NULL;

  sys->h_JmxBS = sys->h_JpyBS = sys->h_JmzBS = NULL;
  sys->h_JmxBR = sys->h_JpyBR = sys->h_JmzBR = NULL;

  sys->d_JmxBS = sys->d_JpyBS = sys->d_JmzBS = NULL;
  sys->d_JmxBR = sys->d_JpyBR = sys->d_JmzBR = NULL;

  sys->realI_R = sys->realI_B = NULL;
  sys->myI_R = sys->myI_B = NULL;
  sys->myI_B2R = sys->myI_R2B = NULL;

  sys->beta = 1.101;

  /*cudaEvent_t startEventBlk, stopEventBlk, startEventBnd, stopEventBnd;
  MYCUDA_ERROR( cudaEventCreate(&startEventBlk) );
  MYCUDA_ERROR( cudaEventCreate(&stopEventBlk) );
  MYCUDA_ERROR( cudaEventCreate(&startEventBnd) );
  MYCUDA_ERROR( cudaEventCreate(&stopEventBnd) );*/

  struct timespec start, stop;

  int maxThreadsPerBlock = 1024;

  cudaStream_t *stream;
  int nstreams=2;
  
  stream = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
  if(stream == NULL){
    fprintf(stderr, "\nNo Alloc *stream in %s at line %d\n", __FILE__, __LINE__);
    return 1;
  }
  
  MYCUDA_ERROR( cudaStreamCreate(&(stream[0])));
  MYCUDA_ERROR( cudaStreamCreate(&(stream[1])));

  /*if(prop.major == 2) maxThreadsPerBlock = 512;
  if(prop.major == 3) maxThreadsPerBlock = 1024;*/

  MYCUDA_ERROR( cudaFuncSetCacheConfig(slicedRedsBoundaryCopy_Kernel, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(slicedBluesBoundaryCopy_Kernel, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_SlicedBnd, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_SlicedBnd, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_SlicedBlk, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_SlicedBlk, cudaFuncCachePreferL1) );

  while(initL/nprocs < 4) initL += 2;

  for(int L = initL; L <= maxL; L += 2){

    while(L%nprocs != 0) L += 2;

    sys->L = L;
    
    sys->V = sys->L*sys->L*sys->L/nprocs;
    sys->A = sys->L*sys->L;
    
    sys->hV = sys->V/2;
    sys->hA = sys->A/2;
    sys->hL = sys->L/2;     
    
    int hV = sys->hV;
    size_t NJ_Size = 6*hV*sizeof(MSC), NS_Size = 4*2*hV*sizeof(MSC), RNG_Size = hV*sizeof(RNGT);
    size_t devFree = 0, devTotal = 0;
    MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
    // here to decide the percentage of device memory to use. 
    int MaxSysN = (int)((memory)*(devFree/(NJ_Size + NS_Size + RNG_Size)));
    printf("\n\nL = %d, Free memory on device: %lu, MaxSysN: %d\n", L, devFree, MaxSysN);

    if(initSysN == 0) initSysN = 1;

    for(int sysN = initSysN; sysN <= MaxSysN; sysN *= 2){
      sys->sysN = sysN;

      sys->NJBnd = sys->sysN*sys->A;
      sys->NSBnd = sys->sysN*sys->A;
      
      sys->NJBlk = sys->sysN*sys->hV;
      sys->NSBlk = sys->sysN*sys->hV;      
      
      if(masksInit(sys)){
	fprintf(stderr,"\nError in masksInit\n");
	return 1;
      }
      if(MCWeightsInit(sys)){
	fprintf(stderr,"\nError in MCWeightsInit\n");
	return 1;
      }
      if(dMemInit(sys)){
	fprintf(stderr,"\nError in dMemInit\n");
	return 1;
      }
      if(hMemInit2(sys)){
	fprintf(stderr,"\nError in hMemInit\n");
	return 1;
      }
      if(H2DSysCopy(sys)){
	fprintf(stderr,"\nError in H2DSysCopy\n");
	return 1;
      }
      if(slicedBoudaryInit(sys)){
	fprintf(stderr,"\nError in slicedBoundaryInit\n");
	return 1;
      }

      MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
      printf("\n\n---\tL = %d, sysN = %d, free memory after allocation: %lu\t---\n\n",
	     L, sysN, devFree);


      ////////////////////////////////////////
      // NO-GRID KERNELS
      ///////////////////////////////////////

      int blockInitBnd = 0, blockInitBlk = 0;

      if(usrBlockBlk == 0){	
	blockInitBlk = maxThreadsPerBlock;
	while(blockInitBlk > sys->hV - sys->A) blockInitBlk /= 2;
      }else blockInitBlk = usrBlockBlk;

      if(usrBlockBnd == 0){	
	blockInitBnd = maxThreadsPerBlock;
	while(blockInitBnd > sys->A) blockInitBnd /= 2;
      }else blockInitBnd = usrBlockBnd;



      for(int blockBnd = blockInitBnd; blockBnd >= 32; blockBnd /= 2){

	if(blockBnd >= 32){
	  int kMaxBnd = 1;
	  while (sys->A % (kMaxBnd*blockBnd) == 0) kMaxBnd++;
	  if(kMaxBnd > spinsPerThread && spinsPerThread != 0) kMaxBnd = spinsPerThread;

	  printf("kMaxBnd: %d, block: %d\n", 
		 kMaxBnd, blockBnd);

	  for(int kBnd = 0; kBnd < kMaxBnd ;){
	    
	    do{kBnd++;}while(kMaxBnd % kBnd != 0);	    

	    dim3 bblockBnd(blockBnd,1,1);
	    int fitGridBnd = (sys->A/kBnd + blockBnd - 1)/blockBnd;
	    dim3 gridBnd(fitGridBnd, sys->sysN, 1);

	    // the two constants d_loopStrideBnd and d_rngStrideBnd which have two different
	    // logical meanings share the same numerical value.

	    int h_loopStrideBnd = bblockBnd.x*gridBnd.x;
	    MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStrideBnd, &h_loopStrideBnd, sizeof(int)) );
	    MYCUDA_ERROR( cudaMemcpyToSymbol(d_rngStrideBnd, &h_loopStrideBnd, sizeof(int)) );



	    for(int blockBlk = blockInitBlk; blockBlk >= 32; blockBlk /= 2){
	      
	      if(blockBlk >= 32){
		int kMaxBlk = 1;
		while ((sys->hV - sys->A) % (kMaxBlk*blockBlk) == 0) kMaxBlk++;
		if(kMaxBlk > spinsPerThread && spinsPerThread != 0) kMaxBlk = spinsPerThread;

		printf("kMaxBlk: %d, block: %d\n", 
		       kMaxBlk, blockBlk);
		
		for(int kBlk = 0; kBlk < kMaxBlk ;){
	    
		  do{kBlk++;}while(kMaxBlk % kBlk != 0);	    

		  dim3 bblockBlk(blockBlk,1,1);
		  int fitGridBlk = ((sys->hV - sys->A)/kBlk + blockBlk - 1)/blockBlk;
		  dim3 gridBlk(fitGridBlk, sys->sysN, 1);

		  // same as before
		  int h_loopStrideBlk = bblockBlk.x*gridBlk.x;	
		  MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStrideBlk, &h_loopStrideBlk, sizeof(int)));
		  MYCUDA_ERROR( cudaMemcpyToSymbol(d_rngStrideBlk, &h_loopStrideBlk, sizeof(int)) );
		  

		  
		  double redMean = 0., redMean2 = 0., swap = 0.;
		  double blueMean = 0., blueMean2 = 0.;
		  double errorRed = 0., errorBlue = 0.;
		  		  		  
		  printf("Measuring Times No-Grid L:%d Blk: [{%d, %d, %d}, {%d, %d, %d}] - Bnd: [{%d, %d, %d}, {%d, %d, %d}] spins per thread:%d\n",
			 L, 
			 bblockBlk.x, bblockBlk.y, bblockBlk.z, 
			 gridBlk.x, gridBlk.y, gridBlk.z,
			 bblockBnd.x, bblockBnd.y, bblockBnd.z, 
			 gridBnd.x, gridBnd.y, gridBnd.z,
			 kBlk);
		  
		  int threadsBlk = bblockBlk.x*bblockBlk.y*bblockBlk.z*gridBlk.x*gridBlk.y*gridBlk.z; 
		  int threadsBnd = bblockBnd.x*bblockBnd.y*bblockBnd.z*gridBnd.x*gridBnd.y*gridBnd.z;		 
		  
		  if(threadsBlk > 0 && threadsBnd > 0){
		    CRNGArray_t *rngBnd = CRNGArrayInit(threadsBnd);
		    CRNGArray_t *rngBlk = CRNGArrayInit(threadsBlk);
		    
		    // KERNEL: Sliced
		    /*redMeanBlk = blueMeanBlk = redMeanBlk2 = blueMeanBlk2 = 0.;
		      redMeanBnd = blueMeanBnd = redMeanBnd2 = blueMeanBnd2 = 0.;*/
		    
		    for(int sample = 0; sample < initSamples; sample++){
		
		      /*MYCUDA_ERROR( cudaEventRecord(startEventBnd, stream[0]) );
			MYCUDA_ERROR( cudaEventRecord(startEventBlk, stream[1]) );*/
		      
		      clock_gettime(CLOCK_MONOTONIC, &start);
		      
		      for(int steps = 0; steps < initSteps; steps++){	  
			MCStepReds_SlicedBndCp<<< gridBnd, bblockBnd, 0, stream[0] >>>(sys->d_reds0, sys->d_reds1,
										     sys->d_reds2, sys->d_reds3,
										     rngBnd->d_rngSeeds,
										     sys->d_reds0BS, sys->d_reds1BS,
										     sys->d_reds2BS, sys->d_reds3BS);
			
			MCStepReds_SlicedBlk<<< gridBlk, bblockBlk, 0, stream[1] >>>(sys->d_reds0, sys->d_reds1,
										     sys->d_reds2, sys->d_reds3,
										     rngBlk->d_rngSeeds);

			
			slicedBoundary_RedSendReceiveNew(sys, stream);

			MYCUDA_ERROR( cudaStreamSynchronize(stream[1]) );
					     
		      }
		      
		      clock_gettime(CLOCK_MONOTONIC, &stop);
		      
		      /*MYCUDA_ERROR( cudaEventRecord(stopEventBnd, stream[0]) );
			MYCUDA_ERROR( cudaEventRecord(stopEventBlk, stream[1]) );
			MYCUDA_ERROR( cudaEventSynchronize(stopEventBlk) );
			MYCUDA_ERROR( cudaEventElapsedTime(&swap, startEventBlk, stopEventBlk) );
			redMeanBlk += (swap/(initSteps)); 
			redMeanBlk2 += (swap/(initSteps))*(swap/(initSteps));
			swap = 0.;
			MYCUDA_ERROR( cudaEventSynchronize(stopEventBnd) );
			MYCUDA_ERROR( cudaEventElapsedTime(&swap, startEventBnd, stopEventBnd) );
			redMeanBnd += (swap/(initSteps)); 
			redMeanBnd2 += (swap/(initSteps))*(swap/(initSteps));
			swap = 0.;*/
		      
		      swap = ((stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec)/1.0e9);
		      redMean += (swap/(initSteps)); 
		      redMean2 += (swap/(initSteps))*(swap/(initSteps));
		      swap = 0.;
		      
		      /*MYCUDA_ERROR( cudaEventRecord(startEventBnd, stream[0]) );
			MYCUDA_ERROR( cudaEventRecord(startEventBlk, stream[1]) );*/
		      
		      clock_gettime(CLOCK_MONOTONIC, &start);
		      
		      for(int steps = 0; steps < initSteps; steps++){	  
			MCStepBlues_SlicedBndCp<<< gridBnd, bblockBnd, 0, stream[0] >>>(sys->d_blues0, sys->d_blues1,
										      sys->d_blues2, sys->d_blues3,
										      rngBnd->d_rngSeeds,
										      sys->d_blues0BS, sys->d_blues1BS,
										      sys->d_blues2BS, sys->d_blues3BS);

			
			MCStepBlues_SlicedBlk<<< gridBlk, bblockBlk, 0, stream[1] >>>(sys->d_blues0, sys->d_blues1,
										      sys->d_blues2, sys->d_blues3,
										      rngBlk->d_rngSeeds);
			
			slicedBoundary_BlueSendReceiveNew(sys, stream);
			
			MYCUDA_ERROR( cudaStreamSynchronize(stream[1]) );
		      }
		      
		      clock_gettime(CLOCK_MONOTONIC, &stop);
		      
		      /*MYCUDA_ERROR( cudaEventRecord(stopEventBnd, stream[0]) );
			MYCUDA_ERROR( cudaEventRecord(stopEventBlk, stream[1]) );
			MYCUDA_ERROR( cudaEventSynchronize(stopEventBlk) );
			MYCUDA_ERROR( cudaEventElapsedTime(&swap, startEventBlk, stopEventBlk) );
			blueMeanBlk += (swap/(initSteps)); 
			blueMeanBlk2 += (swap/(initSteps))*(swap/(initSteps));
			swap = 0.;
			MYCUDA_ERROR( cudaEventSynchronize(stopEventBnd) );
			MYCUDA_ERROR( cudaEventElapsedTime(&swap, startEventBnd, stopEventBnd) );
			blueMeanBnd += (swap/(initSteps)); 
			blueMeanBnd2 += (swap/(initSteps))*(swap/(initSteps));
			swap = 0.;*/
		      
		      swap = ((stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec)/1.0e9); 	    
		      blueMean += (swap/(initSteps)); 
		      blueMean2 += (swap/(initSteps))*(swap/(initSteps));
		      swap = 0.;
		      
		    }
		    
		    /*redMeanBlk /= initSamples;  redMeanBlk2 /= initSamples;      
		      redMeanBnd /= initSamples;  redMeanBnd2 /= initSamples;      
		      blueMeanBlk /= initSamples; blueMeanBlk2 /= initSamples;
		      blueMeanBnd /= initSamples; blueMeanBnd2 /= initSamples;
		      
		      errorRedBlk = sqrt((redMeanBlk2 - redMeanBlk*redMeanBlk)/(initSteps - 1.));
		      errorBlueBlk = sqrt((blueMeanBlk2 - blueMeanBlk*blueMeanBlk)/(initSteps - 1.));
		      errorRedBnd = sqrt((redMeanBnd2 - redMeanBnd*redMeanBnd)/(initSteps - 1.));
		      errorBlueBnd = sqrt((blueMeanBnd2 - blueMeanBnd*blueMeanBnd)/(initSteps - 1.));*/
		    
		    redMean /= initSamples;  redMean2 /= initSamples;      
		    blueMean /= initSamples; blueMean2 /= initSamples;
		    
		    errorRed = sqrt((redMean2 - redMean*redMean)/(initSteps - 1.));
		    errorBlue = sqrt((blueMean2 - blueMean*blueMean)/(initSteps - 1.));
		    
		    snprintf(fileName, sizeof(fileName), "GPU%d_slicedKernel_procs%d_mpi%d", 
			     0, nprocs, mpiid);
		    FILE *outputFile = fopen(fileName, "a+");
		    
		    fprintf(outputFile, "%d %d %d %d %d %d %d %d %21.15e %21.15e %21.15e %21.15e\n", 
			    L, sysN, kBnd, bblockBnd.x, gridBnd.x, kBlk, bblockBlk.x, gridBlk.x,
			    redMean, errorRed, blueMean, errorBlue);
	      
	      
		    /*	    fprintf(outputFile, "%d %d %d %d %d %d %21.15e %21.15e %21.15e %21.15e %21.15e %21.15e %21.15e %21.15e\n", 
			    nprocs, L, sysN, k, block.x, grid.x,
			    redMeanBlk*1.0e-3, errorRedBlk*1.0e-3, 
			    redMeanBnd*1.0e-3, errorRedBnd*1.0e-3,
			    blueMeanBlk*1.0e-3, errorBlueBlk*1.0e-3, 
			    blueMeanBnd*1.0e-3, errorBlueBnd*1.0e-3);*/
	      
		    fflush(outputFile);
		    fclose(outputFile);
		    
		    CRNGArrayReset(rngBnd);
		    free(rngBnd);
		    CRNGArrayReset(rngBlk);
		    free(rngBlk);
		  }
		}
	      } 
	    }
	  }
	}
      }

      // CLEANING UP
      //CRNGArrayFree(&rng);
      if(dMemFree(sys)){
	fprintf(stderr,"\nError in dMemFree\n");
	return 1;
      }

      if(hMemFree(sys)){
	fprintf(stderr,"\nError in hMemFree\n");
	return 1;
      }
    }

    initSysN = 0;
  }
  // allocazione
  // deallocazione
  // linearizzazione nel numero dei sistemi con limite la memoria
  return 0;
}

/*
 * Wall-clock seconds elapsed between two clock_gettime() readings.
 */
static double smoothElapsedSeconds(const struct timespec *t0, const struct timespec *t1){
  return (double)(t1->tv_sec - t0->tv_sec) + (double)(t1->tv_nsec - t0->tv_nsec)/1.0e9;
}

/*
 * Standard error of the mean computed from the first two sample moments
 * (mean = <x>, mean2 = <x^2>) accumulated over `samples` measurements.
 * Returns 0 when fewer than two samples are available, and clamps a slightly
 * negative variance (floating point rounding) to 0 so sqrt() never yields NaN.
 */
static double smoothStdError(double mean, double mean2, int samples){
  if(samples < 2) return 0.;
  double variance = mean2 - mean*mean;
  if(variance < 0.) variance = 0.;
  return sqrt(variance/(samples - 1.));
}

/*
 * Appends one benchmark record to `fileName` (opened in "a+" mode).
 * Columns: L sysN k block.x grid.x redMean errorRed blueMean errorBlue.
 * Returns 0 on success, 1 if the file cannot be opened or closed.
 */
static int smoothAppendRecord(const char *fileName, int L, int sysN, int k,
                              int blockX, int gridX,
                              double redMean, double errorRed,
                              double blueMean, double errorBlue){
  FILE *outputFile = fopen(fileName, "a+");
  if(outputFile == NULL){
    fprintf(stderr, "\nCannot open %s in %s at line %d\n", fileName, __FILE__, __LINE__);
    return 1;
  }
  fprintf(outputFile, "%d %d %d %d %d %21.15e %21.15e %21.15e %21.15e\n",
          L, sysN, k, blockX, gridX,
          redMean, errorRed, blueMean, errorBlue);
  if(fclose(outputFile) != 0){
    fprintf(stderr, "\nCannot close %s in %s at line %d\n", fileName, __FILE__, __LINE__);
    return 1;
  }
  return 0;
}

/*
 * Single-GPU timing benchmark of the Bitwise, Standard and Sliced Monte Carlo
 * step kernels (red and blue sub-lattices are timed separately).
 *
 * Command line (argc must be 8):
 *   argv[1] L               first lattice size; <= 0 selects 8. L then grows by 2 up to 256.
 *   argv[2] sysN            first number of systems; 0 selects 1. Doubled at each iteration
 *                           up to MaxSysN; only used for the first L, later L restart from 1.
 *   argv[3] block           threads per block to start from; 0 selects the largest power of
 *                           two <= 1024 that does not exceed V/2. Halved down to 32.
 *   argv[4] samples         timing samples per configuration; <= 0 selects 100.
 *   argv[5] steps           kernel launches per sample; <= 0 selects 50.
 *   argv[6] memory          multiplier applied to the number of systems that fit in the
 *                           free device memory, giving MaxSysN.
 *   argv[7] spinsPerThread  upper bound for kMax; 0 means no bound.
 * An argument that fails to parse keeps its zero initialiser, i.e. behaves like 0.
 *
 * For every (L, sysN, blockSize, k) one line is appended to
 * "GPU<cardN>_bitwiseKernel_smooth" (only when L is a power of two),
 * "GPU<cardN>_standardKernel_smooth" and "GPU<cardN>_slicedKernel_smooth":
 *   L sysN k block.x grid.x redMean errorRed blueMean errorBlue
 * where the means are seconds per kernel launch averaged over the samples and
 * the errors are the standard errors of those means.
 *
 * Returns 0 on success, 1 on wrong usage or on any CUDA / allocation / I/O error.
 * NOTE(review): on the error returns the resources acquired so far are not
 * released, as in the original code; the caller is presumably expected to exit.
 */
extern int benchmarkSmooth(int argc, char **argv){
  if(argc != 8){
    fprintf(stderr, "usage: %s <L> <sysN> <block> <samples> <steps> <memory> <spinsPerThread>\n\n", argv[0]);
    fprintf(stderr, "Systems' GPUs are:\n");
    int numberOfGPUs = 0;
    MYCUDA_ERROR( cudaGetDeviceCount(&numberOfGPUs) );
    for(int gpu = 0; gpu < numberOfGPUs; gpu++){
      cudaDeviceProp prop;
      MYCUDA_ERROR( cudaGetDeviceProperties(&prop, gpu) );
      printf("GPU %d: %s\n", gpu, prop.name);
    }
    return 1;
  }

  char fileName[1024];

  // calloc: every field of *sys starts from a defined (zero) value. In particular
  // sys->cardN, used below in the output file names, is not assigned in this function.
  sysEA3D_t *sys = (sysEA3D_t *)calloc(1, sizeof(sysEA3D_t));
  if(sys == NULL){
    fprintf(stderr, "\nNo Alloc *sys in %s at line %d\n", __FILE__, __LINE__);
    return 1;
  }

  // Zero initialisers make an unparsable argument fall back to the "0 = default" rules.
  int initL = 0, initSysN = 0, initSamples = 0, initSteps = 0, usrBlock = 0, spinsPerThread = 0;
  float memory = 0.;
  sscanf(argv[1], "%d", &initL);
  sscanf(argv[2], "%d", &initSysN);
  sscanf(argv[3], "%d", &usrBlock);
  sscanf(argv[4], "%d", &initSamples);
  sscanf(argv[5], "%d", &initSteps);
  sscanf(argv[6], "%f", &memory);
  sscanf(argv[7], "%d", &spinsPerThread);

  if(initL <= 0) initL = 8;
  if(initSamples <= 0) initSamples = 100;
  if(initSteps <= 0) initSteps = 50;

  sys->h_reds0 = sys->h_reds1 = sys->h_reds2 = sys->h_reds3 = NULL;
  sys->h_blues0 = sys->h_blues1 = sys->h_blues2 = sys->h_blues3 = NULL;
  sys->d_reds0 = sys->d_reds1 = sys->d_reds2 = sys->d_reds3 = NULL;
  sys->d_blues0 = sys->d_blues1 = sys->d_blues2 = sys->d_blues3 = NULL;

  sys->h_reds0BS = sys->h_reds1BS = sys->h_reds2BS = sys->h_reds3BS = NULL;
  sys->h_reds0BR = sys->h_reds1BR = sys->h_reds2BR = sys->h_reds3BR = NULL;
  sys->h_blues0BS = sys->h_blues1BS = sys->h_blues2BS = sys->h_blues3BS = NULL;
  sys->h_blues0BR = sys->h_blues1BR = sys->h_blues2BR = sys->h_blues3BR = NULL;

  sys->d_reds0BS = sys->d_reds1BS = sys->d_reds2BS = sys->d_reds3BS = NULL;
  sys->d_reds0BR = sys->d_reds1BR = sys->d_reds2BR = sys->d_reds3BR = NULL;
  sys->d_blues0BS = sys->d_blues1BS = sys->d_blues2BS = sys->d_blues3BS = NULL;
  sys->d_blues0BR = sys->d_blues1BR = sys->d_blues2BR = sys->d_blues3BR = NULL;

  sys->h_Jpx = sys->h_Jpy = sys->h_Jpz = NULL;
  sys->h_Jmx = sys->h_Jmy = sys->h_Jmz = NULL;
  sys->d_Jpx = sys->d_Jpy = sys->d_Jpz = NULL;
  sys->d_Jmx = sys->d_Jmy = sys->d_Jmz = NULL;

  sys->h_JmxBS = sys->h_JpyBS = sys->h_JmzBS = NULL;
  sys->h_JmxBR = sys->h_JpyBR = sys->h_JmzBR = NULL;

  sys->d_JmxBS = sys->d_JpyBS = sys->d_JmzBS = NULL;
  sys->d_JmxBR = sys->d_JpyBR = sys->d_JmzBR = NULL;

  sys->realI_R = sys->realI_B = NULL;
  sys->myI_R = sys->myI_B = NULL;
  sys->myI_B2R = sys->myI_R2B = NULL;

  sys->beta = 1.101;

  struct timespec start, stop;

  const int maxThreadsPerBlock = 1024;

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Bitwise, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Bitwise, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Standard, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Standard, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_Sliced, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_Sliced, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_StandardGrid, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_StandardGrid, cudaFuncCachePreferL1) );

  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepReds_SlicedGrid, cudaFuncCachePreferL1) );
  MYCUDA_ERROR( cudaFuncSetCacheConfig(MCStepBlues_SlicedGrid, cudaFuncCachePreferL1) );

  for(int L = initL; L <= 256; L += 2){

    sys->L = L;

    sys->V = sys->L*sys->L*sys->L/nprocs;
    sys->A = sys->L*sys->L;

    sys->hV = sys->V/2;
    sys->hA = sys->A/2;
    sys->hL = sys->L/2;

    // Per-system device footprint used to estimate how many systems fit in memory.
    int hV = L*L*L/2;
    size_t NJ_Size = 6*hV*sizeof(MSC), NS_Size = 4*2*hV*sizeof(MSC), RNG_Size = hV*sizeof(RNGT);
    size_t devFree = 0, devTotal = 0;
    MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
    // here to decide the percentage of device memory to use.
    int MaxSysN = (int)(memory*(devFree/(NJ_Size + NS_Size + RNG_Size)));
    printf("\n\nL = %d, Free memory on device: %lu, MaxSysN: %d\n", L, (unsigned long)devFree, MaxSysN);

    if(initSysN == 0) initSysN = 1;

    for(int sysN = initSysN; sysN <= MaxSysN; sysN *= 2){
      sys->sysN = sysN;

      sys->NJBnd = sys->sysN*sys->A;
      sys->NSBnd = sys->sysN*sys->A;

      sys->NJBlk = sys->sysN*sys->hV;
      sys->NSBlk = sys->sysN*sys->hV;

      if(masksInit(sys)){
        fprintf(stderr,"\nError in masksInit\n");
        return 1;
      }
      if(MCWeightsInit(sys)){
        fprintf(stderr,"\nError in MCWeightsInit\n");
        return 1;
      }
      if(dMemInit(sys)){
        fprintf(stderr,"\nError in dMemInit\n");
        return 1;
      }
      if(hMemInit2(sys)){
        fprintf(stderr,"\nError in hMemInit\n");
        return 1;
      }
      if(H2DSysCopy(sys)){
        fprintf(stderr,"\nError in H2DSysCopy\n");
        return 1;
      }

      MYCUDA_ERROR( cudaMemGetInfo(&devFree, &devTotal) );
      printf("\n\n---\tL = %d, sysN = %d, free memory after allocation: %lu\t---\n\n",
             L, sysN, (unsigned long)devFree);

      ////////////////////////////////////////
      // NO-GRID KERNELS
      ///////////////////////////////////////

      int blockInit = 0;

      if(usrBlock == 0){
        blockInit = maxThreadsPerBlock;
        while(blockInit > sys->V/2) blockInit /= 2;
      }else blockInit = usrBlock;

      for(sys->blockSize = blockInit; sys->blockSize >= 32; sys->blockSize /= 2){

        int kMax = sys->V/sys->blockSize/2;
        if(kMax > spinsPerThread && spinsPerThread != 0) kMax = spinsPerThread;
        printf("blockSizeIF: %d\n", sys->blockSize);

        // k (spins per thread) takes only the values that divide kMax.
        for(int k = 0; k < kMax ;){

          // double, as in the multi-GPU benchmark: float moments lose the variance
          // to cancellation and cannot back the %21.15e output format.
          double redMean = 0., redMean2 = 0.;
          double blueMean = 0., blueMean2 = 0.;
          double errorRed = 0., errorBlue = 0.;
          double perStep = 0.;

          do{k++;}while(kMax%k != 0);

          dim3 block(sys->blockSize,1,1);

          // ceil((V/2/k) / blockSize) blocks along x, one grid row per system
          int fitGrid = (sys->V/2/k + sys->blockSize - 1)/sys->blockSize;
          dim3 grid(fitGrid, sys->sysN, 1);
          int h_loopStride = block.x*grid.x;
          MYCUDA_ERROR( cudaMemcpyToSymbol(d_loopStride, &h_loopStride, sizeof(int)) );

          printf("Measuring Times No-Grid L:%d block.x:%d grid.x:%d spins per thread:%d\n",
                 L, block.x, grid.x, k);

          int threads = block.x*block.y*block.z*grid.x*grid.y*grid.z;
          MYCUDA_ERROR( cudaMemcpyToSymbol(d_threads, &threads, sizeof(int)) );

          CRNGArray_t *rng = CRNGArrayInit(threads);
          if(rng == NULL){
            fprintf(stderr, "\nError in CRNGArrayInit\n");
            return 1;
          }

          // KERNEL: Bitwise; Only L = 2^k are examined
          if( ((1U<<findLog2(sys->L))^sys->L) == 0 ){
            for(int sample = 0; sample < initSamples; sample++){

              clock_gettime(CLOCK_MONOTONIC, &start);
              for(int steps = 0; steps < initSteps; steps++){
                MCStepReds_Bitwise<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                                      sys->d_reds2, sys->d_reds3,
                                                      rng->d_rngSeeds);
                MYCUDA_ERROR( cudaDeviceSynchronize() );
              }
              clock_gettime(CLOCK_MONOTONIC, &stop);
              // Launch-configuration errors are only visible through cudaGetLastError();
              // checked outside the timed window so the measurement is unaffected.
              MYCUDA_LAST("MCStepReds_Bitwise");

              perStep = smoothElapsedSeconds(&start, &stop)/initSteps;
              redMean += perStep;
              redMean2 += perStep*perStep;

              clock_gettime(CLOCK_MONOTONIC, &start);
              for(int steps = 0; steps < initSteps; steps++){
                MCStepBlues_Bitwise<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                                       sys->d_blues2, sys->d_blues3,
                                                       rng->d_rngSeeds);
                MYCUDA_ERROR( cudaDeviceSynchronize() );
              }
              clock_gettime(CLOCK_MONOTONIC, &stop);
              MYCUDA_LAST("MCStepBlues_Bitwise");

              perStep = smoothElapsedSeconds(&start, &stop)/initSteps;
              blueMean += perStep;
              blueMean2 += perStep*perStep;
            }

            redMean /= initSamples;  redMean2 /= initSamples;
            blueMean /= initSamples; blueMean2 /= initSamples;

            // The moments are averages over the samples, so the error uses initSamples.
            errorRed = smoothStdError(redMean, redMean2, initSamples);
            errorBlue = smoothStdError(blueMean, blueMean2, initSamples);

            snprintf(fileName, sizeof(fileName), "GPU%d_bitwiseKernel_smooth", sys->cardN);
            if(smoothAppendRecord(fileName, L, sysN, k, (int)block.x, (int)grid.x,
                                  redMean, errorRed, blueMean, errorBlue)){
              return 1;
            }
          }

          // KERNEL: Standard
          redMean = blueMean = redMean2 = blueMean2 = 0.;
          for(int sample = 0; sample < initSamples; sample++){

            clock_gettime(CLOCK_MONOTONIC, &start);
            for(int steps = 0; steps < initSteps; steps++){
              MCStepReds_Standard<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                                     sys->d_reds2, sys->d_reds3,
                                                     rng->d_rngSeeds);
              MYCUDA_ERROR( cudaDeviceSynchronize() );
            }
            clock_gettime(CLOCK_MONOTONIC, &stop);
            MYCUDA_LAST("MCStepReds_Standard");

            perStep = smoothElapsedSeconds(&start, &stop)/initSteps;
            redMean += perStep;
            redMean2 += perStep*perStep;

            clock_gettime(CLOCK_MONOTONIC, &start);
            for(int steps = 0; steps < initSteps; steps++){
              MCStepBlues_Standard<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                                      sys->d_blues2, sys->d_blues3,
                                                      rng->d_rngSeeds);
              MYCUDA_ERROR( cudaDeviceSynchronize() );
            }
            clock_gettime(CLOCK_MONOTONIC, &stop);
            MYCUDA_LAST("MCStepBlues_Standard");

            perStep = smoothElapsedSeconds(&start, &stop)/initSteps;
            blueMean += perStep;
            blueMean2 += perStep*perStep;
          }

          redMean /= initSamples;  redMean2 /= initSamples;
          blueMean /= initSamples; blueMean2 /= initSamples;

          errorRed = smoothStdError(redMean, redMean2, initSamples);
          errorBlue = smoothStdError(blueMean, blueMean2, initSamples);

          snprintf(fileName, sizeof(fileName), "GPU%d_standardKernel_smooth", sys->cardN);
          if(smoothAppendRecord(fileName, L, sysN, k, (int)block.x, (int)grid.x,
                                redMean, errorRed, blueMean, errorBlue)){
            return 1;
          }

          // KERNEL: Sliced
          redMean = blueMean = redMean2 = blueMean2 = 0.;
          for(int sample = 0; sample < initSamples; sample++){

            clock_gettime(CLOCK_MONOTONIC, &start);
            for(int steps = 0; steps < initSteps; steps++){
              MCStepReds_Sliced<<< grid, block >>>(sys->d_reds0, sys->d_reds1,
                                                   sys->d_reds2, sys->d_reds3,
                                                   rng->d_rngSeeds);
              MYCUDA_ERROR( cudaDeviceSynchronize() );
            }
            clock_gettime(CLOCK_MONOTONIC, &stop);
            MYCUDA_LAST("MCStepReds_Sliced");

            perStep = smoothElapsedSeconds(&start, &stop)/initSteps;
            redMean += perStep;
            redMean2 += perStep*perStep;

            clock_gettime(CLOCK_MONOTONIC, &start);
            for(int steps = 0; steps < initSteps; steps++){
              MCStepBlues_Sliced<<< grid, block >>>(sys->d_blues0, sys->d_blues1,
                                                    sys->d_blues2, sys->d_blues3,
                                                    rng->d_rngSeeds);
              MYCUDA_ERROR( cudaDeviceSynchronize() );
            }
            clock_gettime(CLOCK_MONOTONIC, &stop);
            MYCUDA_LAST("MCStepBlues_Sliced");

            perStep = smoothElapsedSeconds(&start, &stop)/initSteps;
            blueMean += perStep;
            blueMean2 += perStep*perStep;
          }

          redMean /= initSamples;  redMean2 /= initSamples;
          blueMean /= initSamples; blueMean2 /= initSamples;

          errorRed = smoothStdError(redMean, redMean2, initSamples);
          errorBlue = smoothStdError(blueMean, blueMean2, initSamples);

          snprintf(fileName, sizeof(fileName), "GPU%d_slicedKernel_smooth", sys->cardN);
          if(smoothAppendRecord(fileName, L, sysN, k, (int)block.x, (int)grid.x,
                                redMean, errorRed, blueMean, errorBlue)){
            return 1;
          }

          CRNGArrayReset(rng);
          free(rng);
        }
      }

      // CLEANING UP
      //CRNGArrayFree(&rng);
      if(dMemFree(sys)){
        fprintf(stderr,"\nError in dMemFree\n");
        return 1;
      }
      if(hMemFree(sys)){
        fprintf(stderr,"\nError in hMemFree\n");
        return 1;
      }
    }

    // Only the first L honours the user supplied sysN; the next L restarts from 1.
    initSysN = 0;
  }
  // allocation
  // deallocation
  // linearization in the number of systems, bounded by the available memory
  free(sys);
  return 0;
}
