// Matteo Lulli, Massimo Bernaschi, Giorgio Parisi 2013
// Physics Dept. 'Sapienza', University of Rome
#include <mpi.h>
#include "string.h"
#include "stdio.h"
#include "stdlib.h"
#include "cuda_runtime.h"


/*
 * qsort() comparator that treats both elements as NUL-terminated C strings.
 *
 * a, b : pointers to the two elements being compared.
 * Returns a value <0, 0 or >0 with the same meaning as strcmp().
 */
int stringCmp( const void *a, const void *b)
{
     /* Explicit casts: the implicit void* -> char* conversion is accepted by
        a C compiler but rejected when the file is compiled as C++ (nvcc). */
     return strcmp((const char *)a, (const char *)b);
}

void  assignDeviceToProcess(int *p2myrank)
{
       char     host_name[MPI_MAX_PROCESSOR_NAME];
       char (*host_names)[MPI_MAX_PROCESSOR_NAME];
       MPI_Comm nodeComm;


       int i, n, namelen, color, rank, nprocs, myrank,gpu_per_node;
       size_t bytes;
       int dev, err1;
       struct cudaDeviceProp deviceProp;

       /* Check if the device has been already assigned */

       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
       MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
       MPI_Get_processor_name(host_name,&namelen);

       bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
       host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);

       strcpy(host_names[rank], host_name);

       for (n=0; n<nprocs; n++)
       {
        MPI_Bcast(&(host_names[n]),MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
       }


       qsort(host_names, nprocs,  sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);

       color = 0;

       for (n=0; n<nprocs; n++)
       {
         if(n>0&&strcmp(host_names[n-1], host_names[n])) color++;
         if(strcmp(host_name, host_names[n]) == 0) break;
       }

       MPI_Comm_split(MPI_COMM_WORLD, color, 0, &nodeComm);

       MPI_Comm_rank(nodeComm, &myrank);
       MPI_Comm_size(nodeComm, &gpu_per_node);

       p2myrank[0]=myrank;

        /* Find out how many DP capable GPUs are in the system and their device number */
       int deviceCount,slot=0;
       int *devloc;
       cudaGetDeviceCount(&deviceCount);
       devloc=(int *)malloc(deviceCount*sizeof(int));
       devloc[0]=999;
       for (dev = 0; dev < deviceCount; ++dev)
        {
        cudaGetDeviceProperties(&deviceProp, dev);
        if(deviceProp.major>1)
          {
           devloc[slot]=dev;
           slot++;
          };
        }
       if(devloc[0]==999) {
	printf("No GPU available!\n");
	exit(1);
       }
       //printf ("Assigning device %d  to process on node %s rank %d \n",devloc[myrank],  host_name, rank );
       /* Assign device to MPI process and probe device properties */
       if(myrank>=deviceCount) {
       	cudaSetDevice(devloc[0]); /* Use always first GPU */
       } else {
       	cudaSetDevice(devloc[myrank]);
       }
       cudaGetDevice(&dev);
       cudaGetDeviceProperties(&deviceProp, dev);
       size_t free_bytes, total_bytes;
       cudaMemGetInfo(&free_bytes, &total_bytes);
       printf("Host: %s Rank=%d (%s)  ECC=%s  Free = %lu, Total = %lu\n",host_name,rank, deviceProp.name, deviceProp.ECCEnabled ? "Enabled " : "Disabled", (unsigned long)free_bytes, (unsigned long)total_bytes);

}

