如何使用 OpenCL 在多 GPU 环境中唯一标识 GPU?

如何使用OPENCL在多GPU环境中唯一标识GPU?,opencl,gpgpu,Opencl,Gpgpu,我正在学习OpenCL,已经编写了一个初步代码,可以查询机器并找出与之相关的平台和设备 #include <stdio.h> #include <stdlib.h> #include <CL/cl.h> int main(int argc,char** argv){ printf("Let's see what devices are there in this Node\n\n"); cl_int errNum,errCPU,errGPU; cl_ui

我正在学习OpenCL,已经编写了一个初步代码,可以查询机器并找出与之相关的平台和设备

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

int main(int argc,char** argv){

printf("Let's see what devices are there in this Node\n\n");

cl_int errNum,errCPU,errGPU;
cl_uint numPlatforms;
cl_platform_id *platformIds;
cl_context context=NULL;
char dname[500];
int i,dc,dg;
cl_device_id *cpuDevices,*gpuDevices;
cl_uint numCPUDevices,numGPUDevices,entries;
cl_ulong long_entries;
size_t p_size;

errNum = clGetPlatformIDs(0,NULL,&numPlatforms);

if(errNum==CL_SUCCESS){ printf("Number of Platforms on this Node: %d\n\n",numPlatforms); }
else{ printf("Error:Failure in clGetPlatformIDs,error code=%d\n",errNum); }

platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id)*numPlatforms);
errNum = clGetPlatformIDs(numPlatforms,platformIds,NULL);

if(errNum==CL_SUCCESS){

for(i=0;i<numPlatforms;i++){

   printf("Platform Information on %d Platform\n",i+1);
   /*Obtain information about platform*/
   clGetPlatformInfo(platformIds[i],CL_PLATFORM_NAME,500,dname,NULL);
   printf("\tCL_PLATFORM_NAME = %s\n",dname);
   clGetPlatformInfo(platformIds[i],CL_PLATFORM_VERSION,500,dname,NULL);
   printf("\tCL_PLATFORM_VERSION = %s\n",dname);

   /*obtain list of devices available on platform*/
   clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,0,NULL,&numCPUDevices);
   printf("\t%d CPU devices found\n",numCPUDevices);
   clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,0,NULL,&numGPUDevices);
   printf("\t%d GPU devices found\n",numGPUDevices);       
   cpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numCPUDevices);
   gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
   printf("\tPrinting devices Information\n");

   if(numCPUDevices>0){
    for(dc=0;dc<numCPUDevices;dc++){
     printf("\t\tPrinting CPU Devices Information\n");
     errCPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,1,&cpuDevices[dc],NULL);
     if(errCPU==CL_SUCCESS){
        printf("\t\tDevice Id is %d\n",cpuDevices[dc]);
        printf("\t\tDevice Information of %d device on %d platform\n",dc+1,i+1);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_NAME,500,dname,NULL);
        printf("\t\tDevice # %d name = %s\n",dc,dname);

        clGetDeviceInfo(cpuDevices[dc],CL_DRIVER_VERSION,500,dname,NULL);
        printf("\t\tDriver version = %s\n",dname);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory (MB) : \t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tLocal Memory (KB): \t%llu\n",long_entries/1024);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tMax clock (MHz) : \t%llu\n",long_entries);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
        printf("\t\tMax Work Group Size: \t%d\n",p_size);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
        printf("\t\tNumber of parallel compute cores:\t%d\n",entries);

     }else{printf("\t\tError:Failure in clGetDeviceIds,error code = %d\n",errCPU);}
    }
   }else{printf("\t\tZero CPU Devices found\n");}
   /* query devices for information */
   if(numGPUDevices>0){
    for(dg = 0;dg<numGPUDevices;dg++ ){
     printf("\t\tPrinting GPU Devices Information\n");
     errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,1,&gpuDevices[dg],NULL);
     if(errGPU==CL_SUCCESS){

        printf("\t\tDevice Id is %d\n",gpuDevices[dg]);
        printf("\t\tDevice Information of %d device on %d platform\n",dg+1,i+1);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_NAME,500,dname,NULL);
        printf("\t\tDevice # %d name = %s\n",dg,dname);

        clGetDeviceInfo(gpuDevices[dg],CL_DRIVER_VERSION,500,dname,NULL);
        printf("\t\tDriver version = %s\n",dname);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory (MB) : \t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tLocal Memory (KB): \t%llu\n",long_entries/1024);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tMax clock (MHz) : \t%llu\n",long_entries);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
        printf("\t\tMax Work Group Size: \t%d\n",p_size);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
        printf("\t\tNumber of parallel compute cores:\t%d\n",entries);

     }else{printf("\t\tError:Failure in clGetDeviceIds,error code = %d\n\t\t,Or,This platform cannot interact with the GPUs.Check for the drivers\n",errGPU);}
    }
   }else{printf("\t\tZero GPU Devices found\n");}

 }

  }
  else{
    printf("Error:Failure in clGetPlatformIDs,error code = %d\n",errNum);
  }


  return 0;

}
我的问题是:为什么三个 GPU 打印出来的设备 ID 都相同?如果我把格式符改成 %p,得到的是一个十六进制的值。

一般来说,如何在给定的平台上唯一地标识设备,以便我可以在特定的设备上运行内核?假设,我想在多gpu环境中向一个gpu发送带有特定数据的内核

原代码在循环里每次只请求一个设备 ID(`num_entries` 为 1,因此总是返回第一个设备),并将其写入 `gpuDevices[dg]`,所以三个 GPU 显示的 ID 相同。

请把 `clGetDeviceIDs` 从设备循环中移出,在获取设备数量之后立即调用一次。这样会一次性填充所有设备 ID:

gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
errGPU = clGetDeviceIDs(platformIds[i], CL_DEVICE_TYPE_GPU, numGPUDevices,  gpuDevices, NULL)
原代码在循环里每次只请求一个设备 ID(`num_entries` 为 1,因此总是返回第一个设备),并将其写入 `gpuDevices[dg]`,所以三个 GPU 显示的 ID 相同。

请把 `clGetDeviceIDs` 从设备循环中移出,在获取设备数量之后立即调用一次。这样会一次性填充所有设备 ID:

gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
errGPU = clGetDeviceIDs(platformIds[i], CL_DEVICE_TYPE_GPU, numGPUDevices,  gpuDevices, NULL)

非常感谢,成功了。我之前不知道必须在内部 for 循环之前一次性填充设备数组。
gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
errGPU = clGetDeviceIDs(platformIds[i], CL_DEVICE_TYPE_GPU, numGPUDevices,  gpuDevices, NULL)