OpenCL内存传输问题(错误代码-6)

OpenCL内存传输问题(错误代码-6),opencl,global,nvidia,allocation,Opencl,Global,Nvidia,Allocation,我内心有点死了。我整天都在做这件事,但毫无结果。我在运行一些以前运行得很好的代码时遇到了问题,所以我编写了一个简短的“玩具”OpenCL程序来尝试找出发生了什么,但我的玩具程序让我感到困惑和难以置信的沮丧 我正在使用一款具有3Gb全局内存的Nvidia 780i。它的最大分配容量约为780 Mb。起初,当我故意过度分配时,它不会出错。解决了这个问题(它是排版的,但是编译器/分析器没有捕捉到它)。现在,即使在试图分配低于设备应该能够处理的方式时,我在第二个大缓冲区分配上也会得到一个错误代码-6(C

我内心有点死了。我整天都在做这件事,但毫无结果。我在运行一些以前运行得很好的代码时遇到了问题,所以我编写了一个简短的“玩具”OpenCL程序来尝试找出发生了什么,但我的玩具程序让我感到困惑和难以置信的沮丧

我正在使用一款具有 3 GB 全局内存的 Nvidia 780i。它的单次最大分配容量约为 780 MB。起初,当我故意过度分配时,它不会报错。这个问题已经解决了(那是一个拼写错误,但编译器/分析器没有捕捉到它)。现在,即使我试图分配的内存远低于设备理应能够处理的量,我在第二个大缓冲区分配上也会得到错误代码 -6(CL_OUT_OF_HOST_MEMORY)

我一直在研究这个错误,但我无法弄清它在这种情况下为什么会出现。我正在使用的机器中有 32 GB 的内存,因此主机内存肯定不会短缺。我想这里发生了一些我不明白的事情

它会正确分配第一个缓冲区,但在分配第二个缓冲区时失败。我基本上无法分配我想要和需要的全局内存量

非常感谢您的帮助。如果你能帮我,而且你在洛杉矶附近,我会带你出去喝一杯。这就是我的挫败感

下面是我的机器的代码和输出

谢谢, 约翰

主程序:

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "kernels.cl"
#define KERNEL_NAME "test"

#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>


#define N_PROJ 4000
#define N_CHANNELS 736
#define N_ROWS 32


/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

    cl_platform_id platform;
    cl_device_id dev;
    int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if(err < 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Access a device */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if(err == CL_DEVICE_NOT_FOUND) {
        perror("Just a heads up: I'm not going to run on the GPU");
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
    }
    if(err < 0) {
        perror("Couldn't access any devices");
        exit(1);
    }
    cl_ulong16 alloc_size,mem_size;
    char name[40];

    clGetDeviceInfo(dev,CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(cl_ulong16),&alloc_size,NULL);
    clGetDeviceInfo(dev,CL_DEVICE_NAME,sizeof(name),name,NULL);
    clGetDeviceInfo(dev,CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong16),&mem_size,NULL);
    printf("Using device: %s\n",name);
    printf("Global memory size: %lu\n",mem_size);
    printf("Max. allocation: %lu\n",alloc_size);

    return dev;
}

/*
 * Create an OpenCL program from the source file `filename` and build it.
 *
 *   ctx      - context the program is created in
 *   dev      - device whose build log is printed when the build fails
 *   filename - path of the OpenCL C source file
 *
 * Returns the built program.  On any failure (file cannot be read, out of
 * host memory, program creation or build error) a diagnostic is printed and
 * the process terminates with exit(1); on a build error the build log for
 * `dev` is printed to stdout first.
 */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

   cl_program program;
   FILE *program_handle;
   char *program_buffer, *program_log;
   size_t program_size, log_size = 0;
   long file_len;
   int err;

   /* Read program file and place content into buffer.
      Binary mode makes ftell() report the exact number of bytes fread()
      will deliver, so no per-file fudge factor is needed to compensate
      for text-mode newline translation. */
   program_handle = fopen(filename, "rb");
   if(program_handle == NULL) {
      perror("Couldn't find the program file");
      exit(1);
   }
   if(fseek(program_handle, 0, SEEK_END) != 0 ||
      (file_len = ftell(program_handle)) < 0) {
      perror("Couldn't determine the size of the program file");
      fclose(program_handle);
      exit(1);
   }
   rewind(program_handle);

   program_buffer = (char*)malloc((size_t)file_len + 1);
   if(program_buffer == NULL) {
      fprintf(stderr, "Couldn't allocate memory for the program source\n");
      fclose(program_handle);
      exit(1);
   }

   /* Use the number of bytes actually read as the source length. */
   program_size = fread(program_buffer, sizeof(char), (size_t)file_len, program_handle);
   if(ferror(program_handle)) {
      perror("Couldn't read the program file");
      free(program_buffer);
      fclose(program_handle);
      exit(1);
   }
   program_buffer[program_size] = '\0';
   fclose(program_handle);

   /* Create program from file */
   program = clCreateProgramWithSource(ctx, 1,
      (const char**)&program_buffer, &program_size, &err);
   if(err < 0) {
      /* OpenCL does not set errno, so report its status code directly. */
      fprintf(stderr, "Couldn't create the program (OpenCL error %d)\n", err);
      free(program_buffer);
      exit(1);
   }
   free(program_buffer);

   /* Build program */
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if(err < 0) {

      fprintf(stderr, "Couldn't build the program (OpenCL error %d)\n", err);

      /* Find size of log and print to std output */
      clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
      program_log = (char*) malloc(log_size + 1);
      if(program_log == NULL) {
         fprintf(stderr, "Couldn't allocate memory for the build log\n");
         exit(1);
      }
      /* Pre-terminate the buffer so it is a valid string even if the
         second query fails and leaves it untouched. */
      program_log[0] = '\0';
      program_log[log_size] = '\0';
      clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_size + 1, program_log, NULL);
      printf("%s\n", program_log);
      free(program_log);
      exit(1);
   }

   return program;
}

/* Print `what` with the OpenCL status code and exit(1) when `err` is a failure. */
static void check_cl(cl_int err, const char *what)
{
    if (err < 0) {
        /* OpenCL never sets errno, so perror() would append a misleading
           "No error"; print the status code itself instead. */
        fprintf(stderr, "%s (OpenCL error %i)\n", what, err);
        exit(1);
    }
}

/*
 * Toy OpenCL host program: allocates two large zero-filled float arrays and
 * a 3-element int array, copies them into device buffers, runs the
 * KERNEL_NAME kernel from PROGRAM_FILE as a single task, reads the three
 * buffers back and prints a few elements of each.
 *
 * Every OpenCL call and host allocation is checked; the first failure is
 * reported on stderr and the process exits with status 1.
 * Returns 0 on success.
 */
int main(int argc, const char * argv[])
{
    /* Standard OCL structures */
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel;
    cl_command_queue queue;
    cl_int err;

    /* Element and byte counts of each large array, computed once in size_t. */
    const size_t n_elems   = (size_t)N_PROJ * N_CHANNELS * N_ROWS;
    const size_t big_bytes = n_elems * sizeof(float);

    device = create_device();

    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    check_cl(err, "Couldn't create a context");

    program = build_program(context, device, PROGRAM_FILE);

    queue = clCreateCommandQueue(context, device, 0, &err);
    check_cl(err, "Couldn't create a command queue");

    kernel = clCreateKernel(program, KERNEL_NAME, &err);
    check_cl(err, "Couldn't create the kernel");

    /* Declare and set data */
    int a[] = {1, 2, 3};
    float *rebin = (float*) calloc(n_elems, sizeof(float));
    float *mat   = (float*) calloc(n_elems, sizeof(float));
    if (rebin == NULL || mat == NULL) {
        fprintf(stderr, "Couldn't allocate host arrays (%zu bytes each)\n", big_bytes);
        exit(1);
    }

    /* size_t must be printed with %zu; %lu is wrong where long != size_t. */
    printf("\nAllocation size: %zu\n", big_bytes);

    /* Declare and set buffer objects */
    cl_mem a_buff, rebin_buff, mat_buff;

    printf("Total memory to be allocated: %zu\n", 2 * big_bytes + sizeof(a));

    a_buff = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                            sizeof(a), a, &err);
    check_cl(err, "Couldn't create buffer 0");

    /* NOTE(review): the page's commenters attribute CL_OUT_OF_HOST_MEMORY (-6)
       on these large buffers to building as a 32-bit application, i.e. the
       host process running short of address space; confirm with a 64-bit
       build. */
    rebin_buff = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                                big_bytes, rebin, &err);
    check_cl(err, "Couldn't create buffer 1");

    mat_buff = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                              big_bytes, mat, &err);
    check_cl(err, "Couldn't create buffer 2");

    /* Bind the buffers to the kernel arguments */
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mat_buff);
    check_cl(err, "Couldn't set kernel argument");
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &rebin_buff);
    check_cl(err, "Couldn't set kernel argument");
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &a_buff);
    check_cl(err, "Couldn't set kernel argument");

    err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
    check_cl(err, "Couldn't enqueue the kernel");

    /* Blocking reads (CL_TRUE): each call returns once its data has been
       copied into the host array. */
    err = clEnqueueReadBuffer(queue, mat_buff, CL_TRUE, 0, big_bytes, mat, 0, NULL, NULL);
    check_cl(err, "Couldn't read back buffer 2");
    err = clEnqueueReadBuffer(queue, rebin_buff, CL_TRUE, 0, big_bytes, rebin, 0, NULL, NULL);
    check_cl(err, "Couldn't read back buffer 1");
    err = clEnqueueReadBuffer(queue, a_buff, CL_TRUE, 0, sizeof(a), a, 0, NULL, NULL);
    check_cl(err, "Couldn't read back buffer 0");

    printf("%f %f %f\n", mat[1], mat[2], mat[3]);
    printf("%f %f %f\n", rebin[1], rebin[2], rebin[3]);
    printf("%i %i %i", a[0], a[1], a[2]);

    /* Release device buffers and host arrays before tearing down the rest. */
    clReleaseMemObject(a_buff);
    clReleaseMemObject(rebin_buff);
    clReleaseMemObject(mat_buff);
    free(rebin);
    free(mat);

    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);

    printf("\n\nProgram apparently executed fully. \n");
    return 0;
}
我的机器的控制台输出:

Using device: GeForce GTX 780
Global memory size: 3221225472
Max. allocation: 805306368

Allocation size: 376832000
Total memory to be allocated: 753664012
Error: -6
Couldn't create buffer 2: No error

Process returned 1 (0x1)   execution time : 0.435 s
Press any key to continue.

如果你把分配的数量减少很多,它能正常工作吗?一个可能的原因是,如果将其构建为32位应用程序,因为这将大大减少您可以寻址的主机内存量。这是您可以在设备上分配的最大值。但这并不能保证分配一定会成功。32位应用程序通常只能看到GPU内存的一小部分,这可能会导致这些问题。您应该升级到64位OpenCL。好的,您的应用程序有缺陷,这给您带来了很多痛苦。但是你的问题到底是什么呢?我想,我没有一个非常精确的问题,但我已经到了看不出哪里出了问题的地步,并且已经用尽了我所知道的一切来解决问题(在我发布之前,我已经花了好几个小时)。我想我的问题是“有人看到我发布的代码有什么问题吗?”而且,在我发布这篇文章后回家的路上,我意识到我没有释放我正在创建的巨大浮点数组。我想这可能导致我在一天的工作过程中泄漏了大量的内存。最后,我目前正在链接一个32位库,主要是出于方便(它是已经安装的)。我将升级并重试此操作。我原以为在32位库中使用少于4 GB的内存不会有问题,但是有很多事情我不知道!
Using device: GeForce GTX 780
Global memory size: 3221225472
Max. allocation: 805306368

Allocation size: 376832000
Total memory to be allocated: 753664012
Error: -6
Couldn't create buffer 2: No error

Process returned 1 (0x1)   execution time : 0.435 s
Press any key to continue.