Memory management OpenCl清理导致segfault

Memory management OpenCl清理导致segfault,memory-management,segmentation-fault,opencl,Memory Management,Segmentation Fault,Opencl,我使用网络上的不同资源构建了自己的小Opencl示例。实际的内核工作正常,我得到了想要的输出,但是我在其中一个示例中发现的清理函数会导致错误。我做错了什么 #include <stdio.h> #include <stdlib.h> #include <errno.h> #include <CL/cl.h> //opencl #define CL_CHECK(_expr)

我使用网络上的不同资源构建了自己的小Opencl示例。实际的内核工作正常,我得到了想要的输出,但是我在其中一个示例中发现的清理函数会导致错误。我做错了什么

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl

#define CL_CHECK(_expr)                                                         \
   do {                                                                         \
     cl_int _err = _expr;                                                       \
     if (_err == CL_SUCCESS)                                                    \
       break;                                                                   \
     fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
     abort();                                                                   \
   } while (0)

#define CL_CHECK_ERR(_expr)                                                     \
   ({                                                                           \
     cl_int _err = CL_INVALID_VALUE;                                            \
     typeof(_expr) _ret = _expr;                                                \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
     _ret;                                                                      \
   })

const char* OpenCLSource[] = {
       "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
       "{",
       "      // Index of the elements to add \n",
       "      unsigned int n = get_global_id(0);",
       "      // Sum the n’th element of vectors a and b and store in c \n",
       "      c[n] = a[n] + b[n];",
       "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
    cl_int _err;
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
    printf("\n1-%i\n",_err);
    // Get the list of GPU devices associated with this context
    size_t ParmDataBytes;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    cl_device_id* GPUDevices;
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
    printf("\n2-%i\n",_err);
    // Create OpenCL program with source code
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
    printf("\n3-%i\n",_err);

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
              NULL, NULL, NULL, NULL));


     cl_int errcode;
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
               "VectorAdd", &errcode);
               printf("\n7-%i\n",errcode);

    return GPUDevices;
}


int main(int argc, char** argv)
{
    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id* GPUDevices;

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

    // Two integer source vectors in Host memory
    int n=5 ;
    int x[5]={1,2,4,6,8};
    int y[5]={1,2,4,6,8};
    int output[n];
    int size_x = n*sizeof(x);
    int size_y = n*sizeof(y);

    int size_output = n*sizeof(output); // this changes for the second forward1
    cl_int _err;
    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
                     printf("\n4-%i\n",_err);
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
                     printf("\n5-%i\n",_err);


    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                          size_output, NULL, &_err);
                                           printf("\n6-%i\n",_err);

     // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);


    // 7. Launch OpenCL kernel
    size_t localWorkSize[1], globalWorkSize[1];
    //localWorkSize = ;
    globalWorkSize[0] = n;

    // Launch the Kernel on the GPU
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
    // Copy the output in GPU memory back to CPU memory

    //float* h_C = (float*) malloc(size_output);
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
              total_cl, CL_TRUE, 0, size_output, 
                output, 0, NULL, NULL));
    for (int i=0; i<n;i++){
        printf("\n%i",output[i]);
    }

    // Cleanup (each of the following lines causes a seg fault
    // ******************************
    CL_CHECK(free(GPUDevices)); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
    /* ****************

    return 0;
}
#包括
#包括
#包括
#包括//opencl
#定义CL_检查(_expr)\
做{\
cl_int_err=_expr\
如果(\u err==CL\u SUCCESS)\
中断\
fprintf(stderr,“OpenCL错误:'%s'返回%d!\n',#_expr,(int)_err)\
中止()\
}而(0)
#定义CL_检查错误(_expr)\
({                                                                           \
cl_int_err=cl_无效值\
typeof(_expr)_ret=_expr\
如果(_err!=CL_SUCCESS){\
fprintf(stderr,“OpenCL错误:'%s'返回%d!\n',#_expr,(int)_err)\
中止()\
}                                                                          \
_ret\
})
常量字符*OpenCLSource[]={
“uuu内核无效向量添加(uuu全局int*c,uuuu全局int*a,uuu全局int*b)”,
"{",
“//要添加的元素的索引\n”,
“unsigned int n=get_global_id(0);”,
“//求向量a和b的第n个元素的和并存储在c\n中”,
“c[n]=a[n]+b[n];”,
"}"
};
cl_设备id*init_opencl(cl_上下文*GPUContext,cl_命令队列*GPUCommandQueue,cl_内核*cl_转发1,cl_程序*OpenCLProgram){
//创建一个上下文以在支持CUDA的NVIDIA GPU上运行OpenCL
cl_int_err;
*GPUContext=clCreateContextFromType(0,CL_设备_类型_GPU,NULL,NULL,&_错误);
printf(“\n1-%i\n”,错误);
//获取与此上下文关联的GPU设备列表
大小\u t ParmDataBytes;
CL_检查(clGetContextInfo(*GPUContext,CL_CONTEXT_设备,0,NULL和ParmDataBytes));
cl_设备id*GPU设备;
GPUDevices=(cl_设备id*)malloc(ParmDataBytes);
CL_检查(clGetContextInfo(*GPUContext,CL_CONTEXT_设备,ParmDataBytes,GPUDevices,NULL));
//在第一个GPU设备上创建命令队列
*GPUCommandQueue=clCreateCommandQueue(*GPUContext,GPUDevices[0],0,&_err);
printf(“\n2-%i\n”,错误);
//用源代码创建OpenCL程序
*OpenCLProgram=clCreateProgramWithSource(*GPUContext,7,OpenCLSource,NULL,&u err);
printf(“\n3-%i\n”,错误);
CL_检查(clBuildProgram(*OpenCLProgram,0,
空,空,空,空);
cl_国际编码;
*cl_forward1=clCreateKernel(*OpenCLProgram,
“矢量添加”、&errcode);
printf(“\n7-%i\n”,错误代码);
返回GPU设备;
}
int main(int argc,字符**argv)
{
CLU上下文GPUContext;
cl_命令_队列GPUCommandQueue;
CLU程序OpenCLProgram;
cl_内核openclvectorad;
cl_设备id*GPU设备;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&openclvectorad,&OpenCLProgram);
//主机内存中的两个整数源向量
int n=5;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
整数输出[n];
int size_x=n*sizeof(x);
int size_y=n*sizeof(y);
int size_output=n*sizeof(output);//对于第二个forward1,此更改
cl_int_err;
//为源向量分配GPU内存,并从CPU内存初始化
cl_mem x_cl=clCreateBuffer(GPUContext,cl_mem_只读|
CL_MEM_COPY_HOST_PTR、尺寸_x、x和_err);
printf(“\n4-%i\n”,错误);
cl_mem y_cl=clCreateBuffer(GPUContext,cl_mem_只读|
CL_MEM_COPY_HOST_PTR、大小_y、y和_err);
printf(“\n5-%i\n”,错误);
//在GPU上分配输出内存
cl_mem total_cl=clCreateBuffer(仅限GPUContext、cl_mem_WRITE_、,
大小(输出、空值和错误);
printf(“\n6-%i\n”,错误);
//在下一步中,我们将GPU内存与内核参数相关联
clSetKernelArg(openclvectorad,0,sizeof(cl_mem),(void*)和total_cl);
clSetKernelArg(openclvectorad,1,sizeof(cl_mem),(void*)和x_cl);
clSetKernelArg(openclvectorad,2,sizeof(cl_mem),(void*)和y_cl);
//7.启动OpenCL内核
大小\u t本地工作大小[1],全局工作大小[1];
//localWorkSize=;
globalWorkSize[0]=n;
//在GPU上启动内核
CL_检查(ClenqueueEndRangeKernel(GPUCommandQueue,OpenCLVectorAdd,1,NULL,globalWorkSize,NULL,0,NULL,NULL));
//将GPU内存中的输出复制回CPU内存
//float*h_C=(float*)malloc(大小输出);
CL_检查(clEnqueueReadBuffer(GPUCommandQueue,
总cl,cl,真,0,大小输出,
输出,0,NULL,NULL));

对于(inti=0;i我纠正并更改了一些小事情。所以这段代码现在应该可以工作了

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl

#define CL_CHECK(_expr)                                                         \
   do {                                                                         \
     cl_int _err = _expr;                                                       \
     if (_err == CL_SUCCESS)                                                    \
       break;                                                                   \
     fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
     abort();                                                                   \
   } while (0)

#define CL_CHECK_ERR(_expr)                                                     \
   ({                                                                           \
     cl_int _err = CL_INVALID_VALUE;                                            \
     typeof(_expr) _ret = _expr;                                                \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
     _ret;                                                                      \
   })

const char* OpenCLSource[] = {
       "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
       "{",
       "      // Index of the elements to add \n",
       "      unsigned int n = get_global_id(0);",
       "      // Sum the n’th element of vectors a and b and store in c \n",
       "      c[n] = a[n] + b[n];",
       "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
    cl_int _err;
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
    printf("\nclCreateContextFromType:%i\n",_err);
    // Get the list of GPU devices associated with this context
    size_t ParmDataBytes;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    cl_device_id* GPUDevices;
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
    printf("\nclCreateCommandQueue:%i\n",_err);
    // Create OpenCL program with source code
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
    printf("\nclCreateProgramWithSource:%i\n",_err);

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
              NULL, NULL, NULL, NULL));


     cl_int errcode;
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
               "VectorAdd", &errcode);
               printf("\nclCreateKernel:%i\n",errcode);

    return GPUDevices;
}


int main(int argc, char** argv)
{
    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id* GPUDevices;

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

    int n=5 ;
    int x[5]={1,2,4,6,8};
    int y[5]={1,2,4,6,8};
    int output[n];
    int size_x = n*sizeof(x);
    int size_y = n*sizeof(y);
    int size_output = n*sizeof(output);

    cl_int _err;

    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
                    printf("\nclCreateBuffer:%i\n",_err);
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
                    printf("\nclCreateBuffer:%i\n",_err);


    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                          size_output, NULL, &_err);
                                           printf("\nclCreateBuffer:%i\n",_err);

     // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);


    size_t globalWorkSize[1];
    globalWorkSize[0] = n;

    // Launch the Kernel on the GPU
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
    clFinish(GPUCommandQueue);
    // Copy the output in GPU memory back to CPU memory

    int* h_c = (int*) malloc(size_output);
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
              total_cl, CL_TRUE, 0, size_output, 
                h_c, 0, NULL, NULL));
    clFinish(GPUCommandQueue);
    for (int i=0; i<n;i++){
        printf("\noutput[%i]=%i",i,h_c[i]);
    }

    // Cleanup
    free(GPUDevices); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(y_cl));

    return 0;
}
#包括
#包括
#包括
#包括//opencl
#定义CL_检查(_expr)\
做{\
cl_int_err=_expr\
如果(\u err==CL\u SUCCESS)\
中断\
fprintf(stderr,“OpenCL错误:“%s”返回了%d!\n”,#u expr,