Memory management:OpenCL 清理代码导致 segfault
我参考网络上的不同资源,构建了自己的小型 OpenCL 示例。实际的内核工作正常,我也得到了想要的输出,但是我从其中一个示例中找到的清理代码会导致段错误。我做错了什么?
标签:memory-management、segmentation-fault、opencl
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
// CL_CHECK(expr): evaluates expr once, stores the result in a cl_int and, if
// it is not CL_SUCCESS, prints the stringified expression together with the
// numeric code to stderr and calls abort(). On success it does nothing.
// The do { ... } while (0) wrapper makes the macro usable as one statement.
// NOTE(review): because the result is assigned to a cl_int, expr must yield a
// value; a void expression cannot be wrapped in this macro.
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} while (0)
// CL_CHECK_ERR(expr): evaluates expr, keeps its value in _ret and yields that
// value as the result of the whole macro. A local _err starts out as
// CL_INVALID_VALUE and is only compared afterwards, so the macro aborts unless
// expr itself stores CL_SUCCESS into _err.
// NOTE(review): presumably meant for calls that take an error out-parameter,
// written as `&_err` inside expr - confirm against intended call sites; the
// macro is not used anywhere in the code shown here.
// Uses the GNU `({ ... })` statement expression and `typeof` extensions.
#define CL_CHECK_ERR(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
typeof(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} \
_ret; \
})
// Source text of the VectorAdd kernel, split into seven strings that
// init_opencl hands to clCreateProgramWithSource with a count of 7.
// Only the two entries holding a `//` comment end in '\n'; without it the
// comment would run on into the text of the following entry.
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
// Creates the OpenCL objects used by main and stores them through the
// out-parameters: a GPU context, a command queue on the first device of that
// context, a program built from OpenCLSource and the "VectorAdd" kernel.
// Returns the malloc'ed array of device ids that belong to the context.
// NOTE(review): the status codes written by the clCreate* calls are only
// printed (tags 1, 2, 3 and 7); execution continues even when they report a
// failure. Only clGetContextInfo and clBuildProgram go through CL_CHECK.
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_int _err;
*GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
printf("\n1-%i\n",_err);
// Get the list of GPU devices associated with this context
// (first call asks only for the size in bytes, second call fills the array)
size_t ParmDataBytes;
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
cl_device_id* GPUDevices;
// NOTE(review): the malloc result is used without a NULL check.
GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
// Create a command-queue on the first GPU device
*GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
printf("\n2-%i\n",_err);
// Create OpenCL program with source code
// (7 is the number of strings in the OpenCLSource array)
*OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
printf("\n3-%i\n",_err);
CL_CHECK(clBuildProgram(*OpenCLProgram, 0,
NULL, NULL, NULL, NULL));
cl_int errcode;
*cl_forward1 = clCreateKernel(*OpenCLProgram,
"VectorAdd", &errcode);
printf("\n7-%i\n",errcode);
return GPUDevices;
}
// Question's driver: obtains the OpenCL objects from init_opencl, uploads two
// 5-element int vectors, runs the VectorAdd kernel over n work-items, reads
// the result back, prints it and then tries to release everything.
// This is the version that crashes; the review notes below mark the spots
// that look responsible.
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
// Two integer source vectors in Host memory
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
// NOTE(review): sizeof(x), sizeof(y) and sizeof(output) already give the size
// of the whole array in bytes (n ints), so each size below is n times larger
// than the array it describes.
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output); // this changes for the second forward1
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
// NOTE(review): CL_MEM_COPY_HOST_PTR is asked to copy size_x / size_y bytes,
// which is more than x and y hold.
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\n4-%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\n5-%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\n6-%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
// (argument order follows the kernel signature: c, a, b; the return values
// of clSetKernelArg are not checked)
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
// 7. Launch OpenCL kernel
// localWorkSize is declared but never used; NULL is passed as the local size.
size_t localWorkSize[1], globalWorkSize[1];
//localWorkSize = ;
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
// Copy the output in GPU memory back to CPU memory
//float* h_C = (float*) malloc(size_output);
// NOTE(review): this blocking read (CL_TRUE) writes size_output bytes into
// output, which only has room for n ints - the write runs past the end of the
// array. This is the likely reason the later cleanup calls crash.
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
output, 0, NULL, NULL));
for (int i=0; i<n;i++){
printf("\n%i",output[i]);
}
// Cleanup (each of the following lines causes a seg fault
// ******************************
// NOTE(review): free() returns void, so wrapping it in CL_CHECK, which
// assigns the result to a cl_int, cannot compile as written.
CL_CHECK(free(GPUDevices));
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(y_cl));
// NOTE(review): the block comment opened on the next line is never closed in
// the text shown here, so `return 0;` and the closing brace end up inside it.
/* ****************
return 0;
}
#包括
#包括
#包括
#包括//opencl
#定义CL_检查(_expr)\
做{\
cl_int_err=_expr\
如果(\u err==CL\u SUCCESS)\
中断\
fprintf(stderr,“OpenCL错误:'%s'返回%d!\n',#_expr,(int)_err)\
中止()\
}而(0)
#定义CL_检查错误(_expr)\
({ \
cl_int_err=cl_无效值\
typeof(_expr)_ret=_expr\
如果(_err!=CL_SUCCESS){\
fprintf(stderr,“OpenCL错误:'%s'返回%d!\n',#_expr,(int)_err)\
中止()\
} \
_ret\
})
常量字符*OpenCLSource[]={
“uuu内核无效向量添加(uuu全局int*c,uuuu全局int*a,uuu全局int*b)”,
"{",
“//要添加的元素的索引\n”,
“unsigned int n=get_global_id(0);”,
“//求向量a和b的第n个元素的和并存储在c\n中”,
“c[n]=a[n]+b[n];”,
"}"
};
cl_设备id*init_opencl(cl_上下文*GPUContext,cl_命令队列*GPUCommandQueue,cl_内核*cl_转发1,cl_程序*OpenCLProgram){
//创建一个上下文以在支持CUDA的NVIDIA GPU上运行OpenCL
cl_int_err;
*GPUContext=clCreateContextFromType(0,CL_设备_类型_GPU,NULL,NULL,&_错误);
printf(“\n1-%i\n”,错误);
//获取与此上下文关联的GPU设备列表
大小\u t ParmDataBytes;
CL_检查(clGetContextInfo(*GPUContext,CL_CONTEXT_设备,0,NULL和ParmDataBytes));
cl_设备id*GPU设备;
GPUDevices=(cl_设备id*)malloc(ParmDataBytes);
CL_检查(clGetContextInfo(*GPUContext,CL_CONTEXT_设备,ParmDataBytes,GPUDevices,NULL));
//在第一个GPU设备上创建命令队列
*GPUCommandQueue=clCreateCommandQueue(*GPUContext,GPUDevices[0],0,&_err);
printf(“\n2-%i\n”,错误);
//用源代码创建OpenCL程序
*OpenCLProgram=clCreateProgramWithSource(*GPUContext,7,OpenCLSource,NULL,&u err);
printf(“\n3-%i\n”,错误);
CL_检查(clBuildProgram(*OpenCLProgram,0,
空,空,空,空);
cl_国际编码;
*cl_forward1=clCreateKernel(*OpenCLProgram,
“矢量添加”、&errcode);
printf(“\n7-%i\n”,错误代码);
返回GPU设备;
}
int main(int argc,字符**argv)
{
CLU上下文GPUContext;
cl_命令_队列GPUCommandQueue;
CLU程序OpenCLProgram;
cl_内核openclvectorad;
cl_设备id*GPU设备;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&openclvectorad,&OpenCLProgram);
//主机内存中的两个整数源向量
int n=5;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
整数输出[n];
int size_x=n*sizeof(x);
int size_y=n*sizeof(y);
int size_output=n*sizeof(output);//对于第二个forward1,此更改
cl_int_err;
//为源向量分配GPU内存,并从CPU内存初始化
cl_mem x_cl=clCreateBuffer(GPUContext,cl_mem_只读|
CL_MEM_COPY_HOST_PTR、尺寸_x、x和_err);
printf(“\n4-%i\n”,错误);
cl_mem y_cl=clCreateBuffer(GPUContext,cl_mem_只读|
CL_MEM_COPY_HOST_PTR、大小_y、y和_err);
printf(“\n5-%i\n”,错误);
//在GPU上分配输出内存
cl_mem total_cl=clCreateBuffer(仅限GPUContext、cl_mem_WRITE_、,
大小(输出、空值和错误);
printf(“\n6-%i\n”,错误);
//在下一步中,我们将GPU内存与内核参数相关联
clSetKernelArg(openclvectorad,0,sizeof(cl_mem),(void*)和total_cl);
clSetKernelArg(openclvectorad,1,sizeof(cl_mem),(void*)和x_cl);
clSetKernelArg(openclvectorad,2,sizeof(cl_mem),(void*)和y_cl);
//7.启动OpenCL内核
大小\u t本地工作大小[1],全局工作大小[1];
//localWorkSize=;
globalWorkSize[0]=n;
//在GPU上启动内核
CL_检查(ClenqueueEndRangeKernel(GPUCommandQueue,OpenCLVectorAdd,1,NULL,globalWorkSize,NULL,0,NULL,NULL));
//将GPU内存中的输出复制回CPU内存
//float*h_C=(float*)malloc(大小输出);
CL_检查(clEnqueueReadBuffer(GPUCommandQueue,
总cl,cl,真,0,大小输出,
输出,0,NULL,NULL));
对于(inti=0;i
我修正并改动了一些小地方,所以下面这段代码现在应该可以正常工作了。主要改动:free() 没有返回值,不能放进 CL_CHECK 里;另外,读回结果时目标缓冲区的大小必须与请求读取的字节数一致,否则会越界写入并破坏内存,导致后面的清理调用崩溃。
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
/*
 * CL_CHECK(expr) - run an OpenCL call whose return value is a cl_int status.
 *
 * The expression is evaluated exactly once. Any status other than CL_SUCCESS
 * is reported on stderr, together with the text of the expression, and the
 * process is terminated with abort(). A successful call has no further effect.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement.
 */
#define CL_CHECK(_expr) \
    do { \
        cl_int cl_check_status_ = (_expr); \
        if (cl_check_status_ != CL_SUCCESS) { \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)cl_check_status_); \
            abort(); \
        } \
    } while (0)
/*
 * CL_CHECK_ERR(expr) - wrap an OpenCL call that returns an object and reports
 * its status through a cl_int out-parameter.
 *
 * The macro declares a local `cl_int _err`, so the wrapped call must pass
 * `&_err` as its error argument, for example:
 *
 *     cl_mem buf = CL_CHECK_ERR(clCreateBuffer(ctx, flags, size, NULL, &_err));
 *
 * _err starts as CL_INVALID_VALUE, so an expression that never writes to it
 * is treated as a failure. On failure the expression text and the code are
 * printed to stderr and abort() is called; otherwise the value of expr is the
 * value of the macro.
 *
 * Relies on the GNU statement-expression extension. `__typeof__` is used
 * instead of `typeof` because the plain spelling is not a keyword when
 * compiling with -std=c99 / -std=c11.
 */
#define CL_CHECK_ERR(_expr) \
    ({ \
        cl_int _err = CL_INVALID_VALUE; \
        __typeof__(_expr) _ret = (_expr); \
        if (_err != CL_SUCCESS) { \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
            abort(); \
        } \
        _ret; \
    })
/*
 * OpenCL C source of the VectorAdd kernel, kept as seven separate strings.
 * init_opencl passes this array, with a count of 7, to
 * clCreateProgramWithSource.
 *
 * The two entries that carry a `//` comment end in '\n' so that the comment
 * stops there instead of running on into the following entry.
 */
const char *OpenCLSource[] = {
    /* signature: c = destination, a and b = operands */
    "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
    "{",

    /* one work-item per element */
    " // Index of the elements to add \n",
    " unsigned int n = get_global_id(0);",

    /* element-wise sum */
    " // Sum the n’th element of vectors a and b and store in c \n",
    " c[n] = a[n] + b[n];",

    "}"
};
/*
 * Build every OpenCL object needed to run the VectorAdd kernel.
 *
 * Out-parameters (all must be non-NULL):
 *   GPUContext      - receives a context created for CL_DEVICE_TYPE_GPU
 *   GPUCommandQueue - receives an in-order queue on the context's first device
 *   cl_forward1     - receives the "VectorAdd" kernel
 *   OpenCLProgram   - receives the program built from OpenCLSource
 *
 * Returns the malloc'ed list of devices that belong to the context. The
 * caller owns it and must release it with free().
 *
 * Every OpenCL status is printed as before and is now also checked: the
 * function aborts through CL_CHECK on the first failure instead of carrying
 * on with invalid handles. It also aborts if the device list is empty or
 * cannot be allocated.
 */
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
    /* Deliberately not called "_err": CL_CHECK declares its own variable of
     * that name, so CL_CHECK(_err) would initialize it from itself. */
    cl_int status;

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
    *GPUContext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    printf("\nclCreateContextFromType:%i\n", status);
    CL_CHECK(status);

    // Get the list of GPU devices associated with this context:
    // first query the size in bytes, then fetch the ids themselves.
    size_t ParmDataBytes = 0;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    if (ParmDataBytes < sizeof(cl_device_id)) {
        fprintf(stderr, "OpenCL Error: the context reports no devices\n");
        abort();
    }
    cl_device_id *GPUDevices = malloc(ParmDataBytes);
    if (GPUDevices == NULL) {
        fprintf(stderr, "Out of memory while allocating the device list\n");
        abort();
    }
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));

    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &status);
    printf("\nclCreateCommandQueue:%i\n", status);
    CL_CHECK(status);

    // Create OpenCL program with source code. The string count is derived
    // from the array so it cannot drift when the kernel text is edited.
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext,
                                               (cl_uint)(sizeof OpenCLSource / sizeof OpenCLSource[0]),
                                               OpenCLSource, NULL, &status);
    printf("\nclCreateProgramWithSource:%i\n", status);
    CL_CHECK(status);

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, NULL, NULL, NULL, NULL));

    *cl_forward1 = clCreateKernel(*OpenCLProgram, "VectorAdd", &status);
    printf("\nclCreateKernel:%i\n", status);
    CL_CHECK(status);

    return GPUDevices;
}
/*
 * Adds two 5-element integer vectors on the GPU and prints the result.
 *
 * Steps: obtain context/queue/program/kernel from init_opencl, upload x and
 * y, run VectorAdd with one work-item per element, read the sums back into
 * `output`, print them and release every resource.
 *
 * Any failing OpenCL call terminates the program through CL_CHECK.
 * Returns 0 on success.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id *GPUDevices = init_opencl(&GPUContext, &GPUCommandQueue, &OpenCLVectorAdd, &OpenCLProgram);

    // Two integer source vectors and the result vector in host memory
    enum { VECTOR_LENGTH = 5 };
    int x[VECTOR_LENGTH] = {1, 2, 4, 6, 8};
    int y[VECTOR_LENGTH] = {1, 2, 4, 6, 8};
    int output[VECTOR_LENGTH] = {0};

    /* sizeof on an array already yields its total size in bytes. The former
     * n*sizeof(array) was n times too large, which made the runtime copy
     * from beyond the end of x and y. */
    const size_t size_x = sizeof x;
    const size_t size_y = sizeof y;
    const size_t size_output = sizeof output;

    /* Not called "_err": CL_CHECK declares its own variable of that name. */
    cl_int status;

    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 size_x, x, &status);
    printf("\nclCreateBuffer:%i\n", status);
    CL_CHECK(status);

    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 size_y, y, &status);
    printf("\nclCreateBuffer:%i\n", status);
    CL_CHECK(status);

    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, size_output, NULL, &status);
    printf("\nclCreateBuffer:%i\n", status);
    CL_CHECK(status);

    // Bind the buffers to the kernel arguments (c, a, b in kernel order)
    CL_CHECK(clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), &total_cl));
    CL_CHECK(clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), &x_cl));
    CL_CHECK(clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), &y_cl));

    // Launch the Kernel on the GPU, one work-item per vector element
    size_t globalWorkSize[1] = { VECTOR_LENGTH };
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));

    /* Copy the result back. The read is blocking (CL_TRUE) and the queue is
     * in-order, so it returns only after the kernel has finished and the data
     * is in `output`; no extra clFinish is needed. The byte count now matches
     * the destination exactly. */
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, total_cl, CL_TRUE, 0, size_output,
                                 output, 0, NULL, NULL));

    for (int i = 0; i < VECTOR_LENGTH; i++) {
        printf("\noutput[%i]=%i", i, output[i]);
    }

    // Cleanup, in reverse order of creation. free() returns void and
    // therefore must not be wrapped in CL_CHECK.
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    free(GPUDevices);
    return 0;
}
#包括
#包括
#包括
#包括//opencl
#定义CL_检查(_expr)\
做{\
cl_int_err=_expr\
如果(\u err==CL\u SUCCESS)\
中断\
fprintf(stderr,“OpenCL错误:“%s”返回了%d!\n”,#u expr,