Why won't my OpenCL kernel execute with specific arguments?


I have an OpenCL kernel that runs in JOCL and passes all of my JUnit tests. I ported my code to C++ so I could profile the kernel under the same conditions. The driver works in every case except one. It runs perfectly fine in JOCL, so I believe something is wrong in the C++ code. My code is below, and I have audited it to death. I would appreciate it if anyone could help me figure out what the problem is.

The driver code works fine when arg1 and arg2 are 8192 and arg3 is 512. It also works fine with arg1 and arg2 as 512 and arg3 as 8192. arg4 is always just 1, which sets the kernel to real numbers. When I set arg1 and arg2 to 262144 and arg3 to 16, it executes, reports no errors, and does not segfault, but the kernel never ends up changing the data. Note that in all of the cases above, arg1 * arg3 equals 2^22, so I believe I am allocating the same number of floats in every case. I am stumped. I can't get OpenCL to tell me what is going wrong :(
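A quick size check (my own sketch, not part of the original post, with the cases labelled (primitivesPerDataFrame, nInFramesThisCall) to match the call below) confirms that the main data buffer really is the same size every time; only the window buffer (arg1) changes, which turns out to matter, as the update below explains.

#include <cstdio>

int main()
{
    // (primitivesPerDataFrame, nInFramesThisCall) for the three cases described above;
    // windowSize equals the first value in each case
    long cases[][2] = { {8192, 512}, {512, 8192}, {262144, 16} };
    for (const auto &c : cases)
    {
        long dataFloats  = c[0] * c[1];                 // main data buffer, in floats
        long windowBytes = c[0] * (long)sizeof(float);  // window buffer, in bytes
        std::printf("data: %ld floats (%ld MiB), window: %ld KiB\n",
                    dataFloats,
                    dataFloats * (long)sizeof(float) / (1024 * 1024),
                    windowBytes / 1024);
    }
    // The data buffer is 4194304 floats (2^22, 16 MiB) in every case; only the
    // window buffer changes, growing to 1 MiB in the failing 262144 case.
    return 0;
}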

UPDATE: I figured it out! The problem was in my kernel. I use constant memory. My Java code accounts for this and does text manipulation on the kernel source, so if the buffer size for arg 2 is greater than 16384 it changes __constant to __global. I should have known this, but I forgot.
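The kernel source itself is not in the post, but the swap amounts to something like the following. The signature is reconstructed from the clSetKernelArg calls in the listing below; the parameter names are my guesses.

// Original form: only valid while windowData fits in constant memory
// (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, typically 64 KB on NVIDIA hardware,
// i.e. 16384 floats)
__kernel void window(__global float *inData,
                     __constant float *windowData,
                     int windowSize,
                     int primitivesPerDataFrame,
                     int nInFramesThisCall,
                     int realOrComplex)
{
    /* ... kernel body ... */
}

// What the JOCL path generates once the window buffer exceeds 16384 elements:
// the only change is the address-space qualifier on windowData.
__kernel void window(__global float *inData,
                     __global float *windowData,
                     int windowSize,
                     int primitivesPerDataFrame,
                     int nInFramesThisCall,
                     int realOrComplex)
{
    /* ... kernel body unchanged ... */
}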

void HelperFunctions::callKernel(int windowSize, int primitivesPerDataFrame, int nInFramesThisCall, int realOrComplex)
{
// OpenCL Vars
cl_platform_id platform;       // OpenCL platform
cl_device_id device;           // OpenCL device
cl_context gpuContext;         // OpenCL context
cl_command_queue commandQueue; // OpenCL command queue
cl_program clProgram;           // OpenCL program
cl_kernel clkernel;             // OpenCL kernel
void *dataHostBuffer;        // Host buffer
void *windowDataHostBuffer;        // Host buffer
cl_mem inData;   // OpenCL device buffer
cl_mem windowData;  // OpenCL device source buffer
size_t szKernelLength;        // Byte size of kernel code
cl_int errCode;                // Error code var

long gridX = 256;
long gridY = 16384;
long gridZ = 1;
size_t global_work_size[] = {gridX, gridY, gridZ};
size_t local_work_size[] = {gridX, 1, 1};
const char* cSourceCL = NULL;     // Buffer to hold source for compilation

// Allocate and initialize host arrays
dataHostBuffer = (void *)malloc(sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall);
windowDataHostBuffer = (void *)malloc(sizeof(cl_float) * windowSize);

//Populate the data buffers
dataHostBuffer = generateRampData(primitivesPerDataFrame * nInFramesThisCall);

windowDataHostBuffer = blackman(windowSize);

//Get an OpenCL platform
errCode = clGetPlatformIDs(1, &platform, NULL);
cout << "Error Code: " << errCode << endl;

//Get the devices
errCode = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
cout << "Error Code: " << errCode << endl;

//Create the context
gpuContext = clCreateContext(0, 1, &device, NULL, NULL, &errCode);
cout << "Error Code: " << errCode << endl;

// Create a command-queue
commandQueue = clCreateCommandQueue(gpuContext, device, 0, &errCode);

// Read the OpenCL kernel in from source file
cSourceCL = oclLoadProgSource("/home/djkasht/workspaceBlueprint/bp/bp-trunk/bundles/CopperShark/src/coppershark/dsp/blocks/opencl/dsp/window/Window.cl", "", &szKernelLength);

szKernelLength = strlen(cSourceCL);
// Create the program
clProgram = clCreateProgramWithSource(gpuContext, 1, (const char **)&cSourceCL, &szKernelLength, &errCode);
cout << "Error Code: " << errCode << endl;

// Build the program
errCode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
cout << "Error Code: " << errCode << endl;

size_t log_size = 1000000 * sizeof(char);
char build_log[log_size];
size_t len;
errCode = clGetProgramBuildInfo(clProgram, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, &len);
cout << build_log << endl;

// Create the kernel
clkernel = clCreateKernel(clProgram, "window", &errCode);
cout << "Error Code: " << errCode << endl;

// Allocate the OpenCL buffer memory objects
inData = clCreateBuffer(gpuContext, CL_MEM_READ_WRITE, sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall, NULL, &errCode);
cout << "Error Code: " << errCode << endl;
windowData = clCreateBuffer(gpuContext, CL_MEM_READ_ONLY, sizeof(cl_float) * windowSize, NULL, &errCode);
cout << "Error Code: " << errCode << endl;

// Set the Argument values
errCode = clSetKernelArg(clkernel, 0, sizeof(cl_mem), (void*)&inData);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 1, sizeof(cl_mem), (void*)&windowData);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 2, sizeof(cl_int), (void*)&windowSize);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 3, sizeof(cl_int), (void*)&primitivesPerDataFrame);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 4, sizeof(cl_int), (void*)&nInFramesThisCall);
cout << "Error Code: " << errCode << endl;
errCode = clSetKernelArg(clkernel, 5, sizeof(cl_int), (void*)&realOrComplex);
cout << "Error Code: " << errCode << endl;

// Asynchronous write of data to GPU device
errCode = clEnqueueWriteBuffer(commandQueue, inData, CL_FALSE, 0, sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall, dataHostBuffer, 0, NULL, NULL);
cout << "Error Code: " << errCode << endl;

// Asynchronous write of the window data to GPU device
errCode = clEnqueueWriteBuffer(commandQueue, windowData, CL_FALSE, 0, sizeof(cl_float) * windowSize, windowDataHostBuffer, 0, NULL, NULL);
cout << "Error Code: " << errCode << endl;

errCode = clEnqueueNDRangeKernel(commandQueue, clkernel, 3, NULL, &(global_work_size[0]), &(local_work_size[0]), 0, NULL, NULL);
cout << "Error Code: " << errCode << endl;

void* dataHostBuffer2 = (void *)malloc(sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall);
errCode = clEnqueueReadBuffer(commandQueue, inData, CL_TRUE, 0, sizeof(cl_float) * primitivesPerDataFrame * nInFramesThisCall, dataHostBuffer2, 0, NULL, NULL);
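For completeness, a host-side guard along these lines (my own addition, reusing device, windowSize, and errCode from the listing above) would have flagged the oversized window buffer at runtime instead of letting the kernel fail silently:

// Sketch: check the device's constant-memory limit before relying on a
// __constant kernel argument for the window data.
cl_ulong maxConstBytes = 0;
errCode = clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                          sizeof(maxConstBytes), &maxConstBytes, NULL);
cout << "Error Code: " << errCode << endl;

if (sizeof(cl_float) * windowSize > maxConstBytes)
{
    // The window no longer fits in constant memory; the kernel needs the
    // __global qualifier here, as the JOCL code path already arranges.
    cout << "Window buffer (" << sizeof(cl_float) * windowSize
         << " bytes) exceeds CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE ("
         << maxConstBytes << " bytes)" << endl;
}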