OpenCL：将cl_float2复制到常量内存在Nvidia平台上不起作用。当我尝试将一组cl_float2值复制到常量内存时，它在英伟达（Nvidia）平台上无法正常工作，y分量似乎为零。对于AMD和Intel平台，我没有遇到这个问题。 // Host c_Quadrature_Filter_1 = clCreateBuffer(context, CL_MEM_READ_ONLY, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), NULL, &createBufferErrorQuadratureFilter1); cl_float2* filter_temp = (cl_float2*)malloc(FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2)); cl_float2 test; test.s[0] = 3.0f; test.s[1] = 13.0f; for (int xx = 0; xx < FILTER_SIZE; xx++) { for (int yy = 0; yy < FILTER_SIZE; yy++) { filter_temp[xx + yy * FILTER_SIZE].s[0] = test.s[0]; filter_temp[xx + yy * FILTER_SIZE].s[1] = test.s[1]; } } clEnqueueWriteBuffer(commandQueue, c_Quadrature_Filter_1, CL_TRUE, 0, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), filter_temp, 0, NULL, NULL); free(filter_temp); //Device __kernel(__global float2* Filter_Response, __constant float2* c_Quadrature_Filter_1, __private int DATA_W, __private int DATA_H, __private int DATA_D) { int x = get_global_id(0); int y = get_global_id(1); int z = get_global_id(2); Filter_Response[Calculate3DIndex(x,y,z,DATA_W,DATA_H)].y = c_Quadrature_Filter_1[0].y; }_Opencl

OpenCL：将cl_float2复制到常量内存在Nvidia平台上不起作用

当我尝试将一组cl_float2值复制到常量内存时，它在英伟达（Nvidia）平台上无法正常工作，y分量似乎为零。对于AMD和Intel平台，我没有遇到这个问题。

// Host
c_Quadrature_Filter_1 = clCreateBuffer(context, CL_MEM_READ_ONLY, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), NULL, &createBufferErrorQuadratureFilter1);
cl_float2* filter_temp = (cl_float2*)malloc(FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2));
cl_float2 test;
test.s[0] = 3.0f;
test.s[1] = 13.0f;
for (int xx = 0; xx < FILTER_SIZE; xx++)
{
    for (int yy = 0; yy < FILTER_SIZE; yy++)
    {
        filter_temp[xx + yy * FILTER_SIZE].s[0] = test.s[0];
        filter_temp[xx + yy * FILTER_SIZE].s[1] = test.s[1];
    }
}
clEnqueueWriteBuffer(commandQueue, c_Quadrature_Filter_1, CL_TRUE, 0, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), filter_temp, 0, NULL, NULL);
free(filter_temp);

//Device
__kernel(__global float2* Filter_Response, __constant float2* c_Quadrature_Filter_1, __private int DATA_W, __private int DATA_H, __private int DATA_D)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);
    Filter_Response[Calculate3DIndex(x,y,z,DATA_W,DATA_H)].y = c_Quadrature_Filter_1[0].y;
}

opencl

OpenCL：将cl_float2复制到常量内存在Nvidia平台上不起作用。当我尝试将一组cl_float2值复制到常量内存时，它在英伟达（Nvidia）平台上无法正常工作，y分量似乎为零。对于AMD和Intel平台，我没有遇到这个问题。 // Host c_Quadrature_Filter_1 = clCreateBuffer(context, CL_MEM_READ_ONLY, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), NULL, &createBufferErrorQuadratureFilter1); cl_float2* filter_temp = (cl_float2*)malloc(FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2)); cl_float2 test; test.s[0] = 3.0f; test.s[1] = 13.0f; for (int xx = 0; xx < FILTER_SIZE; xx++) { for (int yy = 0; yy < FILTER_SIZE; yy++) { filter_temp[xx + yy * FILTER_SIZE].s[0] = test.s[0]; filter_temp[xx + yy * FILTER_SIZE].s[1] = test.s[1]; } } clEnqueueWriteBuffer(commandQueue, c_Quadrature_Filter_1, CL_TRUE, 0, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), filter_temp, 0, NULL, NULL); free(filter_temp); //Device __kernel(__global float2* Filter_Response, __constant float2* c_Quadrature_Filter_1, __private int DATA_W, __private int DATA_H, __private int DATA_D) { int x = get_global_id(0); int y = get_global_id(1); int z = get_global_id(2); Filter_Response[Calculate3DIndex(x,y,z,DATA_W,DATA_H)].y = c_Quadrature_Filter_1[0].y; },opencl,Opencl,我可能错了，但我从来没有让向量类型的数学在GTX680和GTX260上工作过。正因为如此，一些AMD库（如FFT）不适用于NVIDIA卡，但适用于AMD和Intel硬件。英伟达似乎在OpenCL方面落后了另一件需要注意的事情是OpenCL设备最佳利用资源的首选向量长度。例如，我的ATI7990的浮点首选向量长度为1，而我的i5的向量长度为8。因此，为了充分利用i5，我将使用float8来最大限度地利用SIMD 要检查首选向量长度，请使用clGetDeviceInfo并传入CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT选项编辑： O

我可能错了，但我从来没有让向量类型的数学在GTX680和GTX260上工作过。正因为如此，一些AMD库（如FFT）不适用于NVIDIA卡，但适用于AMD和Intel硬件。英伟达似乎在OpenCL方面落后了

另一件需要注意的事情是OpenCL设备最佳利用资源的首选向量长度。例如，我的ATI7990的浮点首选向量长度为1，而我的i5的向量长度为8。因此，为了充分利用i5，我将使用float8来最大限度地利用SIMD。

要检查首选向量长度，请使用clGetDeviceInfo并传入CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT选项。

编辑：

Oops，显示了本机向量宽度，但它与首选向量宽度相同

您可以尝试使用float4并将最后两个分量设置为零吗？这可能是英伟达实现中的一个bug，我看不出有什么问题。一切看起来都很好。重新检查内核外部的输入和输出。要么是这样，要么是驱动程序错误。