Memory OpenCL矩阵乘法
我是OpenCL的初学者。我一直在尝试编写一个矩阵乘法程序。程序可以运行，只是输出的C数组里全是垃圾值，我无法纠正这个错误。任何帮助都将不胜感激。下面是主机代码和内核代码。（标签：memory, opencl, matrix-multiplication）
#include <CL/cl.h>
#include <iostream>
#include <cstdio>
#include <fstream>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
using namespace std;
#define SUCCESS 0
#define FAILURE 1

/**
 * Reads the entire contents of a file into a string.
 *
 * The file is opened in binary mode, so the bytes are copied verbatim
 * (embedded NUL bytes included) and `s` ends up with exactly the file's size.
 *
 * @param filename Path of the file to read (e.g. the OpenCL kernel source).
 * @param s        Receives the file contents on success; left untouched on
 *                 failure.
 * @return SUCCESS (0) if the whole file was read, FAILURE (1) if the file
 *         could not be opened, sized, or read completely.
 */
int convertToString(const char *filename, std::string &s)
{
    if (filename == NULL)
    {
        std::cout << "Error: no file name given" << std::endl;
        return FAILURE;
    }

    std::ifstream file(filename, std::ios::in | std::ios::binary);
    if (!file.is_open())
    {
        std::cout << "Error: failed to open file\n:" << filename << std::endl;
        return FAILURE;
    }

    // Determine the file size by seeking to the end; tellg() yields -1 on
    // failure, which must not be turned into a huge unsigned length.
    file.seekg(0, std::ios::end);
    const std::streamoff length = file.tellg();
    if (length < 0)
    {
        std::cout << "Error: failed to determine size of file\n:" << filename
                  << std::endl;
        return FAILURE;
    }
    file.seekg(0, std::ios::beg);

    // Read straight into a std::string buffer: no manual new[]/delete[] and
    // no truncation at the first NUL byte.
    std::string contents(static_cast<size_t>(length), '\0');
    if (length > 0 &&
        !file.read(&contents[0], static_cast<std::streamsize>(length)))
    {
        std::cout << "Error: failed to read file\n:" << filename << std::endl;
        return FAILURE;
    }

    s.swap(contents);
    return SUCCESS;
}
int main()
{
cl_uint status;
cl_int *error;
int A[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
int B[9] = {2, 2, 2, 2, 2, 2, 2, 2, 2};
int C[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
// Setting up platforms
cl_platform_id platform = NULL;
cl_uint numPlatforms = 0;
// Getting no of platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (status != CL_SUCCESS)
{
cout << "\nUnable to query platforms";
return 0;
}
// Get the platform
if (numPlatforms > 0)
{
cl_platform_id*platforms=
cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
}
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status =
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, devices, &numDevices);
if (numDevices == 0)
{
cout << "No GPU device available! Choosing CPU.\n";
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, devices,
&numDevices);
devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices,
devices, NULL);
}
else
{
devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices,
devices, NULL);
if (status == 0)
{
cout << "Device error!";
return 0;
}
}
// Creating contexts
cl_context context =
clCreateContext(NULL, 1, devices, NULL, NULL, (cl_int *)status);
if (status != CL_SUCCESS)
{
cout << status;
}
// Creating command queues
cl_command_queue command =
clCreateCommandQueue(context, devices[0], 0, NULL);
// if(error!=CL_SUCCESS)
//{
// cout<<error;
//}
// Creating buffers
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
3 * 3 * sizeof(int), NULL, NULL);
status = clEnqueueWriteBuffer(command, bufferA, CL_TRUE, 0, 9 * sizeof(int),
(void *)A, 0, NULL, NULL);
status = clEnqueueWriteBuffer(command, bufferB, CL_TRUE, 0, 9 * sizeof(int),
(void *)B, 0, NULL, NULL);
// status=clEnqueueReadBuffer(command,bufferA,CL_TRUE,0,9*sizeof(int),(void*)C,0,NULL,NULL);
const char *filename = "kernel.cl";
string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = {strlen(source)};
cl_program program =
clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, numDevices, 0, NULL, NULL, NULL);
cl_kernel myKernel = clCreateKernel(program, "multiply", NULL);
// Setting kernel arguments
clSetKernelArg(myKernel, 0, sizeof(cl_mem), &bufferC);
clSetKernelArg(myKernel, 1, sizeof(cl_mem), &bufferA);
clSetKernelArg(myKernel, 2, sizeof(cl_mem), &bufferB);
size_t localws[2] = {9, 9};
size_t globalws[2] = {3, 3};
status = clEnqueueNDRangeKernel(command, myKernel, 2, NULL, globalws,
localws, 0, NULL, NULL);
status = clEnqueueReadBuffer(command, bufferC, CL_TRUE, 0, 9 * sizeof(int),
(void *)C, 0, NULL, NULL);
for (int i = 0; i < 9; i++) cout << C[i] << " ";
status = clReleaseKernel(myKernel); // Release kernel.
status = clReleaseProgram(program); // Release program object.
status = clReleaseMemObject(bufferA); // Release mem object.
status = clReleaseMemObject(bufferB);
status = clReleaseMemObject(bufferC);
status = clReleaseCommandQueue(command); // Release Command queue.
status = clReleaseContext(context); // Release context.
}
#包括
#包括
#包括
#包括
#包括
#包括
#包括
使用名称空间std;
#定义成功0
#定义故障1
//函数将文件名转换为字符串
int convertToString(const char*filename,std::string&s)
{
大小;
char*str;
std::fstream f(文件名,(std::fstream::in | std::fstream::binary));
如果(f.是开着的())
{
文件大小;
f、 seekg(0,标准::流::结束);
size=fileSize=(size_t)f.tellg();
f、 seekg(0,std::fstream::beg);
str=新字符[大小+1];
如果(!str)
{
f、 close();
返回0;
}
f、 读取(str,文件大小);
f、 close();
str[size]='\0';
s=str;
删除[]str;
返回0;
}
cout正如@Marco13已经指出的那样,内核有很多问题
当通过类似 clcc 这样的工具运行此内核时，您可以看到有许多编译错误：
> clcc matmul.cl
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 2: error: identifier "_global" is undefined
_global int inputB)
^
"/tmp/OCLu7FyFF.cl", line 2: error: invalid combination of type specifiers
_global int inputB)
^
6 errors detected in the compilation of "/tmp/OCLu7FyFF.cl".
像 clcc 这样的工具有助于尽早发现错误。大多数供应商也有自己的独立内核编译器/检查器：例如，英特尔有自己的内核构建工具，AMD有一个静态内核分析器。另一个选项是直接在主机代码中获取内核编译错误：当 clBuildProgram 返回 CL_BUILD_PROGRAM_FAILURE 之后，调用 clGetProgramBuildInfo 来检索编译器输出。
一旦修复了这些编译错误，您的内核似乎仍然达不到预期的效果：如前所述，输入和输出应该是指针，因为您向内核传递的是缓冲区。此外，输入和输出数组的索引也不正确：for循环中的 inputA[row*3+1] 应该是 inputA[row*3+i]（是 i 而不是 1）。将结果保存到 outputC 时，应该写入 outputC[row*3+col]（是 row*3 而不是 row+3）。
我还没有详细看过宿主代码,但我至少要确保,尤其是刚开始使用OpenCL时,总是检查每个返回代码和错误。这将为您节省大量时间和挫折
最后，如果您想通过动手的方式快速开始学习OpenCL，我强烈建议您参加Simon McIntosh-Smith和Tom Deakin提供的开源培训。培训时间不长，非常实用，并提供了许多有用的见解。逐步优化矩阵乘法正是其中展示的用例之一。
这很难说是正确的内核代码：参数至少应该是指针——即 __global int* outputC 而不是 __global int outputC。这个内核甚至不应该通过编译，但您不会注意到这一点：您应该明确检查 status = clBuildProgram(...) 返回的“status”代码是不是 CL_SUCCESS！
我对OpenCL了解不够，但你有3个错误的包含项，它们应该是 <cstdlib>、<cassert> 和 <cstring>。
是的，它不是 CL_SUCCESS。我也尝试了一个简单的hello world程序，它仍然没有返回 CL_SUCCESS。我该如何调试它？
你为什么要自己编写，而不是使用已经包含矩阵乘法的库，如clMath或ArrayFire（我正在参与开发）？只是想知道这是一个学习练习，还是您只是在OpenCL中寻找好的矩阵乘法方案 :-)
这是一个学习练习。我尝试过按一个简单的hello world程序来修改代码，仍然不起作用。我也更改了内核代码。
非常感谢。我尝试过打印构建日志，但没有打印任何内容 :(
不客气。如果这回答了你的问题，请你（如果允许的话）将此标记为答案，这样它就不会再显示为未回答的问题了。谢谢！
> clcc matmul.cl
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
__kernel void multiply(_global int outputC, _global int inputA,
^
"/tmp/OCLu7FyFF.cl", line 2: error: identifier "_global" is undefined
_global int inputB)
^
"/tmp/OCLu7FyFF.cl", line 2: error: invalid combination of type specifiers
_global int inputB)
^
6 errors detected in the compilation of "/tmp/OCLu7FyFF.cl".