Parallel processing OpenCL矩阵乘法排队/缓冲区读取
我正在用OpenCL编写一个基本的矩阵乘法程序。我认为问题出在命令排队和/或缓冲区读取上,因为得到的结果矩阵C完全不正确,而且读回后的矩阵A和B的第一行也不正确。我是OpenCL的新手,已经为此绞尽脑汁很长时间了,也许这里有人能提示一下我哪里出错了。
标签:parallel-processing, opencl, matrix-multiplication
主机代码:
#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>
/*
Matrix dimensions, fixed at compile time. A is numRowsA x numColsA and
B is numRowsB x numColsB.
*/
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
/* The product C = A * B takes its row count from A and its column count from B */
#define numRowsC numRowsA
#define numColsC numColsB
/* The declarations and definitions below use string/cout/endl unqualified */
using namespace std;
/* Function declarations (definitions follow main) */
/* Reports err under the label name and exits when err is not CL_SUCCESS */
inline void checkErr(cl_int err, string name);
/* Fills numIndices floats starting at matrix with rand()-based values */
void initMatrix (float* matrix, int numIndices);
/* Prints numIndices floats from matrix under displayName, rowSize per line */
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize);
//*************
// Main Program
//*************
int main(int argc, char* argv[]) {
/* Check for valid matrix sizes */
if (numColsA != numRowsB) {
cout << "ERROR: Invalid matrix dimensions." << endl;
} else {
srand(2013); // Set random seed
/* Allocate memory for matrices A, B, and C */
unsigned int sizeA = numRowsA * numColsA;
unsigned int sizeB = numRowsB * numColsB;
unsigned int sizeC = numRowsC * numColsC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/*
Allocate memoryA/memoryB/memoryC size blocks of bytes
(cast from void*)
*/
float* blockA = (float*) malloc(memoryA);
float* blockB = (float*) malloc(memoryB);
float* blockC = (float*) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(blockA, sizeA);
initMatrix(blockB, sizeB);
/* Display matrices A and B */
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
cl_int err; // Error code
string platformVendor; // Platform vendor
/* Create list of platforms */
cl::vector < cl::Platform > platformList;
cl::Platform::get(&platformList);
/*
Display potential Platform list generation error. If the
platform list size does not equal 0, CL_SUCCESS (0) is
sent to the function. If the platform list size does
equal 0, -1 is sent to the function.
*/
checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
"Platform");
/*
Replace empty value of platformVendor with device vendor
name
*/
platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
&platformVendor);
/* Properties for Context constructor (Use unknown) */
cl_context_properties cprops[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties) (platformList[0]) (),
0
};
/* Create context */
cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
&err);
/* Display potential Context constructor error */
checkErr(err, "Context");
/* Create buffer for matrix A */
cl::Buffer deviceMemA(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);
/* Create buffer for matrix B */
cl::Buffer deviceMemB(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);
/* Create buffer for matrix C */
cl::Buffer deviceMemC(context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);
/* Create buffer for row (A) and col (C) */
cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numRowsA, &err);
cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
(void *) numColsC, &err);
/* Display potential Buffer constructor error */
checkErr(err, "Buffers");
/* Get list of devices */
cl::vector<cl::Device> devices =
context.getInfo<CL_CONTEXT_DEVICES>();
/* Check for at least one device, if not throw error */
checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");
/* Read input from .cl file */
ifstream file("matrixMult1_kernels.cl");
/* Check for potential problem opening .cl input file */
checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");
/* Store file contents in a string */
string prog(istreambuf_iterator<char>(file),
(istreambuf_iterator<char>()));
/* Create source object */
cl::Program::Sources source(1, make_pair(prog.c_str(),
prog.length()+1));
/* Create program for given context and source */
cl::Program program(context, source);
err = program.build(devices, ""); // Check for build error
/* Display potential program build error */
checkErr(err, "Program Build");
/* Create kernel */
cl::Kernel kernel(program, "matrixMul", &err);
/* Display potential Kernel constructor error */
checkErr(err, "Kernel");
/*
Set matrixMul arguments, error checking after each
argument
*/
err = kernel.setArg(0, deviceMemA);
checkErr(err, "Arg0");
err = kernel.setArg(1, deviceMemB);
checkErr(err, "Arg1");
err = kernel.setArg(2, deviceMemC);
checkErr(err, "Arg2");
err = kernel.setArg(3, rowA);
checkErr(err, "Arg3");
err = kernel.setArg(4, colC);
checkErr(err, "Arg4");
/* Create command queue */
cl::CommandQueue queue(context, devices[0], 0, &err);
/* Display potential CommandQueue constructor error */
checkErr(err, "Command Queue");
/* Create event object */
cl::Event event;
cl::NDRange global(3, 3);
cl::NDRange local(1, 1);
/* Enqueue the kernel */
err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
NULL, &event);
/* Display potential enqueueing error */
checkErr(err, "Enqueue");
/* Wait until kernel has completed execution before continuing */
event.wait();
/* Read kernel result back into host memory */
err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
blockC, NULL, &event);
checkErr(err, "C");
err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
blockA, NULL, &event);
err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
blockB, NULL, &event);
/* Display potential kernel read error */
checkErr(err, "Read Buffer");
/* Display matrices */
cout << endl;
cout << "After:" << endl;
printMatrix("Matrix A", blockA, sizeA, numColsA);
printMatrix("Matrix B", blockB, sizeB, numColsB);
printMatrix("Matrix C", blockC, sizeC, numColsC);
/* Free up memory */
free(blockA);
free(blockB);
free(blockC);
}
}
//--------------------------------------------------------------------
// checkErr - Stops the program when an OpenCL call reports failure.
//
// PRE: err is the status code produced by an OpenCL call; name is a
// label identifying the step that produced it.
// POST: When err equals CL_SUCCESS nothing happens. Otherwise the
// line "ERROR: <name> (<err>)" is written to standard error
// and the process exits with status EXIT_FAILURE.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {
    /* Success: nothing to report */
    if (err == CL_SUCCESS) {
        return;
    }

    /* Report where the failure happened and the raw error code */
    std::cerr << "ERROR: " << name << " (" << err << ")"
              << std::endl;

    exit(EXIT_FAILURE); // Terminates the process with a failure status
}
//--------------------------------------------------------------------
// initMatrix - Fills a matrix with pseudo-random float values.
//
// PRE: matrix points to storage for at least numIndices floats;
// numIndices is the number of elements to fill.
// POST: Each of the first numIndices elements holds
// rand() / (float) RAND_MAX, drawn in index order. Nothing is
// written when numIndices is zero or negative.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {
    /* Divisor that scales rand() by its maximum value */
    const float scale = (float) RAND_MAX;

    /* Walk the storage front to back, one rand() draw per element */
    float* cursor = matrix;
    for (int remaining = numIndices; remaining > 0; --remaining) {
        *cursor = rand() / scale;
        ++cursor;
    }
}
//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE: displayName is the heading to print; matrix points to
// numIndices floats stored row by row (left-to-right,
// top-to-bottom); rowSize is the number of elements per row.
// POST: "\n<displayName>:" is written to cout, followed by the
// elements, rowSize per line, separated by " | ". The last
// line always ends with a line break, even when numIndices is
// not a multiple of rowSize. A rowSize of zero or less prints
// all elements on one line. Only the heading is printed when
// matrix is null or numIndices is zero or negative.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
int rowSize) {
    /* Output display name of matrix */
    cout << "\n" << displayName << ":" << endl;

    /* Nothing to dereference without storage */
    if (!matrix) {
        return;
    }

    /*
        A non-positive rowSize would make the modulo below undefined
        (division by zero), so fall back to a single row.
    */
    const int perRow = (rowSize > 0) ? rowSize : numIndices;

    /* Loop through each index of the matrix */
    for (int i = 0; i < numIndices; i++) {
        cout << matrix[i]; // Display value at this index

        const bool endOfRow = ((i + 1) % perRow) == 0;
        const bool lastValue = (i + 1) == numIndices;

        /*
            Break the line after a full row and after the final
            value, so a partial last row does not end in a dangling
            separator.
        */
        if (endOfRow || lastValue) {
            cout << endl; // Line break
        } else {
            cout << " | "; // Index separator
        }
    }
}
\define\u NO\u STD\u VECTOR//使用cl::VECTOR而不是标准版本
#包括
#包括
#包括
#包括
#包括
#包括
#包括
/*定义的矩阵宽度/高度常数*/
#定义numRowsA 3
#定义numColsA 3
#定义numrowsb3
#定义numColsB 3
#定义numRowsC numRowsA
#定义numColsC numColsB
使用名称空间std;
/*函数声明*/
内联无效校验错误(cl_int err,字符串名称);
void initMatrix(浮点*矩阵,整数numIndices);
void printMatrix(字符串显示名、浮点*矩阵、整数numIndices、,
整数行大小);
//*************
//主程序
//*************
int main(int argc,char*argv[]){
/*检查有效的矩阵大小*/
if(numColsA!=numRowsB){
是否可以0?CL_成功:-1,“无设备”);
/*从.cl文件读取输入*/
ifstream文件(“matrixMult1_kernels.cl”);
/*检查打开.cl输入文件是否存在潜在问题*/
checkErr(file.is_open()?CL_SUCCESS:-1,“文件未打开”);
/*以字符串形式存储文件内容*/
字符串程序(istreambuf_迭代器(文件),
(istreambuf_迭代器());
/*创建源对象*/
cl::Program::Sources source(1,make_pair(prog.c_str()),
程序长度()+1));
/*为给定的上下文和源创建程序*/
cl::程序(上下文、源);
err=program.build(devices,“”;//检查生成错误
/*显示潜在的程序生成错误*/
checkErr(err,“程序构建”);
/*创建内核*/
cl::内核(程序,“matrixMul”&err);
/*显示潜在的内核构造函数错误*/
checkErr(err,内核);
/*
设置matrixMul参数,每次之后进行错误检查
论点
*/
err=kernel.setArg(0,设备内存);
checkErr(err,“Arg0”);
err=kernel.setArg(1,设备emb);
checkErr(err,“Arg1”);
err=kernel.setArg(2,设备emc);
checkErr(err,“Arg2”);
err=kernel.setArg(3,rowA);
checkErr(err,“Arg3”);
err=kernel.setArg(4,colC);
checkErr(err,“Arg4”);
/*创建命令队列*/
cl::CommandQueue队列(上下文、设备[0]、0和错误);
/*显示潜在的CommandQueue构造函数错误*/
checkErr(err,“命令队列”);
/*创建事件对象*/
cl:事件;
cl::NDRange global(3,3);
cl::NDRange local(1,1);
/*使内核排队*/
err=queue.enqueueNDRangeKernel(内核,2,全局,局部,
空值(&事件);
/*显示潜在排队错误*/
checker(err,“排队”);
/*等待内核完成执行后再继续*/
event.wait();
/*将内核结果读回主机内存*/
err=queue.enqueueReadBuffer(设备EMC,CL_TRUE,0,memoryC,
blockC、NULL和事件);
checker(err,“C”);
err=queue.enqueueReadBuffer(deviceMemA,CL_TRUE,0,sizeA,
blockA、NULL和event);
err=queue.enqueueReadBuffer(deviceMemB,CL_TRUE,0,sizeB,
blockB、NULL和事件);
/*显示潜在的内核读取错误*/
checkErr(err,“读取缓冲区”);
/*显示矩阵*/
输入矩阵A和B的数据没有被传递到设备。创建缓冲区时:
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)
blockA参数被忽略,因为标志没有指定如何使用它。您至少需要添加CL_MEM_COPY_HOST_PTR,才能用blockA的内容初始化缓冲区。
或者,您可以在创建缓冲区之后调用clEnqueueWriteBuffer来发送数据。

评论:谢谢!缓冲区现在的形式为 cl::Buffer deviceMemA(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, memoryA, blockA, &err)(C除外,它是只写的)。现在,我在读取缓冲区时收到一个"无效的命令队列"(invalid command queue)错误,起因似乎是内核代码的最后一行,也就是把值写入矩阵C中指定位置的那一行。

评论:读取缓冲区时应该传递以字节为单位的大小,例如memoryC,而不是size{A,B,C}这些元素个数。另外,您在代码中把缓冲区C读取了3次。

评论:嗯,我把缓冲区大小更改为sizeA、sizeB和sizeC,"读取3次"的问题我也已经发现了。现在仍然是无效的命令队列错误。我承认,我不太清楚那些NDRange是怎么回事,它们正确吗?再次感谢!

评论:全局大小应该是输出矩阵的维度大小。本地大小必须能够整除全局大小。调试时可以先把本地大小设为1,1。enqueueReadBuffer需要的是以字节为单位的大小,即memoryC,而不是以float个数为单位的大小。

评论:嗯,还是同样的无效命令队列错误……您有推荐的教程吗?很抱歉用我的3x3矩阵代码耽误您的时间。
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)