Parallel processing OpenCL矩阵乘法排队/缓冲区读取

Parallel processing OpenCL矩阵乘法排队/缓冲区读取,parallel-processing,opencl,matrix-multiplication,Parallel Processing,Opencl,Matrix Multiplication,我正在用OpenCL编写一个基本的矩阵乘法程序。我相信我的问题在于我的排队和/或缓冲区读取,因为我得到的结果矩阵的输出完全不正确,矩阵A和B的第一行也不正确。我是OpenCL的新手,我一直在为此绞尽脑汁很长一段时间,也许这里有人能给我一个关于我哪里出错的提示 主机代码: #define __NO_STD_VECTOR // Uses cl::vector instead of standard version #include <CL/cl.hpp> #include <std

我正在用OpenCL编写一个基本的矩阵乘法程序。我相信我的问题在于我的排队和/或缓冲区读取,因为我得到的结果矩阵的输出完全不正确,矩阵A和B的第一行也不正确。我是OpenCL的新手,我一直在为此绞尽脑汁很长一段时间,也许这里有人能给我一个关于我哪里出错的提示

主机代码:

#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>

/* Defined matrix width/height constants */
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
#define numRowsC numRowsA
#define numColsC numColsB

using namespace std;

/* Function declarations */
inline void checkErr(cl_int err, string name);
void initMatrix (float* matrix, int numIndices);
void printMatrix (string displayName, float* matrix, int numIndices,  
      int rowSize);

//*************
// Main Program
//*************
int main(int argc, char* argv[]) {

    /* Check for valid matrix sizes */
    if (numColsA != numRowsB) {
        cout << "ERROR: Invalid matrix dimensions." << endl;
    } else {

    srand(2013); // Set random seed

    /* Allocate memory for matrices A, B, and C */
    unsigned int sizeA = numRowsA * numColsA;
    unsigned int sizeB = numRowsB * numColsB;
    unsigned int sizeC = numRowsC * numColsC;
    unsigned int memoryA = sizeof(float) * sizeA;
    unsigned int memoryB = sizeof(float) * sizeB;
    unsigned int memoryC = sizeof(float) * sizeC;

    /*
        Allocate memoryA/memoryB/memoryC size blocks of bytes
        (cast from void*)
    */
    float* blockA = (float*) malloc(memoryA);
    float* blockB = (float*) malloc(memoryB);
    float* blockC = (float*) malloc(memoryC);

    /* Initialize matrices A and B */
    initMatrix(blockA, sizeA);
    initMatrix(blockB, sizeB);

    /* Display matrices A and B */
    printMatrix("Matrix A", blockA, sizeA, numColsA);
    printMatrix("Matrix B", blockB, sizeB, numColsB);

    cl_int err;            // Error code
    string platformVendor; // Platform vendor

    /* Create list of platforms */
    cl::vector < cl::Platform > platformList;
    cl::Platform::get(&platformList);

    /*
        Display potential Platform list generation error. If the
        platform list size does not equal 0, CL_SUCCESS (0) is
        sent to the function. If the platform list size does
        equal 0, -1 is sent to the function.
    */
    checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
            "Platform");

    /*
        Replace empty value of platformVendor with device vendor
        name
    */
    platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
        &platformVendor);

    /* Properties for Context constructor (Use unknown) */
    cl_context_properties cprops[3] =
        {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) (platformList[0]) (),
        0
        };

    /* Create context */
    cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
        &err);

    /* Display potential Context constructor error */
    checkErr(err, "Context");

    /* Create buffer for matrix A */
    cl::Buffer deviceMemA(context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);

    /* Create buffer for matrix B */
    cl::Buffer deviceMemB(context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);

    /* Create buffer for matrix C */
    cl::Buffer deviceMemC(context,
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);

    /* Create buffer for row (A) and col (C) */
    cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
            (void *) numRowsA, &err);
    cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
            (void *) numColsC, &err);

    /* Display potential Buffer constructor error */
    checkErr(err, "Buffers");

    /* Get list of devices */
    cl::vector<cl::Device> devices =
        context.getInfo<CL_CONTEXT_DEVICES>();

    /* Check for at least one device, if not throw error */
    checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");

    /* Read input from .cl file */
    ifstream file("matrixMult1_kernels.cl");

    /* Check for potential problem opening .cl input file */
    checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");

    /* Store file contents in a string */
    string prog(istreambuf_iterator<char>(file),
            (istreambuf_iterator<char>()));

    /* Create source object */
    cl::Program::Sources source(1, make_pair(prog.c_str(),
        prog.length()+1));

    /* Create program for given context and source */
    cl::Program program(context, source);

    err = program.build(devices, ""); // Check for build error

    /* Display potential program build error */
    checkErr(err, "Program Build");

    /* Create kernel */
    cl::Kernel kernel(program, "matrixMul", &err);

    /* Display potential Kernel constructor error */
    checkErr(err, "Kernel");

    /*
        Set matrixMul arguments, error checking after each
        argument
    */
    err = kernel.setArg(0, deviceMemA);
    checkErr(err, "Arg0");
    err = kernel.setArg(1, deviceMemB);
    checkErr(err, "Arg1");
    err = kernel.setArg(2, deviceMemC);
    checkErr(err, "Arg2");
    err = kernel.setArg(3, rowA);
    checkErr(err, "Arg3");
    err = kernel.setArg(4, colC);
    checkErr(err, "Arg4");

    /* Create command queue */
    cl::CommandQueue queue(context, devices[0], 0, &err);

    /* Display potential CommandQueue constructor error */
    checkErr(err, "Command Queue");

    /* Create event object */
    cl::Event event;

    cl::NDRange global(3, 3);
    cl::NDRange local(1, 1);

    /* Enqueue the kernel */
    err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
        NULL, &event);

    /* Display potential enqueueing error */
    checkErr(err, "Enqueue");

    /* Wait until kernel has completed execution before continuing */
    event.wait();

    /* Read kernel result back into host memory */
    err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
        blockC, NULL, &event);

        checkErr(err, "C");

    err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
        blockA, NULL, &event);
    err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
        blockB, NULL, &event);

    /* Display potential kernel read error */
    checkErr(err, "Read Buffer");

    /* Display matrices */
        cout << endl;
        cout << "After:" << endl;
    printMatrix("Matrix A", blockA, sizeA, numColsA);
    printMatrix("Matrix B", blockB, sizeB, numColsB);
    printMatrix("Matrix C", blockC, sizeC, numColsC);

    /* Free up memory */
    free(blockA);
    free(blockB);
    free(blockC);
    }
}

//--------------------------------------------------------------------
// checkErr - Aborts the host program when an OpenCL call reports a
//            failure.
//
// PRE:  err is the status code produced by an OpenCL call; name is a
//       label identifying the call that produced it.
// POST: If err equals CL_SUCCESS nothing happens. Otherwise the line
//       "ERROR: <name> (<err>)" is written to standard error and the
//       process exits with status EXIT_FAILURE.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {

    /* Nothing to report when the call succeeded */
    if (err == CL_SUCCESS) {
        return;
    }

    /* Report where the failure happened together with the raw code */
    std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;

    /* Terminate with a failure status */
    exit(EXIT_FAILURE);
}

//--------------------------------------------------------------------
// initMatrix - Fills a matrix with pseudo-random floats.
//
// PRE:  matrix points to storage for at least numIndices floats;
//       numIndices is the number of elements to fill.
// POST: The first numIndices elements of matrix each hold
//       rand() / RAND_MAX, i.e. a value in the range [0, 1]. Nothing
//       is written when numIndices is zero or negative.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {

    /* Largest value rand() can return, as a float divisor */
    const float scale = (float) RAND_MAX;

    /* Walk the storage element by element, counting down */
    float* cursor = matrix;
    for (int remaining = numIndices; remaining > 0; --remaining) {
        *cursor = rand() / scale;
        ++cursor;
    }
}

//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE:  displayName is the heading to print; matrix points to at
//       least numIndices floats stored left-to-right, top-to-bottom;
//       rowSize is the number of elements in one row.
// POST: A blank line, "<displayName>:" and then the elements are
//       written to standard output. Elements within a row are
//       separated by "  |  " and each completed row ends with a line
//       break. A rowSize of 0 prints all elements as a single row
//       instead of dividing by zero. Nothing but the heading is
//       printed when numIndices is zero or negative.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
          int rowSize) {

    /* Output display name of matrix */
    cout << "\n" << displayName << ":" << endl;

    /* Loop through each element of the matrix */
    for (int i = 0; i < numIndices; i++) {
        cout << matrix[i]; // Display value at this index

        /*
            Decide whether this element closes a row. The modulo is
            only evaluated for a non-zero rowSize; with rowSize == 0
            the last element closes the one and only row.
        */
        bool endOfRow;
        if (rowSize != 0) {
            endOfRow = (((i + 1) % rowSize) == 0);
        } else {
            endOfRow = ((i + 1) == numIndices);
        }

        if (endOfRow) {
            cout << endl; // Line break
        } else {
            cout << "  |  "; // Element separator
        }
    }
}
\define\u NO\u STD\u VECTOR//使用cl::VECTOR而不是标准版本
#包括
#包括
#包括
#包括
#包括
#包括
#包括
/*定义的矩阵宽度/高度常数*/
#定义numRowsA 3
#定义numColsA 3
#定义numrowsb3
#定义numColsB 3
#定义numRowsC numRowsA
#定义numColsC numColsB
使用名称空间std;
/*函数声明*/
内联无效校验错误(cl_int err,字符串名称);
void initMatrix(浮点*矩阵,整数numIndices);
void printMatrix(字符串显示名、浮点*矩阵、整数numIndices、,
整数行大小);
//*************
//主程序
//*************
int main(int argc,char*argv[]){
/*检查有效的矩阵大小*/
if(numColsA!=numRowsB){
是否可以0?CL_成功:-1,“无设备”);
/*从.cl文件读取输入*/
ifstream文件(“matrixMult1_kernels.cl”);
/*检查打开.cl输入文件是否存在潜在问题*/
checkErr(file.is_open()?CL_SUCCESS:-1,“文件未打开”);
/*以字符串形式存储文件内容*/
字符串程序(istreambuf_迭代器(文件),
(istreambuf_迭代器());
/*创建源对象*/
cl::Program::Sources source(1,make_pair(prog.c_str()),
程序长度()+1));
/*为给定的上下文和源创建程序*/
cl::程序(上下文、源);
err=program.build(devices,“”;//检查生成错误
/*显示潜在的程序生成错误*/
checkErr(err,“程序构建”);
/*创建内核*/
cl::内核(程序,“matrixMul”&err);
/*显示潜在的内核构造函数错误*/
checkErr(err,内核);
/*
设置matrixMul参数,每次之后进行错误检查
论点
*/
err=kernel.setArg(0,设备内存);
checkErr(err,“Arg0”);
err=kernel.setArg(1,设备emb);
checkErr(err,“Arg1”);
err=kernel.setArg(2,设备emc);
checkErr(err,“Arg2”);
err=kernel.setArg(3,rowA);
checkErr(err,“Arg3”);
err=kernel.setArg(4,colC);
checkErr(err,“Arg4”);
/*创建命令队列*/
cl::CommandQueue队列(上下文、设备[0]、0和错误);
/*显示潜在的CommandQueue构造函数错误*/
checkErr(err,“命令队列”);
/*创建事件对象*/
cl:事件;
cl::NDRange global(3,3);
cl::NDRange local(1,1);
/*使内核排队*/
err=queue.enqueueNDRangeKernel(内核,2,全局,局部,
空值(&事件);
/*显示潜在排队错误*/
checker(err,“排队”);
/*等待内核完成执行后再继续*/
event.wait();
/*将内核结果读回主机内存*/
err=queue.enqueueReadBuffer(设备EMC,CL_TRUE,0,memoryC,
blockC、NULL和事件);
checker(err,“C”);
err=queue.enqueueReadBuffer(deviceMemA,CL_TRUE,0,sizeA,
blockA、NULL和event);
err=queue.enqueueReadBuffer(deviceMemB,CL_TRUE,0,sizeB,
blockB、NULL和事件);
/*显示潜在的内核读取错误*/
checkErr(err,“读取缓冲区”);
/*显示矩阵*/

输入矩阵A和B的数据不会传递到设备。创建缓冲区时:

cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)
blockA参数被忽略,因为标志没有指定如何使用它。您至少需要添加CL_MEM_COPY_HOST_PTR,才能用blockA的内容初始化缓冲区。


或者,您可以在创建缓冲区后调用clEnqueueWriteBuffer发送数据。

谢谢!缓冲区现在的格式为 cl::Buffer deviceMemA(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, memoryA, blockA, &err)(C除外,它是只写的)。现在,我在读取缓冲区时收到一个“无效的命令队列”错误;从内核代码的最后一行来看,该行把值写入矩阵C中的指定位置。缓冲区读取应该传入缓冲区的字节大小(如memoryC),而不是sizeA、sizeB、sizeC这类元素个数。另外,您在代码中读取了3次缓冲区C。嗯,我将缓冲区大小更改为sizeA、sizeB和sizeC,并且已经发现了读取缓冲区3次的那个问题。现在仍然是无效的命令队列错误。我承认,我不知道那些NDRange是怎么回事,它们正确吗?再次感谢!全局大小应该是输出矩阵的维度大小。本地大小应能整除全局大小。调试时可以先把本地大小设为1,1。enqueueReadBuffer需要的是以字节为单位的大小(memoryC),而不是以浮点数个数为单位的大小。嗯,还是同样的无效命令队列错误……您有推荐的教程吗?很抱歉让您在我的3x3矩阵代码上花了这么多时间。
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)