Parallel processing OpenCL矩阵乘法排队/缓冲区读取_Parallel Processing_Opencl_Matrix Multiplication

Parallel processing OpenCL矩阵乘法排队/缓冲区读取

parallel-processing opencl

Parallel processing OpenCL矩阵乘法排队/缓冲区读取,parallel-processing,opencl,matrix-multiplication,Parallel Processing,Opencl,Matrix Multiplication,我正在用OpenCL编写一个基本的矩阵乘法程序。我相信我的问题在于我的排队和/或缓冲区读取，因为我得到的结果矩阵的输出完全不正确，矩阵A和B的第一行也不正确。我是OpenCL的新手，我一直在为此绞尽脑汁很长一段时间，也许这里有人能给我一个关于我哪里出错的提示主机代码： #define __NO_STD_VECTOR // Uses cl::vector instead of standard version #include <CL/cl.hpp> #include <std

我正在用OpenCL编写一个基本的矩阵乘法程序。我相信我的问题在于我的排队和/或缓冲区读取，因为我得到的结果矩阵的输出完全不正确，矩阵A和B的第一行也不正确。我是OpenCL的新手，我一直在为此绞尽脑汁很长一段时间，也许这里有人能给我一个关于我哪里出错的提示

主机代码：

#define __NO_STD_VECTOR // Uses cl::vector instead of standard version
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>

/* Defined matrix width/height constants */
#define numRowsA 3
#define numColsA 3
#define numRowsB 3
#define numColsB 3
#define numRowsC numRowsA
#define numColsC numColsB

using namespace std;

/* Function declarations */
inline void checkErr(cl_int err, string name);
void initMatrix (float* matrix, int numIndices);
void printMatrix (string displayName, float* matrix, int numIndices,  
      int rowSize);

//*************
// Main Program
//*************
int main(int argc, char* argv[]) {

    /* Check for valid matrix sizes */
    if (numColsA != numRowsB) {
        cout << "ERROR: Invalid matrix dimensions." << endl;
    } else {

    srand(2013); // Set random seed

    /* Allocate memory for matrices A, B, and C */
    unsigned int sizeA = numRowsA * numColsA;
    unsigned int sizeB = numRowsB * numColsB;
    unsigned int sizeC = numRowsC * numColsC;
    unsigned int memoryA = sizeof(float) * sizeA;
    unsigned int memoryB = sizeof(float) * sizeB;
    unsigned int memoryC = sizeof(float) * sizeC;

    /*
        Allocate memoryA/memoryB/memoryC size blocks of bytes
        (cast from void*)
    */
    float* blockA = (float*) malloc(memoryA);
    float* blockB = (float*) malloc(memoryB);
    float* blockC = (float*) malloc(memoryC);

    /* Initialize matrices A and B */
    initMatrix(blockA, sizeA);
    initMatrix(blockB, sizeB);

    /* Display matrices A and B */
    printMatrix("Matrix A", blockA, sizeA, numColsA);
    printMatrix("Matrix B", blockB, sizeB, numColsB);

    cl_int err;            // Error code
    string platformVendor; // Platform vendor

    /* Create list of platforms */
    cl::vector < cl::Platform > platformList;
    cl::Platform::get(&platformList);

    /*
        Display potential Platform list generation error. If the
        platform list size does not equal 0, CL_SUCCESS (0) is
        sent to the function. If the platform list size does
        equal 0, -1 is sent to the function.
    */
    checkErr(platformList.size()!=0 ? CL_SUCCESS : -1,
            "Platform");

    /*
        Replace empty value of platformVendor with device vendor
        name
    */
    platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR,
        &platformVendor);

    /* Properties for Context constructor (Use unknown) */
    cl_context_properties cprops[3] =
        {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties) (platformList[0]) (),
        0
        };

    /* Create context */
    cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL,
        &err);

    /* Display potential Context constructor error */
    checkErr(err, "Context");

    /* Create buffer for matrix A */
    cl::Buffer deviceMemA(context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err);

    /* Create buffer for matrix B */
    cl::Buffer deviceMemB(context,
            CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err);

    /* Create buffer for matrix C */
    cl::Buffer deviceMemC(context,
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err);

    /* Create buffer for row (A) and col (C) */
    cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int),
            (void *) numRowsA, &err);
    cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int),
            (void *) numColsC, &err);

    /* Display potential Buffer constructor error */
    checkErr(err, "Buffers");

    /* Get list of devices */
    cl::vector<cl::Device> devices =
        context.getInfo<CL_CONTEXT_DEVICES>();

    /* Check for at least one device, if not throw error */
    checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices");

    /* Read input from .cl file */
    ifstream file("matrixMult1_kernels.cl");

    /* Check for potential problem opening .cl input file */
    checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open");

    /* Store file contents in a string */
    string prog(istreambuf_iterator<char>(file),
            (istreambuf_iterator<char>()));

    /* Create source object */
    cl::Program::Sources source(1, make_pair(prog.c_str(),
        prog.length()+1));

    /* Create program for given context and source */
    cl::Program program(context, source);

    err = program.build(devices, ""); // Check for build error

    /* Display potential program build error */
    checkErr(err, "Program Build");

    /* Create kernel */
    cl::Kernel kernel(program, "matrixMul", &err);

    /* Display potential Kernel constructor error */
    checkErr(err, "Kernel");

    /*
        Set matrixMul arguments, error checking after each
        argument
    */
    err = kernel.setArg(0, deviceMemA);
    checkErr(err, "Arg0");
    err = kernel.setArg(1, deviceMemB);
    checkErr(err, "Arg1");
    err = kernel.setArg(2, deviceMemC);
    checkErr(err, "Arg2");
    err = kernel.setArg(3, rowA);
    checkErr(err, "Arg3");
    err = kernel.setArg(4, colC);
    checkErr(err, "Arg4");

    /* Create command queue */
    cl::CommandQueue queue(context, devices[0], 0, &err);

    /* Display potential CommandQueue constructor error */
    checkErr(err, "Command Queue");

    /* Create event object */
    cl::Event event;

    cl::NDRange global(3, 3);
    cl::NDRange local(1, 1);

    /* Enqueue the kernel */
    err = queue.enqueueNDRangeKernel(kernel, 2, global, local,
        NULL, &event);

    /* Display potential enqueueing error */
    checkErr(err, "Enqueue");

    /* Wait until kernel has completed execution before continuing */
    event.wait();

    /* Read kernel result back into host memory */
    err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC,
        blockC, NULL, &event);

        checkErr(err, "C");

    err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA,
        blockA, NULL, &event);
    err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB,
        blockB, NULL, &event);

    /* Display potential kernel read error */
    checkErr(err, "Read Buffer");

    /* Display matrices */
        cout << endl;
        cout << "After:" << endl;
    printMatrix("Matrix A", blockA, sizeA, numColsA);
    printMatrix("Matrix B", blockB, sizeB, numColsB);
    printMatrix("Matrix C", blockC, sizeC, numColsC);

    /* Free up memory */
    free(blockA);
    free(blockB);
    free(blockC);
    }
}

//--------------------------------------------------------------------
// checkErr - Reports a failed OpenCL call and stops the host program.
//
// PRE:  err is the status code returned by an OpenCL call; name is a
//       label identifying the call that produced it.
// POST: If err equals CL_SUCCESS nothing happens. Otherwise
//       "ERROR: <name> (<err>)" is written to standard error and the
//       process exits with EXIT_FAILURE.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {

    /* A successful call needs no report */
    if (err == CL_SUCCESS) {
        return;
    }

    /* State where the failure came from, then the raw error number */
    std::cerr << "ERROR: " << name;
    std::cerr << " (" << err << ")" << std::endl;

    exit(EXIT_FAILURE); // Ends the process with a failure status
}

//--------------------------------------------------------------------
// initMatrix - Fills a matrix with pseudo-random float values.
//
// PRE:  matrix points to storage for at least numIndices floats;
//       numIndices is the number of elements to fill.
// POST: The first numIndices elements each hold rand() divided by
//       RAND_MAX, i.e. a value in the range [0, 1]. Nothing is
//       written when numIndices is zero or negative.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {

    float* cell = matrix; // Next element to be written

    /* Count down the remaining elements, one rand() call per element */
    for (int remaining = numIndices; remaining > 0; --remaining) {
        *cell = rand() / (float) RAND_MAX;
        ++cell;
    }
}

//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE:  displayName is the heading to print; matrix points to at least
//       numIndices floats stored row by row (left-to-right,
//       top-to-bottom); rowSize is the number of elements per row.
// POST: A blank line and "<displayName>:" are written to standard
//       output, followed by the elements. Elements within a row are
//       separated by "  |  " and every row ends with a line break,
//       including a final row that is shorter than rowSize. A rowSize
//       of zero or less prints all elements on a single row instead
//       of dividing by zero.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
          int rowSize) {

    /* Output display name of matrix */
    cout << "\n" << displayName << ":" << endl;

    /* Fall back to one long row when no usable row length is given */
    const int elementsPerRow = (rowSize > 0) ? rowSize : numIndices;

    /* Loop through each element of the matrix */
    for (int i = 0; i < numIndices; i++) {
        cout << matrix[i]; // Display value at this index

        const bool rowComplete = ((i + 1) % elementsPerRow) == 0;
        const bool lastElement = (i + 1) == numIndices;

        /* Close the row at its end, or when the data runs out early */
        if (rowComplete || lastElement) {
            cout << endl; // Line break
        } else {
            cout << "  |  "; // Element separator
        }
    }
}

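#define __NO_STD_VECTOR // 使用 cl::vector 而不是标准版本
#include <CL/cl.hpp>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <math.h>
#include <string>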
/*定义的矩阵宽度/高度常数*/
#定义numRowsA 3
#定义numColsA 3
#定义numrowsb3
#定义numColsB 3
#定义numRowsC numRowsA
#定义numColsC numColsB
使用名称空间std；
/*函数声明*/
内联无效校验错误（cl_int err，字符串名称）；
void initMatrix（浮点*矩阵，整数numIndices）；
void printMatrix（字符串显示名、浮点*矩阵、整数numIndices、，
整数行大小）；
//*************
//主程序
//*************
int main（int argc，char*argv[]）{
/*检查有效的矩阵大小*/
if（numColsA！=numRowsB）{
是否可以0？CL_成功：-1，“无设备”）；
/*从.cl文件读取输入*/
ifstream文件（“matrixMult1_kernels.cl”）；
/*检查打开.cl输入文件是否存在潜在问题*/
checkErr（file.is_open（）？CL_SUCCESS:-1，“文件未打开”）；
/*以字符串形式存储文件内容*/
字符串程序（istreambuf_迭代器（文件），
（istreambuf_迭代器（））；
/*创建源对象*/
cl:：Program:：Sources source（1，make_pair（prog.c_str（）），
程序长度（）+1））；
/*为给定的上下文和源创建程序*/
cl:：程序（上下文、源）；
err=program.build（devices，“”；//检查生成错误
/*显示潜在的程序生成错误*/
checkErr（err，“程序构建”）；
/*创建内核*/
cl：：内核（程序，“matrixMul”&err）；
/*显示潜在的内核构造函数错误*/
checkErr（err，内核）；
/*
设置matrixMul参数，每次之后进行错误检查
论点
*/
err=kernel.setArg（0，设备内存）；
checkErr（err，“Arg0”）；
err=kernel.setArg（1，设备emb）；
checkErr（err，“Arg1”）；
err=kernel.setArg（2，设备emc）；
checkErr（err，“Arg2”）；
err=kernel.setArg（3，rowA）；
checkErr（err，“Arg3”）；
err=kernel.setArg（4，colC）；
checkErr（err，“Arg4”）；
/*创建命令队列*/
cl:：CommandQueue队列（上下文、设备[0]、0和错误）；
/*显示潜在的CommandQueue构造函数错误*/
checkErr（err，“命令队列”）；
/*创建事件对象*/
cl：事件；
cl：：NDRange global（3,3）；
cl：：NDRange local（1,1）；
/*使内核排队*/
err=queue.enqueueNDRangeKernel（内核，2，全局，局部，
空值（&事件）；
/*显示潜在排队错误*/
checker（err，“排队”）；
/*等待内核完成执行后再继续*/
event.wait（）；
/*将内核结果读回主机内存*/
err=queue.enqueueReadBuffer（设备EMC，CL_TRUE，0，memoryC，
blockC、NULL和事件）；
checker（err，“C”）；
err=queue.enqueueReadBuffer（deviceMemA，CL_TRUE，0，sizeA，
blockA、NULL和event）；
err=queue.enqueueReadBuffer（deviceMemB，CL_TRUE，0，sizeB，
blockB、NULL和事件）；
/*显示潜在的内核读取错误*/
checkErr（err，“读取缓冲区”）；
/*显示矩阵*/
cout

输入矩阵A和B的数据没有被传递到设备。创建缓冲区时如果这样写：
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)

blockA参数会被忽略，因为标志没有指定如何使用它。您至少需要添加CL_MEM_COPY_HOST_PTR，才能用blockA的内容初始化缓冲区。
或者，您可以在创建缓冲区后调用clEnqueueWriteBuffer发送数据。谢谢！缓冲区现在的写法是 cl::Buffer deviceMemA(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, memoryA, blockA, &err)（C除外，它是只写的）。现在，我在读取缓冲区时收到一个“无效的命令队列”错误，问题似乎出在内核代码的最后一行，也就是把值写入矩阵C中指定位置的那一行。读取缓冲区时应该传入以字节为单位的缓冲区大小，例如memoryC，而不是size{A,B,C}这些值。另外，您在代码中把缓冲区C读取了三次。嗯，我把缓冲区大小更改为sizeA、sizeB和sizeC，读取三次缓冲区的问题我也已经处理了。现在仍然是“无效的命令队列”错误。我承认，我不清楚那几个NDRange是怎么回事，它们正确吗？再次感谢！全局大小应该是输出矩阵的维度。本地大小应能整除全局大小。调试时可以先把本地大小设为1,1。enqueueReadBuffer需要的是以字节为单位的大小，即memoryC，而不是以float元素个数表示的大小。嗯，还是同样的“无效的命令队列”错误……您有推荐的教程吗？很抱歉让您在我这段3x3矩阵代码上花了这么多时间。
cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)