[Input] OpenCL:简单的向量加法,但在输入规模较大时输出结果错误

Input OpenCL,简单的矢量加法,但对于大输入输出错误,input,vector,opencl,Input,Vector,Opencl,所以,在花了几个小时阅读和理解之后,我终于完成了我的第一个OpenCL程序,它实际上做了一些事情,它向一个文件添加了两个向量和输出 #include <iostream> #include <vector> #include <cstdlib> #include <string> #include <fstream> #define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp>

在花了几个小时阅读和理解之后,我终于完成了我的第一个真正能做点事情的 OpenCL 程序:它把两个向量相加,并将结果输出到一个文件中。

#include <iostream>
#include <vector>
#include <cstdlib>
#include <string>
#include <fstream>

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>

int main(int argc, char *argv[])
{
    try
    {
// get platforms, devices and display their info.
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);

        std::vector<cl::Platform>::iterator i=platforms.begin();

        std::cout<<"OpenCL \tPlatform : "<<i->getInfo<CL_PLATFORM_NAME>()<<std::endl;
        std::cout<<"\tVendor: "<<i->getInfo<CL_PLATFORM_VENDOR>()<<std::endl;
        std::cout<<"\tVersion : "<<i->getInfo<CL_PLATFORM_VERSION>()<<std::endl;
        std::cout<<"\tExtensions : "<<i->getInfo<CL_PLATFORM_EXTENSIONS>()<<std::endl;

// get devices
        std::vector<cl::Device> devices;
        i->getDevices(CL_DEVICE_TYPE_ALL,&devices);
        int o=99;
        std::cout<<"\n\n";
// iterate over available devices
        for(std::vector<cl::Device>::iterator j=devices.begin(); j!=devices.end(); j++)
        {
            std::cout<<"\tOpenCL\tDevice : " << j->getInfo<CL_DEVICE_NAME>()<<std::endl;
            std::cout<<"\t\t Type : " << j->getInfo<CL_DEVICE_TYPE>()<<std::endl;
            std::cout<<"\t\t Vendor : " << j->getInfo<CL_DEVICE_VENDOR>()<<std::endl;
            std::cout<<"\t\t Driver : " << j->getInfo<CL_DRIVER_VERSION>()<<std::endl;
            std::cout<<"\t\t Global Mem : " << j->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/(1024*1024)<<" MBytes"<<std::endl;
            std::cout<<"\t\t Local Mem : " << j->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/1024<<" KBbytes"<<std::endl;
            std::cout<<"\t\t Compute Unit : " << j->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl;
            std::cout<<"\t\t Clock Rate : " << j->getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()<<" MHz"<<std::endl;
        }
        std::cout<<"\n\n\n";





 //MAIN CODE BEGINS HERE


 //get Kernel
        std::ifstream ifs("vector_add_kernel.cl");
        std::string kernelSource((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
        std::cout<<kernelSource;

   //Create context, select device and command queue.
        cl::Context context(devices);
        cl::Device &device=devices.front();
        cl::CommandQueue cmdqueue(context,device);

  // Generate Source vector and push the kernel source in it.
        cl::Program::Sources sourceCode;
        sourceCode.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size()));

  //Generate program using sourceCode
        cl::Program program=cl::Program(context, sourceCode);

  //Build program..
        try
        {
            program.build(devices);
        }
        catch(cl::Error &err)
        {
            std::cerr<<"Building failed, "<<err.what()<<"("<<err.err()<<")"
                     <<"\nRetrieving build log"
                     <<"\n Build Log Follows \n"
                     <<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices.front());
        }


  //Declare and initialize vectors
        std::vector<cl_float>B(993448,1.3);
        std::vector<cl_float>C(993448,1.3);
        std::vector<cl_float>A(993448,1.3);
        cl_int N=A.size();

  //Declare and intialize proper work group size and global size. Global size raised to the nearest multiple of workGroupSize.
        int workGroupSize=128;
        int GlobalSize;
        if(N%workGroupSize) GlobalSize=N - N%workGroupSize + workGroupSize;
        else GlobalSize=N;
  //Declare buffers.
        cl::Buffer vecA(context, CL_MEM_READ_WRITE, sizeof(cl_float)*N);
        cl::Buffer vecB(context, CL_MEM_READ_ONLY , (B.size())*sizeof(cl_float));
        cl::Buffer vecC(context, CL_MEM_READ_ONLY , (C.size())*sizeof(cl_float));
  //Write vectors into buffers
        cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (B.size())*sizeof(cl_float), &B[0] );
        cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (C.size())*sizeof(cl_float), &C[0] );

 //Executing kernel
        cl::Kernel kernel(program, "vector_add");
        cl::KernelFunctor kernel_func=kernel.bind(cmdqueue, cl::NDRange(GlobalSize), cl::NDRange(workGroupSize));

        kernel_func(vecA, vecB, vecC, N);

 //Reading back values into vector A
        cmdqueue.enqueueReadBuffer(vecA,true,0,N*sizeof(cl_float), &A[0]);
        cmdqueue.finish();


  //Saving into file.
        std::ofstream output("vectorAdd.txt");
        for(int i=0;i<N;i++) output<<A[i]<<"\n";

    }
    catch(cl::Error& err)
    {
        std::cerr << "OpenCL error: " << err.what() << "(" << err.err() <<
                  ")" << std::endl;

        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
问题是:对于较小的 N 值,我得到了正确的结果 2.6;但对于较大的值(例如上面代码中的 993448),我得到的却是介于 1 到 2.4 之间的垃圾输出。

以下是内核代码:

// Element-wise vector addition: A[k] = C[k] + B[k] for every k < N.
// Each work-item handles the single element whose index equals its global id
// in dimension 0.
__kernel void vector_add(__global float *A, __global float *B, __global float *C, int N) {
    // Index of the element this work-item is responsible for.
    int gid = get_global_id(0);

    // Work-items whose id is N or greater have no element to process.
    if (gid >= N) {
        return;
    }

    A[gid] = C[gid] + B[gid];
}
更新:好的,代码现在似乎可以正常运行了。我已经修复了上面代码中的一些小错误:(1)修复了初始化 GlobalSize 的部分;(2)修复了 enqueueWriteBuffer 调用中传入的错误参数。
现在对于较大的 N 值,它也能输出正确的结果。

尝试将数据类型从float更改为double等