C++ OpenCL-结果与CPU版本不同_C++_Image Processing_Opencl

C++ OpenCL-结果与CPU版本不同

c++ image-processing opencl

C++ OpenCL-结果与CPU版本不同,c++,image-processing,opencl,C++,Image Processing,Opencl,我想使用openCl在两个图像之间进行像素值比较。一幅图像应通过变换矩阵进行“变换” 1）我面临一个问题，openCL版本的结果与CPU版本不一样。像素值的差异在我的示例图像（imageA：所有像素都是5，imagesB：所有像素都是6）中始终是1，因此1000*1000像素的总数应该是1000000。 CPU版本总是正确的，但openCL版本总是有点不精确，并且有时也会有所不同（例如998895或998829） 2）我遇到的另一个问题是运行时，因为将两个比较像素的差值添加到结果变量需要很长时

我想使用openCl在两个图像之间进行像素值比较。一幅图像应通过变换矩阵进行“变换”

1）我面临一个问题，openCL版本的结果与CPU版本不一样。像素值的差异在我的示例图像（imageA：所有像素都是5，imagesB：所有像素都是6）中始终是1，因此1000*1000像素的总数应该是1000000。 CPU版本总是正确的，但openCL版本总是有点不精确，并且有时也会有所不同（例如998895或998829）

2）我遇到的另一个问题是运行时，因为将两个比较像素的差值添加到结果变量需要很长时间。但我的感觉是，它可以通过另一种记忆布局来解决

对我遇到的问题有什么想法吗？也许使用二维工作集的方式也会导致错误

谢谢你和亲切的问候亨德里克

以下是内核：基本上，它得到了两幅图像和700个变换矩阵（目前所有的矩阵都表示身份）

\u内核无效合规性(
__只读图像2D\t图像A，
__只读图像2D\t图像B，
__常数浮点*矩阵，
__全局整数*结果
) 
{
对于（int i=0；i<700；i++）
{
大小\u t x=获取全局\u id（0）；
大小y=获取全局id（1）；
float t1=矩阵[0+i*6]；
浮点t2=矩阵[1+i*6]；
浮点数t3=矩阵[2+i*6]；
浮点数t4=矩阵[3+i*6]；
浮点数t5=矩阵[4+i*6]；
浮点数t6=矩阵[5+i*6]；
//计算比较像素的其他坐标
int x_new=x*t1+y*t2+1*t3；
int y_new=x*t4+y*t5+1*t6；
inta=（读取图像i（图像a，（int2）（x，y））.x）；
intb=（读取图像i（图像b，（int2）（x_新，y_新））.x）；
int diff=b-a；
//将两个比较像素中的每一个添加到结果中
结果[i]+=diff；
}
}

这是我的主机代码：

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <opencv2\core.hpp>
#include <opencv2\imgproc.hpp>
#include <opencv2\highgui.hpp>


using namespace std;




int main(int argc, char** argv) {
    //700 transformation matrices
    int numberMatrices = 700;
    bool opencl = true;
    //iamge width
    int width = 1000;
    //image height
    int height = 1000;
    //total number of pixels of one image
    int size = width*height;


    // Create two example images
    const int LIST_SIZE = size;
    int *imageA = new int[LIST_SIZE];
    int *imageB = new int[LIST_SIZE];
    for (int i = 0; i < LIST_SIZE; i++) {
        //every pixel value of imageA is 5
        imageA[i] = 5;
        //every pixel value of imageA is 6
        imageB[i] = 6;
    }

    //creation of n transformation matrices
    const int MATRIX_SIZE = 6* numberMatrices;
    float *indi = new float[MATRIX_SIZE];
    //all the matrices are the same
    for (int i = 0; i < numberMatrices; i++)
    {
        //identity matrix
        indi[0 + i * 6] = 1;
        indi[1 + i * 6] = 0;
        indi[2 + i * 6] = 0;
        indi[3 + i * 6] = 0;
        indi[4 + i * 6] = 1;
        indi[5 + i * 6] = 0;
    }

    //array to save the results of the comparison
    const int RESULT_SIZE = numberMatrices;
    int *result = new int[RESULT_SIZE];





    if (opencl)
    {
        try {
            // Get available platforms
            vector<cl::Platform> platforms;
            cl::Platform::get(&platforms);
            std::cerr << "Platform number is: " << platforms.size() << std::endl;
            std::string platformVendor;
            platforms[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
            std::cerr << "Platform is by: " << platformVendor << "\n";

            // Select the default platform and create a context using this platform and the GPU
            cl_context_properties cps[3] = {
                CL_CONTEXT_PLATFORM,
                (cl_context_properties)(platforms[0])(),
                0
            };
            cl::Context context(CL_DEVICE_TYPE_CPU, cps);

            vector<cl::ImageFormat> format;
            context.getSupportedImageFormats(CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &format);
            /*  for (int i = 0; i < format.size(); i++)
            {
            cout << "Channel Data Type: " << format.at(i).image_channel_data_type
            << "    Channel order: "  << format.at(i).image_channel_order << endl;
            }*/



            // Get a list of devices on this platform
            vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
            for (int i = 0; i < devices.size(); i++)
            {

                cout << "Device: " << devices.at(i).getInfo<CL_DEVICE_NAME>() << endl;
                cout << "DOUBLE FP: " << devices.at(i).getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>() << endl;
                cout << "Image Max Height: " << devices.at(i).getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() << endl;
                cout << "Image Support: " << devices.at(i).getInfo<CL_DEVICE_IMAGE_SUPPORT>() << endl;
                cout << "Local Memory Size: " << devices.at(i).getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << endl;
                cout << "Clock Frequency: " << devices.at(i).getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << endl;
                cout << "CUs: " << devices.at(i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
                cout << "Driver: " << devices.at(i).getInfo<CL_DRIVER_VERSION>() << endl;
                cout << "Version: " << devices.at(i).getInfo<CL_DEVICE_VERSION>() << endl;
                cout << "Work Group: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl;
                cout << "Items: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>();
                cout << endl;
            }


            //Create opencl image
            cl::Image2D clImage_A = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageA);
            cl::Image2D clImage_B = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageB);


            // Create a command queue and use the first device
            cl::CommandQueue queue = cl::CommandQueue(context, devices[0]);

            // Read kernel source file
            std::ifstream sourceFile("difference.cl");
            std::string sourceCode(
                std::istreambuf_iterator<char>(sourceFile),
                (std::istreambuf_iterator<char>()));
            cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));

            // Make program of the source code in the context
            cl::Program program = cl::Program(context, source);

            // Build program for these specific devices
            program.build(devices);

            // Make kernel
            cl::Kernel kernel(program, "compliance");


            // Create memory buffers
            cl::Buffer buffer_matrix = cl::Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(float));
            cl::Buffer buffer_result = cl::Buffer(context, CL_MEM_READ_WRITE, RESULT_SIZE * sizeof(int));

            // Copy list of results to the memory buffers
            queue.enqueueWriteBuffer(buffer_matrix, CL_TRUE, 0, MATRIX_SIZE * sizeof(float), indi);


            // Set arguments to kernel
            kernel.setArg(0, clImage_A);
            kernel.setArg(1, clImage_B);
            kernel.setArg(2, buffer_matrix);
            kernel.setArg(3, buffer_result);


            cl::Event event;

            std::cout << "Start OpenCL processing.." << endl;
            chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();

            // Run the kernel n-times on specific ND range
            for (int i = 0; i < 1; i++)
            {
                queue.enqueueNDRangeKernel(
                    kernel,
                    cl::NullRange,
                    cl::NDRange((size_t)width, (size_t)height),
                    cl::NDRange(1, 1),
                    NULL,
                    &event);
                cout << i << " ";
                event.wait();
            }

            chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now();

            auto duration_opencl = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();


            std::cout << "OpenCL processing done.." << endl;
            std::cout << "Start CPU Processing.." << endl;


            // Read buffer_result into result
            queue.enqueueReadBuffer(buffer_result, CL_TRUE, 0, RESULT_SIZE * sizeof(int), result);


            //cpu version to calculate the difference between the two arryays
            t1 = chrono::high_resolution_clock::now();
            int different = 0;
            int x_new;
            int x;
            for (int i = 0; i < numberMatrices; i++)
            {
                different = 0;
                for (int n = 0; n < LIST_SIZE; n++)
                {
                    x = imageA[n];
                    x_new = x;;
                    int a = imageA[x];
                    int b = imageB[x_new];
                    int diff = imageB[x_new] - imageA[x];

                    different += diff;
                }

            }

            t2 = chrono::high_resolution_clock::now();
            auto duration_cpu = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
            std::cout << "CPU processing done.." << endl;


            //output of the results
            std::cout << "opencl: diff " << result[0] << endl;
            std::cout << "Runtime opencl: " << duration_opencl << endl;

            std::cout << "CPU: diff " << different << endl;
            std::cout << "Runtime CPU: " << duration_cpu << endl;

            double times = (double)duration_cpu / (double)duration_opencl;
            std::cout << "OpenCL is  " << times << " times faster!!!"  << endl;

            char c;
            std::cin >> c;

        }
        catch (cl::Error error) {
            std::cout << error.what() << "(" << error.err() << ")" << std::endl;
            char c;
            std::cin >> c;
        }
    }


    return 0;
}

\define\u CL\u ENABLE\u异常
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
使用名称空间std；
int main（int argc，字符**argv）{
//700变换矩阵
整数米=700；
bool opencl=true；
//直径宽度
整数宽度=1000；
//像高
整数高度=1000；
//一幅图像的总像素数
整数大小=宽度*高度；
//创建两个示例图像
const int LIST_SIZE=大小；
int*imageA=新的int[列表大小]；
int*imageB=新int[列表大小]；
对于（int i=0；istd：：cerr你不认为在这一行结果[i]中的情况下，你有竞争条件吗+=diff；
在OpenCL代码中，您的程序在每个工作项中同时执行此操作？因此，这可能是一个问题。
浮点算法在不同的平台上有所不同。您很可能看到了由OpenCL
编译器执行的特定于硬件的优化的效果。据我所知，禁用优化使用-cl opt disable
在这种情况下没有帮助。
是的，这可能是一个问题。这也是运行速度慢的原因。您知道保存每个像素对差异总和的另一种方法吗？您可以使用我所知的OpenCL的不同方法：原子添加（结果[I]，差异）您可以防止在同一个内存位置同时写入。谢谢……虽然我还没有成功实现原子添加。这个问题被称为并行还原。我发现的一个术语是还原。这里有一个指向ppt的链接。我没有做过，但可能有人也有同样的问题：
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <opencv2\core.hpp>
#include <opencv2\imgproc.hpp>
#include <opencv2\highgui.hpp>


using namespace std;




int main(int argc, char** argv) {
    //700 transformation matrices
    int numberMatrices = 700;
    bool opencl = true;
    //iamge width
    int width = 1000;
    //image height
    int height = 1000;
    //total number of pixels of one image
    int size = width*height;


    // Create two example images
    const int LIST_SIZE = size;
    int *imageA = new int[LIST_SIZE];
    int *imageB = new int[LIST_SIZE];
    for (int i = 0; i < LIST_SIZE; i++) {
        //every pixel value of imageA is 5
        imageA[i] = 5;
        //every pixel value of imageA is 6
        imageB[i] = 6;
    }

    //creation of n transformation matrices
    const int MATRIX_SIZE = 6* numberMatrices;
    float *indi = new float[MATRIX_SIZE];
    //all the matrices are the same
    for (int i = 0; i < numberMatrices; i++)
    {
        //identity matrix
        indi[0 + i * 6] = 1;
        indi[1 + i * 6] = 0;
        indi[2 + i * 6] = 0;
        indi[3 + i * 6] = 0;
        indi[4 + i * 6] = 1;
        indi[5 + i * 6] = 0;
    }

    //array to save the results of the comparison
    const int RESULT_SIZE = numberMatrices;
    int *result = new int[RESULT_SIZE];





    if (opencl)
    {
        try {
            // Get available platforms
            vector<cl::Platform> platforms;
            cl::Platform::get(&platforms);
            std::cerr << "Platform number is: " << platforms.size() << std::endl;
            std::string platformVendor;
            platforms[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
            std::cerr << "Platform is by: " << platformVendor << "\n";

            // Select the default platform and create a context using this platform and the GPU
            cl_context_properties cps[3] = {
                CL_CONTEXT_PLATFORM,
                (cl_context_properties)(platforms[0])(),
                0
            };
            cl::Context context(CL_DEVICE_TYPE_CPU, cps);

            vector<cl::ImageFormat> format;
            context.getSupportedImageFormats(CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &format);
            /*  for (int i = 0; i < format.size(); i++)
            {
            cout << "Channel Data Type: " << format.at(i).image_channel_data_type
            << "    Channel order: "  << format.at(i).image_channel_order << endl;
            }*/



            // Get a list of devices on this platform
            vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
            for (int i = 0; i < devices.size(); i++)
            {

                cout << "Device: " << devices.at(i).getInfo<CL_DEVICE_NAME>() << endl;
                cout << "DOUBLE FP: " << devices.at(i).getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>() << endl;
                cout << "Image Max Height: " << devices.at(i).getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() << endl;
                cout << "Image Support: " << devices.at(i).getInfo<CL_DEVICE_IMAGE_SUPPORT>() << endl;
                cout << "Local Memory Size: " << devices.at(i).getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << endl;
                cout << "Clock Frequency: " << devices.at(i).getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << endl;
                cout << "CUs: " << devices.at(i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
                cout << "Driver: " << devices.at(i).getInfo<CL_DRIVER_VERSION>() << endl;
                cout << "Version: " << devices.at(i).getInfo<CL_DEVICE_VERSION>() << endl;
                cout << "Work Group: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl;
                cout << "Items: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>();
                cout << endl;
            }


            //Create opencl image
            cl::Image2D clImage_A = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageA);
            cl::Image2D clImage_B = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageB);


            // Create a command queue and use the first device
            cl::CommandQueue queue = cl::CommandQueue(context, devices[0]);

            // Read kernel source file
            std::ifstream sourceFile("difference.cl");
            std::string sourceCode(
                std::istreambuf_iterator<char>(sourceFile),
                (std::istreambuf_iterator<char>()));
            cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1));

            // Make program of the source code in the context
            cl::Program program = cl::Program(context, source);

            // Build program for these specific devices
            program.build(devices);

            // Make kernel
            cl::Kernel kernel(program, "compliance");


            // Create memory buffers
            cl::Buffer buffer_matrix = cl::Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(float));
            cl::Buffer buffer_result = cl::Buffer(context, CL_MEM_READ_WRITE, RESULT_SIZE * sizeof(int));

            // Copy list of results to the memory buffers
            queue.enqueueWriteBuffer(buffer_matrix, CL_TRUE, 0, MATRIX_SIZE * sizeof(float), indi);


            // Set arguments to kernel
            kernel.setArg(0, clImage_A);
            kernel.setArg(1, clImage_B);
            kernel.setArg(2, buffer_matrix);
            kernel.setArg(3, buffer_result);


            cl::Event event;

            std::cout << "Start OpenCL processing.." << endl;
            chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();

            // Run the kernel n-times on specific ND range
            for (int i = 0; i < 1; i++)
            {
                queue.enqueueNDRangeKernel(
                    kernel,
                    cl::NullRange,
                    cl::NDRange((size_t)width, (size_t)height),
                    cl::NDRange(1, 1),
                    NULL,
                    &event);
                cout << i << " ";
                event.wait();
            }

            chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now();

            auto duration_opencl = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();


            std::cout << "OpenCL processing done.." << endl;
            std::cout << "Start CPU Processing.." << endl;


            // Read buffer_result into result
            queue.enqueueReadBuffer(buffer_result, CL_TRUE, 0, RESULT_SIZE * sizeof(int), result);


            //cpu version to calculate the difference between the two arryays
            t1 = chrono::high_resolution_clock::now();
            int different = 0;
            int x_new;
            int x;
            for (int i = 0; i < numberMatrices; i++)
            {
                different = 0;
                for (int n = 0; n < LIST_SIZE; n++)
                {
                    x = imageA[n];
                    x_new = x;;
                    int a = imageA[x];
                    int b = imageB[x_new];
                    int diff = imageB[x_new] - imageA[x];

                    different += diff;
                }

            }

            t2 = chrono::high_resolution_clock::now();
            auto duration_cpu = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
            std::cout << "CPU processing done.." << endl;


            //output of the results
            std::cout << "opencl: diff " << result[0] << endl;
            std::cout << "Runtime opencl: " << duration_opencl << endl;

            std::cout << "CPU: diff " << different << endl;
            std::cout << "Runtime CPU: " << duration_cpu << endl;

            double times = (double)duration_cpu / (double)duration_opencl;
            std::cout << "OpenCL is  " << times << " times faster!!!"  << endl;

            char c;
            std::cin >> c;

        }
        catch (cl::Error error) {
            std::cout << error.what() << "(" << error.err() << ")" << std::endl;
            char c;
            std::cin >> c;
        }
    }


    return 0;
}