在OpenCL中进行reduce的最佳实践是什么？_Opencl_Reduce

在OpenCL中进行reduce的最佳实践是什么？

opencl

在OpenCL中进行reduce的最佳实践是什么？,opencl,reduce,Opencl,Reduce,设想一个满足结合律的二元运算（记作 +）。这样就可以并行地计算a1+a2+a3+a4+…：首先计算 b1 = a1 + a2 b2 = a3 + a4 然后对上一步的结果执行相同的操作，依此类推，直到只剩下一个元素。我正在学习OpenCL，并尝试用这种方法对数组中的所有元素求和。我对这项技术完全是新手，所以这个程序可能看起来有些奇怪。这是内核： __kernel void reduce (__global float *input, __global float *outp

设想一个满足结合律的二元运算（记作 +）。这样就可以并行地计算 a1+a2+a3+a4+…：首先计算

b1 = a1 + a2
b2 = a3 + a4

然后对上一步的结果执行相同的操作，依此类推，直到只剩下一个元素。

我正在学习OpenCL，并尝试用这种方法对数组中的所有元素求和。我对这项技术完全是新手，所以这个程序可能看起来有些奇怪。

这是内核：

/*
 * One reduction pass: every participating work-item adds up a run of
 * s = get_local_size(0) consecutive floats and stores that partial sum.
 *
 * input  - values to reduce; work-item g reads input[s*g] .. input[s*g + s-1].
 * output - receives the partial sum of work-item g at output[g].
 *
 * Assumes the launch uses one work-item per input element (global size ==
 * number of inputs), so only the first get_global_size(0) / s work-items have
 * a complete run of s inputs. The remaining work-items return immediately:
 * without this guard they would read up to s times past the end of the data.
 */
__kernel void reduce (__global float *input, __global float *output)
{
    size_t gl = get_global_id (0);
    size_t s = get_local_size (0);
    size_t runs = get_global_size (0) / s;   /* number of complete runs of s inputs */
    size_t i;
    float accum = 0;

    /* Work-items beyond the last complete run have nothing valid to read. */
    if (gl >= runs) {
        return;
    }

    for (i=0; i<s; i++) {
        accum += input[s*gl+i];
    }

    output[gl] = accum;
}

这是主程序：

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>

#define N (64*64*64*64)

#include <sys/time.h>
#include <stdlib.h>

/*
 * Current wall-clock time as reported by gettimeofday(), expressed in
 * seconds (whole seconds plus the microsecond part scaled by 1e-6).
 */
double gettime ()
{
    struct timeval now;
    double whole_seconds;
    double microseconds;

    gettimeofday (&now, NULL);
    whole_seconds = (double)now.tv_sec;
    microseconds = (double)now.tv_usec;

    return whole_seconds + (0.000001 * microseconds);
}

int main()
{
    int i, fd, res = 0;
    void* kernel_source = MAP_FAILED;

    cl_context context;
    cl_context_properties properties[3];
    cl_kernel kernel;
    cl_command_queue command_queue;
    cl_program program;
    cl_int err;
    cl_uint num_of_platforms=0;
    cl_platform_id platform_id;
    cl_device_id device_id;
    cl_uint num_of_devices=0;
    cl_mem input, output;
    size_t global, local;

    cl_float *array = malloc (sizeof (cl_float)*N);
    cl_float *array2 = malloc (sizeof (cl_float)*N);
    for (i=0; i<N; i++) array[i] = i;

    fd = open ("kernel.cl", O_RDONLY);
    if (fd == -1) {
        perror ("Cannot open kernel");
        res = 1;
        goto cleanup;
    }
    struct stat s;

    res = fstat (fd, &s);
    if (res == -1) {
        perror ("Cannot stat() kernel");
        res = 1;
        goto cleanup;
    }

    kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (kernel_source == MAP_FAILED) {
        perror ("Cannot map() kernel");
        res = 1;
        goto cleanup;
    }

    if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
        printf("Unable to get platform_id\n");
        res = 1;
        goto cleanup;
    }

    if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
                       &num_of_devices) != CL_SUCCESS)
    { 
        printf("Unable to get device_id\n");
        res = 1;
        goto cleanup;
    }
    properties[0]= CL_CONTEXT_PLATFORM;
    properties[1]= (cl_context_properties) platform_id;
    properties[2]= 0;
    context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
    command_queue = clCreateCommandQueue(context, device_id, 0, &err);
    program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);


    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
        char buffer[4096];
        size_t len;

        printf("Error building program\n");
        clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
        printf ("%s\n", buffer);
        res = 1;
        goto cleanup;
     }

    kernel = clCreateKernel(program, "reduce", &err);
    if (err != CL_SUCCESS) {
        printf("Unable to create kernel\n");
        res = 1;
        goto cleanup;
    }

    // create buffers for the input and ouput
    input = clCreateBuffer(context, CL_MEM_READ_ONLY, 
                            sizeof(cl_float) * N, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
                            sizeof(cl_float) * N, NULL, NULL);

    // load data into the input buffer
    clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, 
                          sizeof(cl_float) * N, array, 0, NULL, NULL);

    size_t size = N;
    cl_mem tmp;
    double time = gettime();
    while (size > 1)
    {
        // set the argument list for the kernel command
        clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
        clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        global = size;
        local = 64;

        // enqueue the kernel command for execution
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
                           &local, 0, NULL, NULL);
        clFinish(command_queue);
        size = size/64;
        tmp = output;
        output = input;
        input = tmp;
    }
    cl_float answer[1];
    clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0, 
                        sizeof(cl_float), array, 0, NULL, NULL);
    time = gettime() - time;
    printf ("%f %f\n", array[0], time);

cleanup:
    free (array);
    free (array2);
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
    if (fd != -1) close (fd);

    _Exit (res); // Kludge
    return res;
}

所以我反复运行内核，直到缓冲区中只剩下一个元素。这是在OpenCL中计算元素之和的正确方法吗？与在CPU上用简单循环求和（用clang 4.0.0和-O2 -ffast-math标志编译）相比，我用gettime测得的时间大约慢10倍。我使用的硬件：AMD Ryzen 5 1600X和AMD Radeon HD 6950。

您可以做一些事情来提高性能

首先，去掉循环中的clFinish调用。它会强制每一次内核执行都依赖于命令队列的整个状态，并在继续之前到达与主机的同步点，这是不必要的。唯一需要的同步是让各次内核按顺序执行；即使使用的是乱序队列（您的程序本来也没有请求乱序队列），也可以通过简单地使用事件对象来保证这一点：

size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
    total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
    // set the argument list for the kernel command
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    global = size;
    local = 64;

    if(event_index == 0)
        // enqueue the kernel command for execution
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
                           &local, 0, NULL, events);
    else
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
                           &local, 1, events + (event_index - 1), events + event_index);
    size = size/64;
    tmp = output;
    output = input;
    input = tmp;
    event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
    clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0, 
                    sizeof(cl_float), array, 0, NULL, NULL);

另一件值得研究的事情是在单个内核中完成归约，而不是把它分散到同一内核的多次调用中。AMD的一篇相关文章就是一个例子（原链接在此处缺失），不过它可能比您需要的更复杂。

感谢您提供了删除clFinish的有用建议。至于AMD的那篇文章，我能够用它来改进内核，使它可以更好地在工作组中分配工作，并利用本地内存。但我仍然觉得那篇文章令人困惑。例如：为什么我需要利用运算的交换律对运算进行重新排序？据我所知，如果各工作项的加载更紧凑、彼此之间没有间隙，效果会更好。对吗？这篇文章谈论的又是什么样的SIMD wavefront（波前）？请查看来自不同GPU制造商（nVidia、AMD、Intel等）的OpenCL优化指南。它们对GPU的工作原理做了非常好的介绍，包括术语。顺便说一句，我找到了链接。非常有用。@shamaz.mazum 该链接已严重过时。AMD 5000系列是旧的VLIW4架构，其计算方式与现代GCN设备完全不同。我建议阅读更新的AMD优化指南，除非你真的经常使用8年以上的GPU……

size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
    total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
    // set the argument list for the kernel command
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    global = size;
    local = 64;

    if(event_index == 0)
        // enqueue the kernel command for execution
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
                           &local, 0, NULL, events);
    else
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, 
                           &local, 1, events + (event_index - 1), events + event_index);
    size = size/64;
    tmp = output;
    output = input;
    input = tmp;
    event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
    clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0, 
                    sizeof(cl_float), array, 0, NULL, NULL);