为什么OpenCL中不同的局部大小会产生不同的结果?

为什么OpenCL中不同的局部大小会产生不同的结果?,c,parallel-processing,opencl,matrix-multiplication,C,Parallel Processing,Opencl,Matrix Multiplication,我正在尝试使用OpenCL执行一个基本的矩阵乘法算法。这两个矩阵的维数应该相等(大小x大小),所以我将问题定义为二维问题,全局大小x大小,我正在测试不同局部大小的情况 内核编写如下: __kernel void matmul( __global unsigned int *a, __global unsigned int *b, __global unsigned int *c ) { int row, col, i, size; unsigned int

我正在尝试使用OpenCL实现一个基本的矩阵乘法算法。两个矩阵都是方阵且维数相同(size x size),所以我把问题定义为二维问题,全局大小为 size x size,并测试不同局部大小下的运行情况。

内核编写如下:

/*
 * Naive square matrix multiplication: c = a * b.
 *
 * Each work-item computes one element of c. The element's row comes from
 * global id 0 and its column from global id 1. The matrix dimension is not
 * passed in; it is taken from get_global_size(0), so the kernel must be
 * enqueued over a 2-D range whose extent in dimension 0 equals the matrix
 * dimension. All three buffers are indexed as row * size + column.
 *
 * a, b: input matrices (read only).
 * c:    output matrix; element (row, col) is overwritten.
 *
 * Products and the running sum use unsigned int arithmetic, so they wrap
 * modulo 2^32 on overflow.
 */
__kernel void matmul(
    __global const unsigned int *a,
    __global const unsigned int *b,
    __global unsigned int *c
) {
    /* Keep ids and offsets in size_t, the type the work-item functions
     * return, so index arithmetic is not narrowed to a signed int. */
    const size_t row = get_global_id(0);
    const size_t col = get_global_id(1);
    const size_t size = get_global_size(0);
    unsigned int dot = 0;

    for (size_t i = 0; i < size; i++) {
        dot += a[row * size + i] * b[i * size + col];
    }

    c[row * size + col] = dot;
}
\u内核无效matmul(
__全局无符号整数*a,
__全局无符号整数*b,
__全局无符号整数*c
) {
int行、列、i、大小;
无符号整数点;
行=获取全局id(0);
col=获取全局id(1);
大小=获取全局大小(0);
点=0;
对于(i=0;i
如果全局大小和局部大小分别设置为1024x1024和1x1,则结果正确。但是,如果局部大小是2x2或4x4,乘法会得到错误的结果。而当局部大小为8的倍数(如8x8、16x16……)时,乘法结果又没有错误。为什么会这样?

我不知道问题是出在内核代码本身,还是因为我对工作组和工作项的行为理解有误。

完整的主机代码如下所示:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <CL/opencl.h>

#define SIZE (1024)
#define WORKITEMS (4096)
#define LOG_SIZE (2048)

int main(int argc, char *argv[]) {
    int i, j, k, size, errors;

    // Host memory
    cl_uint *a_host = NULL;
    cl_uint *b_host = NULL;
    cl_uint *c_host = NULL;
    cl_uint ref_dot;

    // Device memory
    cl_mem a_device;
    cl_mem b_device;
    cl_mem c_device;

    // Performance measurements
    struct timeval t0, tf;
    float ts, tp, tb;

    // OpenCL variables
    FILE *f;
    size_t f_size;
    size_t global[3] = {0}, local[3] = {0};
    char *buffer = NULL;
    cl_int ret;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    // [1] Initialize application

    // Read command line arguments to configure run
    size = (argc > 1) ? atoi(argv[1]) : SIZE;
    printf("Matrix multiplication with OpenCL (Size = %d)\n", size);

    // Allocate memory for host variables
    a_host = malloc(size * size * sizeof *a_host);
    b_host = malloc(size * size * sizeof *b_host);
    c_host = malloc(size * size * sizeof *c_host);

    // Initialize input arrays
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            a_host[i * size + j] = rand();
            b_host[i * size + j] = rand();
        }
    }

    // [2] Initialize OpenCL environment

    // Get platform
    ret = clGetPlatformIDs(1, &platform, NULL);
    // Get device
    ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

    // Create context
    context = clCreateContext(0, 1, &device, NULL, NULL, &ret);

    // Create command queue
    queue = clCreateCommandQueueWithProperties(context, device, 0, &ret);

    // [3] Compile OpenCL kernel
    f = fopen("kernel.cl", "rb");
    fseek(f, 0, SEEK_END);
    f_size = ftell(f);
    rewind(f);

    // Read file into memory
    buffer = malloc(f_size + 1);
    buffer[f_size] = '\0';
    fread(buffer, sizeof(char), f_size, f);
    fclose(f);

    // Create program
    printf("<OpenCL> Kernel source:\n%s", buffer);
    program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &f_size, &ret);

    // Build program
    printf("<OpenCL> Building kernel...\n");
    gettimeofday(&t0, NULL);
    ret = clBuildProgram(program, 0, NULL, "-cl-std=CL2.0", NULL, NULL);
    gettimeofday(&tf, NULL);
    tb = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
    printf("Build time: %.3f ms\n", tb);

    // Print build log (optional)
    char log[LOG_SIZE];
    ret = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, LOG_SIZE, log, NULL);
    printf("<OpenCL> Kernel build log:\n%s\n", log);

    // [4] Configure OpenCL kernel

    // Create kernel
    kernel = clCreateKernel(program, "matmul", &ret);

    // Create device buffers
    a_device = clCreateBuffer(context, CL_MEM_READ_ONLY, size * size * sizeof *a_host, NULL, &ret);
    b_device = clCreateBuffer(context, CL_MEM_READ_ONLY, size * size * sizeof *b_host, NULL, &ret);
    c_device = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size * size * sizeof *c_host, NULL, &ret);

    // Set kernel parameters
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_device);
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_device);
    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_device);

    // [5] Execute kernel
    printf("<OpenCL> Executing kernel...\n");
    gettimeofday(&t0, NULL);

    // Write data from host to device
    ret = clEnqueueWriteBuffer(queue, a_device, CL_TRUE, 0, size * size * sizeof *a_host, a_host, 0, NULL, NULL);
    ret |= clEnqueueWriteBuffer(queue, b_device, CL_TRUE, 0, size * size * sizeof *b_host, b_host, 0, NULL, NULL);

    // Enqueue kernel for execution
    global[0] = size;
    global[1] = size;
    local[0] = 2;
    local[1] = 2;
    ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);

    // Wait for kernel to finish
    ret = clFinish(queue);

    // Read data from device to host
    ret = clEnqueueReadBuffer(queue, c_device, CL_TRUE, 0, size * size * sizeof *c_host, c_host, 0, NULL, NULL);

    gettimeofday(&tf, NULL);
    tp = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
    printf("[PAR] Execution time: %.3f ms\n", tp);

    // [6] Print results, perform checks

    // Compute golden reference and check errors

    gettimeofday(&t0, NULL);
    errors = 0;

    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            ref_dot = 0;
            for (k = 0; k < size; k++) {
                ref_dot += a_host[i * size + k] * b_host[k * size + j];
            }

            if (ref_dot != c_host[i * size + j]) {
                errors++;
            }
        }
    }

    gettimeofday(&tf, NULL);
    ts = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
    printf("[SEQ] Execution time : %.3f ms\n", ts);
    printf("Found %d error%s\n", errors, (errors == 1) ? "" : "s");

    // [7] Cleanup system

    // Cleanup host variables
    free(a_host);
    free(b_host);
    free(c_host);
    free(buffer);

    // Cleanup OpenCL
    clReleaseMemObject(a_device);
    clReleaseMemObject(b_device);
    clReleaseMemObject(c_device);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}
#包括
#包括
#包括
#包括
#包括
#定义大小(1024)
#定义工作项(4096)
#定义日志大小(2048)
int main(int argc,char*argv[]){
int i,j,k,大小,误差;
//主机存储器
cl_uint*a_host=NULL;
cl_uint*b_host=NULL;
cl_uint*c_host=NULL;
cl_uint参考点;
//设备存储器
cl_mem a_装置;
cl_mem b_装置;
cl_mem c_装置;
//性能测量
结构时间值t0,tf;
浮点数ts、tp、tb;
//OpenCL变量
文件*f;
大小;
大小\u t全局[3]={0},局部[3]={0};
char*buffer=NULL;
cl_int ret;
cl_平台\u id平台;
cl_设备\u id设备;
语境;
cl_命令_队列;
CLU计划;
cl_核;
//[1]初始化应用程序
//读取命令行参数以配置运行
大小=(argc>1)?atoi(argv[1]):大小;
printf(“与OpenCL的矩阵相乘(大小=%d)\n”,大小);
//为主机变量分配内存
a_host=malloc(size*size*sizeof*a_host);
b_主机=malloc(大小*大小*大小*大小*b_主机);
c_主机=malloc(大小*大小*大小*大小*c_主机);
//初始化输入数组
对于(i=0;i