为什么OpenCL中不同的局部大小会产生不同的结果?
我正在尝试使用OpenCL实现一个基本的矩阵乘法算法。两个矩阵都是方阵,且维数相同(size × size),因此我把问题定义为二维问题,全局大小为 size × size,并测试了不同局部大小的情况。(标签:c, parallel-processing, opencl, matrix-multiplication)内核编写如下:
/*
 * Naive square matrix product c = a * b on unsigned 32-bit integers.
 *
 * Each work-item produces one element of c: dimension 0 of the NDRange
 * selects the row and dimension 1 selects the column. The matrix edge
 * length is taken from the global size of dimension 0, and all three
 * buffers are indexed as row-major arrays with that edge length.
 * The accumulator is unsigned, so products and sums wrap modulo 2^32.
 */
__kernel void matmul(
    __global unsigned int *a,
    __global unsigned int *b,
    __global unsigned int *c
) {
    const int n = get_global_size(0);
    const int r = get_global_id(0);
    const int k = get_global_id(1);

    unsigned int acc = 0;
    int step = 0;

    /* Walk along row r of a and down column k of b. */
    while (step < n) {
        acc += a[r * n + step] * b[step * n + k];
        ++step;
    }

    c[r * n + k] = acc;
}
\u内核无效matmul(
__全局无符号整数*a,
__全局无符号整数*b,
__全局无符号整数*c
) {
int行、列、i、大小;
无符号整数点;
行=获取全局id(0);
col=获取全局id(1);
大小=获取全局大小(0);
点=0;
对于(i=0;i
如果全局大小和局部大小分别设置为1024x1024和1x1,则结果正确。但是,如果局部大小是2x2或4x4,乘法会得到错误的结果。而对于8的倍数的局部大小,如8x8、16x16……,乘法又没有错误。为什么会这样?
我不知道问题是出在内核的代码上,还是因为我对工作组或工作项的行为理解有误。
完整的主机代码如下所示:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <CL/opencl.h>
#define SIZE (1024)
#define WORKITEMS (4096)
#define LOG_SIZE (2048)
int main(int argc, char *argv[]) {
int i, j, k, size, errors;
// Host memory
cl_uint *a_host = NULL;
cl_uint *b_host = NULL;
cl_uint *c_host = NULL;
cl_uint ref_dot;
// Device memory
cl_mem a_device;
cl_mem b_device;
cl_mem c_device;
// Performance measurements
struct timeval t0, tf;
float ts, tp, tb;
// OpenCL variables
FILE *f;
size_t f_size;
size_t global[3] = {0}, local[3] = {0};
char *buffer = NULL;
cl_int ret;
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
// [1] Initialize application
// Read command line arguments to configure run
size = (argc > 1) ? atoi(argv[1]) : SIZE;
printf("Matrix multiplication with OpenCL (Size = %d)\n", size);
// Allocate memory for host variables
a_host = malloc(size * size * sizeof *a_host);
b_host = malloc(size * size * sizeof *b_host);
c_host = malloc(size * size * sizeof *c_host);
// Initialize input arrays
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
a_host[i * size + j] = rand();
b_host[i * size + j] = rand();
}
}
// [2] Initialize OpenCL environment
// Get platform
ret = clGetPlatformIDs(1, &platform, NULL);
// Get device
ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
// Create context
context = clCreateContext(0, 1, &device, NULL, NULL, &ret);
// Create command queue
queue = clCreateCommandQueueWithProperties(context, device, 0, &ret);
// [3] Compile OpenCL kernel
f = fopen("kernel.cl", "rb");
fseek(f, 0, SEEK_END);
f_size = ftell(f);
rewind(f);
// Read file into memory
buffer = malloc(f_size + 1);
buffer[f_size] = '\0';
fread(buffer, sizeof(char), f_size, f);
fclose(f);
// Create program
printf("<OpenCL> Kernel source:\n%s", buffer);
program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &f_size, &ret);
// Build program
printf("<OpenCL> Building kernel...\n");
gettimeofday(&t0, NULL);
ret = clBuildProgram(program, 0, NULL, "-cl-std=CL2.0", NULL, NULL);
gettimeofday(&tf, NULL);
tb = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
printf("Build time: %.3f ms\n", tb);
// Print build log (optional)
char log[LOG_SIZE];
ret = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, LOG_SIZE, log, NULL);
printf("<OpenCL> Kernel build log:\n%s\n", log);
// [4] Configure OpenCL kernel
// Create kernel
kernel = clCreateKernel(program, "matmul", &ret);
// Create device buffers
a_device = clCreateBuffer(context, CL_MEM_READ_ONLY, size * size * sizeof *a_host, NULL, &ret);
b_device = clCreateBuffer(context, CL_MEM_READ_ONLY, size * size * sizeof *b_host, NULL, &ret);
c_device = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size * size * sizeof *c_host, NULL, &ret);
// Set kernel parameters
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_device);
ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_device);
ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_device);
// [5] Execute kernel
printf("<OpenCL> Executing kernel...\n");
gettimeofday(&t0, NULL);
// Write data from host to device
ret = clEnqueueWriteBuffer(queue, a_device, CL_TRUE, 0, size * size * sizeof *a_host, a_host, 0, NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, b_device, CL_TRUE, 0, size * size * sizeof *b_host, b_host, 0, NULL, NULL);
// Enqueue kernel for execution
global[0] = size;
global[1] = size;
local[0] = 2;
local[1] = 2;
ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
// Wait for kernel to finish
ret = clFinish(queue);
// Read data from device to host
ret = clEnqueueReadBuffer(queue, c_device, CL_TRUE, 0, size * size * sizeof *c_host, c_host, 0, NULL, NULL);
gettimeofday(&tf, NULL);
tp = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
printf("[PAR] Execution time: %.3f ms\n", tp);
// [6] Print results, perform checks
// Compute golden reference and check errors
gettimeofday(&t0, NULL);
errors = 0;
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
ref_dot = 0;
for (k = 0; k < size; k++) {
ref_dot += a_host[i * size + k] * b_host[k * size + j];
}
if (ref_dot != c_host[i * size + j]) {
errors++;
}
}
}
gettimeofday(&tf, NULL);
ts = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
printf("[SEQ] Execution time : %.3f ms\n", ts);
printf("Found %d error%s\n", errors, (errors == 1) ? "" : "s");
// [7] Cleanup system
// Cleanup host variables
free(a_host);
free(b_host);
free(c_host);
free(buffer);
// Cleanup OpenCL
clReleaseMemObject(a_device);
clReleaseMemObject(b_device);
clReleaseMemObject(c_device);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
#包括
#包括
#包括
#包括
#包括
#定义大小(1024)
#定义工作项(4096)
#定义日志大小(2048)
int main(int argc,char*argv[]){
int i,j,k,大小,误差;
//主机存储器
cl_uint*a_host=NULL;
cl_uint*b_host=NULL;
cl_uint*c_host=NULL;
cl_uint参考点;
//设备存储器
cl_mem a_装置;
cl_mem b_装置;
cl_mem c_装置;
//性能测量
结构时间值t0,tf;
浮点数ts、tp、tb;
//OpenCL变量
文件*f;
大小;
大小\u t全局[3]={0},局部[3]={0};
char*buffer=NULL;
cl_int ret;
cl_平台\u id平台;
cl_设备\u id设备;
语境;
cl_命令_队列;
CLU计划;
cl_核;
//[1]初始化应用程序
//读取命令行参数以配置运行
大小=(argc>1)?atoi(argv[1]):大小;
printf(“与OpenCL的矩阵相乘(大小=%d)\n”,大小);
//为主机变量分配内存
a_host=malloc(size*size*sizeof*a_host);
b_主机=malloc(大小*大小*大小*大小*b_主机);
c_主机=malloc(大小*大小*大小*大小*c_主机);
//初始化输入数组
对于(i=0;i