Opencl 优化主机到GPU的传输

Opencl 优化主机到GPU的传输,opencl,gpu,Opencl,Gpu,我正在使用OpenCL(矩阵乘法的一种变体)将工作转移到GPU上。矩阵代码本身工作得非常好，但是将数据移动到GPU的成本太高了。我已经从使用 clEnqueueRead/clEnqueueWrite 转移到内存映射缓冲区，如下所示: d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(char) * queryVector_size,

我正在使用OpenCL(矩阵乘法的一种变体)将工作转移到GPU上。矩阵代码本身工作得非常好,但是将数据移动到GPU的成本太高了

我已经从使用 clEnqueueRead/clEnqueueWrite 转移到内存映射缓冲区，如下所示：

d_a  = clCreateBuffer(context,  CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
                    sizeof(char) * queryVector_size,
                    NULL, NULL);
checkErr(err,"Buf A");

d_b  = clCreateBuffer(context,  CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
                    sizeof(char) * segment_size,
                     NULL, NULL);

checkErr(err,"Buf B");




err  = clSetKernelArg(ko_smat, 0, sizeof(cl_mem), &d_c);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 1, sizeof(cl_mem), &d_a);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 2, sizeof(cl_mem), &d_b);
checkErr(err,"Compute Kernel");

  query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);
 checkErr(err,"Write A");

 segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);
    checkErr(err,"Write B");

     // code which initialises buffers using ptrs (segment_data and queryV)

  err = clEnqueueUnmapMemObject(commands,
                             d_a,
                      query_vector, 0, NULL, NULL);
 checkErr(err,"Unmap Buffer");

  err = clEnqueueUnmapMemObject(commands,
                       d_b,
                      segment_data, 0, NULL, NULL);
 checkErr(err,"Unmap Buff");
 err = clEnqueueNDRangeKernel(commands, ko_smat, 2, NULL, globalWorkItems, localWorkItems, 0, NULL, NULL);

 err = clFinish(commands);
 checkErr(err, "Execute Kernel");

     result = (char*) clEnqueueMapBuffer(commands, d_c, CL_TRUE,CL_MAP_WRITE, 0, sizeof(char) * result_size, 0, NULL, NULL, &err);
     checkErr(err,"Write C");

  printMatrix(result, result_row, result_col);
Create input buffers
Create output buffers
Map input buffers
Write input data
Unmap input buffers
Enqueue kernel
Map output buffers
Read output data
Unmap output buffers
当我使用 clEnqueueRead/clEnqueueWrite 方法并通过它初始化 d_a、d_b、d_c 时，这段代码运行良好；但当我改用映射缓冲区（mapped buffer）时，内核运行时 d_a 和 d_b 读到的全是 0，因此结果也为 0。

映射/取消映射缓冲区的适当方式是什么

编辑: 核心问题似乎就在这里

  segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_width * segment_length, 0, NULL, NULL, &err);

  // INITIALISE

  printMatrix(segment_data, segment_length, segment_width);

  // ALL GOOD    

   err = clEnqueueUnmapMemObject(commands,
                           d_b,
                          segment_data, 0, NULL, NULL);
  checkErr(err,"Unmap Buff");

   segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_width * segment_length, 0\
, NULL, NULL, &err);

   printMatrix(segment_data, segment_length, segment_width);

   // ALL ZEROs again

第一个printMatrix()返回正确的输出,一旦我取消映射并重新映射它,段_数据将变为所有0(它的初始值)。我怀疑我在某处使用了错误的标志?但是我不知道在哪里。

来自OpenCL 1.2规范:

5.4.3访问内存对象的映射区域

如果内存对象当前映射为读取,则应用程序必须确保在写入此内存对象或其任何关联内存对象(子缓冲区或1D图像缓冲区对象)或其父对象的任何排队内核或命令之前取消映射内存对象(如果内存对象是子缓冲区或1D图像缓冲区对象)开始执行;否则行为未定义

因此，您需要在内核排队之后再映射结果缓冲区；同样，您需要在内核排队之前取消映射输入缓冲区。映射/取消映射缓冲区的时间线大致如下：

d_a  = clCreateBuffer(context,  CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
                    sizeof(char) * queryVector_size,
                    NULL, NULL);
checkErr(err,"Buf A");

d_b  = clCreateBuffer(context,  CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
                    sizeof(char) * segment_size,
                     NULL, NULL);

checkErr(err,"Buf B");




err  = clSetKernelArg(ko_smat, 0, sizeof(cl_mem), &d_c);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 1, sizeof(cl_mem), &d_a);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 2, sizeof(cl_mem), &d_b);
checkErr(err,"Compute Kernel");

  query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);
 checkErr(err,"Write A");

 segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);
    checkErr(err,"Write B");

     // code which initialises buffers using ptrs (segment_data and queryV)

  err = clEnqueueUnmapMemObject(commands,
                             d_a,
                      query_vector, 0, NULL, NULL);
 checkErr(err,"Unmap Buffer");

  err = clEnqueueUnmapMemObject(commands,
                       d_b,
                      segment_data, 0, NULL, NULL);
 checkErr(err,"Unmap Buff");
 err = clEnqueueNDRangeKernel(commands, ko_smat, 2, NULL, globalWorkItems, localWorkItems, 0, NULL, NULL);

 err = clFinish(commands);
 checkErr(err, "Execute Kernel");

     result = (char*) clEnqueueMapBuffer(commands, d_c, CL_TRUE,CL_MAP_WRITE, 0, sizeof(char) * result_size, 0, NULL, NULL, &err);
     checkErr(err,"Write C");

  printMatrix(result, result_row, result_col);
Create input buffers
Create output buffers
Map input buffers
Write input data
Unmap input buffers
Enqueue kernel
Map output buffers
Read output data
Unmap output buffers

显然,提高代码速度的最佳方法是使用映射缓冲区。您可以使用CL_MEM_ALLOC_HOST_PTR创建缓冲区,这基本上可以通过启动DMA传输减轻CPU的传输负担

以下是使用映射缓冲区的示例:

/* Plain host-side array that will receive the final results. */
int *host_ptr = malloc(size * sizeof(int));

/* Pinned buffer (CL_MEM_ALLOC_HOST_PTR) so the runtime can DMA directly. */
d_mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       size * sizeof(cl_int), NULL, &ret);

/* Map for host WRITING, fill with 0..size-1, then unmap to hand the data
 * over to the device. */
int *map_ptr = clEnqueueMapBuffer(command_queue, d_mem, CL_TRUE, CL_MAP_WRITE,
                                  0, size * sizeof(int), 0, NULL, NULL, &ret);
for (i = 0; i < size; i++)
  map_ptr[i] = i;

ret = clEnqueueUnmapMemObject(command_queue, d_mem, map_ptr, 0, NULL, NULL);

/* Bind the buffer to the kernel and launch over `size` work-items. */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_mem);

size_t global_work[1] = { size };
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             global_work, NULL, 0, 0, NULL);

/* Map again, this time for host READING, and copy the results out. */
map_ptr = clEnqueueMapBuffer(command_queue, d_mem, CL_TRUE, CL_MAP_READ,
                             0, size * sizeof(int), 0, NULL, NULL, &ret);
for (i = 0; i < size; i++)
  host_ptr[i] = map_ptr[i];

ret = clEnqueueUnmapMemObject(command_queue, d_mem, map_ptr, 0, NULL, NULL);

/* cl finish etc */
（注：上面示例代码的中文机器翻译在抓取时已损坏，此处按原意恢复其要点：先用 malloc 分配保存结果的主机指针；用 CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR 创建缓冲区 d_mem；用 clEnqueueMapBuffer 以 CL_MAP_WRITE 标志映射，在循环中初始化数据 map_ptr[i] = i；随后取消映射，把数据交给设备。）

缓冲区以 CL_MAP_READ 标志映射，却被用来写入。与创建缓冲区时不同，这些映射标志描述的不是设备对内存的视图，而是主机的视图，因此初始化数据时应使用 CL_MAP_WRITE 标志映射；否则在取消映射、内存重新一致时，主机所做的任何更改都会被丢弃。

此外，如果您要运行多个内核，还有另一个优化点：将数据传输与内核计算重叠。许多高端 GPU 具有双 DMA 引擎，可以同时进行上传、下载和计算；通过这样的重叠操作，您只需为其中最耗时的操作付出时间。

（评论）当我这样做时（这是我最初做的），我得到的输出系统地都是 null。@user1018513 你的意思是
clEnqueueMapBuffer
返回
NULL
?如果是,它返回的错误代码是什么?我已经编辑了上面的代码。很抱歉措辞不当。我的内核认为d_a和d_b到处都是0(我在取消映射之前打印出查询向量和段数据的内容,并正确初始化),这反过来会导致结果为0。任何地方都不会抛出错误。如果我手动将结果强制为内核中的任意值,我可以读回正确的值。类似地,如果我取消映射/重新映射d_a,则第二次d_a都是0。这是我最初所做的,但当我这样做时,map_ptr的内容系统地为空。