Opencl 内核未保留对全局cl_mem对象的写入
我有一个在多个设备之间共享内存对象的设置。我试图多次调用内核来积累一些值。在CPU上,这工作正常。在GPU上,后续调用看不到上一次调用的结果。这是带有调试语句的内核Opencl 内核未保留对全局cl_mem对象的写入,opencl,Opencl,我有一个在多个设备之间共享内存对象的设置。我试图多次调用内核来积累一些值。在CPU上,这工作正常。在GPU上,后续调用看不到上一次调用的结果。这是带有调试语句的内核 #pragma OPENCL EXTENSION cl_khr_fp64 : enable __kernel void mgrid_sum(__global double *ar, __global double *ap,
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void mgrid_sum(__global double *ar,
__global double *ap,
__global double *az,
__global const double *temp_ar,
__global const double *temp_ap,
__global const double *temp_az,
double current) {
size_t i = get_global_id(0);
if (i == 0) {
printf("1 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
}
ar[i] += temp_ar[i]*current;
ap[i] += temp_ap[i]*current;
az[i] += temp_az[i]*current;
if (i == 0) {
printf("2 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
}
}
我有两组内存对象。在每次内核调用之前,我的temp
内存缓冲区将加载新值。因为每个设备都在独立的内存块上工作,所以我希望在完成所有内核调用之前不需要同步内存缓冲区。主机代码是
// Define memory objects. The host only needs these values at the very end.
a_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_p = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
// Define temp memory objects. This will only be written to by the host and read by the kernel.
cl_mem temp_ar = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_ap = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_az = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
// The first six kernel arguments don't change.
clSetKernelArg(sum, 0, sizeof(cl_mem), &a_r);
clSetKernelArg(sum, 1, sizeof(cl_mem), &a_p);
clSetKernelArg(sum, 2, sizeof(cl_mem), &a_z);
clSetKernelArg(sum, 3, sizeof(cl_mem), &temp_ar);
clSetKernelArg(sum, 4, sizeof(cl_mem), &temp_ap);
clSetKernelArg(sum, 5, sizeof(cl_mem), &temp_az);
size_t totalsize = 0;
for (device_info *device : cpu_devices) {
size_t worksize;
clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
device->max_work_group_size = worksize;
totalsize += worksize;
}
for (device_info *device : gpu_devices) {
size_t worksize;
clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
device->max_work_group_size = worksize;
totalsize += worksize;
}
size_t n_chunks = array_size/totalsize;
size_t remainder = array_size%totalsize;
size_t offset = 0;
for (device_info *device : gpu_devices) {
device->m_offset = offset;
device->m_chunk = n_chunks*device->max_work_group_size;
offset += device->m_chunk;
}
for (device_info *device : cpu_devices) {
device->m_offset = offset;
device->m_chunk = n_chunks*device->max_work_group_size;
offset += device->m_chunk;
}
cpu_devices.back()->m_chunk += remainder;
cl_event event;
std::vector<cl_event> buffer_events;
std::vector<cl_event> unmap_events;
// Make sure the a_ memory starts with zero.
const cl_char pattern = 0;
for (device_info *device : gpu_devices) {
offset = device->m_offset*sizeof(cl_double);
size_t fill_size = device->m_chunk*sizeof(cl_double);
clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
}
for (device_info *device : cpu_devices) {
offset = device->m_offset*sizeof(cl_double);
size_t fill_size = device->m_chunk*sizeof(cl_double);
clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
}
// For each iteration load the value of current set the temp_ buffer value.
// Temp buffer values are obtained by mapping the memory object to the host
// and writing to the memory object directly.
for (size_t i = 0, e = extcur.size(); i < e; i++) {
clSetKernelArg(sum, 6, sizeof(cl_double), extcur.data() + i);
cl_command_queue cpu_queue = cpu_devices.back()->queue;
std::stringstream ss_ar;
ss_ar << "ar_" << std::setfill('0') << std::setw(3) << i + 1;
std::stringstream ss_ap;
ss_ap << "ap_" << std::setfill('0') << std::setw(3) << i + 1;
std::stringstream ss_az;
ss_az << "az_" << std::setfill('0') << std::setw(3) << i + 1;
cl_double *temp_buffer;
nc_inq_varid(ncid, ss_ar.str().c_str(), &temp_varid);
temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_ar, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
nc_get_var(ncid, temp_varid, temp_buffer);
clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
buffer_events.push_back(event);
nc_inq_varid(ncid, ss_ap.str().c_str(), &temp_varid);
temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_ap, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
nc_get_var(ncid, temp_varid, temp_buffer);
buffer_events.push_back(event);
clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
buffer_events.push_back(event);
nc_inq_varid(ncid, ss_az.str().c_str(), &temp_varid);
temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_az, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
nc_get_var(ncid, temp_varid, temp_buffer);
buffer_events.push_back(event);
clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
buffer_events.push_back(event);
for (cl_event event : last_events) {
clReleaseEvent(event);
}
last_events.clear();
// Call kernels. These should wait until the memory objects are fully written.
for (device_info *device : gpu_devices) {
offset = device->m_offset;
size_t fill_size = device->m_chunk;
clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &fill_size, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
last_events.push_back(event);
offset += fill_size;
}
for (device_info *device : cpu_devices) {
offset = device->m_offset;
size_t fill_size = device->m_chunk;
clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &fill_size, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
last_events.push_back(event);
}
for (cl_event event : buffer_events) {
clReleaseEvent(event);
}
buffer_events.clear();
}
它是否在调用之间进行隐式同步
更新
以下是有关这些设备的信息
Platform Name : Apple
Device Name : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device Type : CPU
Supports Double : True
Address Bits : 64
Max Work Size : 1
Extensions : cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing
cl_khr_gl_event
cl_khr_fp64
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_byte_addressable_store
cl_khr_int64_base_atomics
cl_khr_int64_extended_atomics
cl_khr_3d_image_writes
cl_khr_image2d_from_buffer
cl_APPLE_fp64_basic_ops
cl_APPLE_fixed_alpha_channel_orders
cl_APPLE_biased_fixed_point_image_formats
cl_APPLE_command_queue_priority
m_offset : 731392
m_chunk : 3080
Platform Name : Apple
Device Name : AMD Radeon R9 M370X Compute Engine
Device Type : GPU
Supports Double : True
Address Bits : 32
Max Work Size : 256
Extensions : cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing
cl_khr_gl_event
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_byte_addressable_store
cl_khr_image2d_from_buffer
cl_khr_depth_images
cl_APPLE_command_queue_priority
cl_APPLE_command_queue_select_compute_units
cl_khr_fp64
m_offset : 0
m_chunk : 731392
我知道发生了什么。我需要添加显式调用来跨队列使用事件。各国的文件: 若要将引用命令队列中排队的命令的事件对象用作事件对象,以供其他命令队列中排队的命令等待,应用程序必须调用clFlush或任何执行命令队列隐式刷新的阻塞命令,其中引用这些事件对象的命令已排队
关于
cl\u事件
的文档中不清楚您的GPU是否支持cl\u khr\u fp64扩展?我添加了有关设备扩展的信息。nc\u get\u var()在做什么?这是一个从netcdf文件加载数组的netcdf例程。我认为您缺少(auto&ev:last_events)clWaitForEvents(1,&ev)的代码>为gpu和cpu设备执行所有clEnqueueNDRangeKernel()
之后。在将新数据设置为temp_-ar、temp_-ap和temp_-az之前,您需要确保内核完成了对以前数据的处理。顺便说一句,您在temp\u ar
上呼叫了3次clEnqueueUnmapMemObject()。
Platform Name : Apple
Device Name : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device Type : CPU
Supports Double : True
Address Bits : 64
Max Work Size : 1
Extensions : cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing
cl_khr_gl_event
cl_khr_fp64
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_byte_addressable_store
cl_khr_int64_base_atomics
cl_khr_int64_extended_atomics
cl_khr_3d_image_writes
cl_khr_image2d_from_buffer
cl_APPLE_fp64_basic_ops
cl_APPLE_fixed_alpha_channel_orders
cl_APPLE_biased_fixed_point_image_formats
cl_APPLE_command_queue_priority
m_offset : 731392
m_chunk : 3080
Platform Name : Apple
Device Name : AMD Radeon R9 M370X Compute Engine
Device Type : GPU
Supports Double : True
Address Bits : 32
Max Work Size : 256
Extensions : cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing
cl_khr_gl_event
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_byte_addressable_store
cl_khr_image2d_from_buffer
cl_khr_depth_images
cl_APPLE_command_queue_priority
cl_APPLE_command_queue_select_compute_units
cl_khr_fp64
m_offset : 0
m_chunk : 731392