Opencl 内核未保留对全局cl_mem对象的写入

Opencl 内核未保留对全局cl_mem对象的写入,opencl,Opencl,我有一个在多个设备之间共享内存对象的设置。我试图多次调用内核来积累一些值。在CPU上,这工作正常。在GPU上,后续调用看不到上一次调用的结果。这是带有调试语句的内核 #pragma OPENCL EXTENSION cl_khr_fp64 : enable __kernel void mgrid_sum(__global double *ar, __global double *ap,

我有一个在多个设备之间共享内存对象的设置。我试图多次调用内核来积累一些值。在CPU上,这工作正常。在GPU上,后续调用看不到上一次调用的结果。这是带有调试语句的内核

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void mgrid_sum(__global double *ar,            
                        __global double *ap,           
                        __global double *az,            
                        __global const double *temp_ar,
                        __global const double *temp_ap,
                        __global const double *temp_az,
                        double current) {
    size_t i = get_global_id(0);
    if (i == 0) {
        printf("1 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
    }
    ar[i] += temp_ar[i]*current;
    ap[i] += temp_ap[i]*current;
    az[i] += temp_az[i]*current;
    if (i == 0) {
        printf("2 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
    }
}
我有两组内存对象。在每次内核调用之前,我的
temp
内存缓冲区将加载新值。因为每个设备都在独立的内存块上工作,所以我希望在完成所有内核调用之前不需要同步内存缓冲区。主机代码是

// Define memory objects. The host only needs these values at the very end.
a_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_p = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);

// Define temp memory objects. This will only be written to by the host and read by the kernel.
cl_mem temp_ar = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_ap = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_az = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);

// The first six kernel arguments don't change.
clSetKernelArg(sum, 0, sizeof(cl_mem), &a_r);
clSetKernelArg(sum, 1, sizeof(cl_mem), &a_p);
clSetKernelArg(sum, 2, sizeof(cl_mem), &a_z);
clSetKernelArg(sum, 3, sizeof(cl_mem), &temp_ar);
clSetKernelArg(sum, 4, sizeof(cl_mem), &temp_ap);
clSetKernelArg(sum, 5, sizeof(cl_mem), &temp_az);

size_t totalsize = 0;
for (device_info *device : cpu_devices) {
    size_t worksize;
    clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
    device->max_work_group_size = worksize;
    totalsize += worksize;
}
for (device_info *device : gpu_devices) {
    size_t worksize;
    clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
    device->max_work_group_size = worksize;
    totalsize += worksize;
}

size_t n_chunks = array_size/totalsize;
size_t remainder = array_size%totalsize;

size_t offset = 0;
for (device_info *device : gpu_devices) {
    device->m_offset = offset;
    device->m_chunk = n_chunks*device->max_work_group_size;
    offset += device->m_chunk;
}
for (device_info *device : cpu_devices) {
    device->m_offset = offset;
    device->m_chunk = n_chunks*device->max_work_group_size;
    offset += device->m_chunk;
}
cpu_devices.back()->m_chunk += remainder;

cl_event event;
std::vector<cl_event> buffer_events;
std::vector<cl_event> unmap_events;

// Make sure the a_ memory starts with zero.
const cl_char pattern = 0;
for (device_info *device : gpu_devices) {
    offset = device->m_offset*sizeof(cl_double);
    size_t fill_size = device->m_chunk*sizeof(cl_double);
    clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
}
for (device_info *device : cpu_devices) {
    offset = device->m_offset*sizeof(cl_double);
    size_t fill_size = device->m_chunk*sizeof(cl_double);
    clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
}

// For each iteration load the value of current set the temp_ buffer value.
// Temp buffer values are obtained by mapping the memory object to the host 
// and writing to the memory object directly.
for (size_t i = 0, e = extcur.size(); i < e; i++) {
    clSetKernelArg(sum, 6, sizeof(cl_double), extcur.data() + i);

    cl_command_queue cpu_queue = cpu_devices.back()->queue;

    std::stringstream ss_ar;
    ss_ar << "ar_" << std::setfill('0') << std::setw(3) << i + 1;
    std::stringstream ss_ap;
    ss_ap << "ap_" << std::setfill('0') << std::setw(3) << i + 1;
    std::stringstream ss_az;
    ss_az << "az_" << std::setfill('0') << std::setw(3) << i + 1;

    cl_double *temp_buffer;
    nc_inq_varid(ncid, ss_ar.str().c_str(), &temp_varid);
    temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_ar, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
    nc_get_var(ncid, temp_varid, temp_buffer);
    clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
    buffer_events.push_back(event);

    nc_inq_varid(ncid, ss_ap.str().c_str(), &temp_varid);
    temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_ap, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
    nc_get_var(ncid, temp_varid, temp_buffer);
    buffer_events.push_back(event);
    clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
    buffer_events.push_back(event);

    nc_inq_varid(ncid, ss_az.str().c_str(), &temp_varid);
    temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_az, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
    nc_get_var(ncid, temp_varid, temp_buffer);
    buffer_events.push_back(event);
    clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
    buffer_events.push_back(event);

    for (cl_event event : last_events) {
        clReleaseEvent(event);
    }
    last_events.clear();

    // Call kernels. These should wait until the memory objects are fully written.
    for (device_info *device : gpu_devices) {
        offset = device->m_offset;
        size_t fill_size = device->m_chunk;
        clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &fill_size, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
        last_events.push_back(event);
        offset += fill_size;
    }
    for (device_info *device : cpu_devices) {
        offset = device->m_offset;
        size_t fill_size = device->m_chunk;
        clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &fill_size, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
        last_events.push_back(event);
    }

    for (cl_event event : buffer_events) {
        clReleaseEvent(event);
    }
    buffer_events.clear();
}
它是否在调用之间进行隐式同步

更新

以下是有关这些设备的信息

Platform Name   : Apple
Device Name     : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device Type     : CPU
Supports Double : True
Address Bits    : 64
Max Work Size   : 1
Extensions      : cl_APPLE_SetMemObjectDestructor  
                  cl_APPLE_ContextLoggingFunctions 
                  cl_APPLE_clut 
                  cl_APPLE_query_kernel_names 
                  cl_APPLE_gl_sharing 
                  cl_khr_gl_event 
                  cl_khr_fp64             
                  cl_khr_global_int32_base_atomics 
                  cl_khr_global_int32_extended_atomics 
                  cl_khr_local_int32_base_atomics  
                  cl_khr_local_int32_extended_atomics 
                  cl_khr_byte_addressable_store 
                  cl_khr_int64_base_atomics 
                  cl_khr_int64_extended_atomics 
                  cl_khr_3d_image_writes 
                  cl_khr_image2d_from_buffer 
                  cl_APPLE_fp64_basic_ops 
                  cl_APPLE_fixed_alpha_channel_orders 
                  cl_APPLE_biased_fixed_point_image_formats 
                  cl_APPLE_command_queue_priority
m_offset        : 731392
m_chunk         : 3080

Platform Name   : Apple
Device Name     : AMD Radeon R9 M370X Compute Engine
Device Type     : GPU
Supports Double : True
Address Bits    : 32
Max Work Size   : 256
Extensions      : cl_APPLE_SetMemObjectDestructor 
                  cl_APPLE_ContextLoggingFunctions 
                  cl_APPLE_clut 
                  cl_APPLE_query_kernel_names 
                  cl_APPLE_gl_sharing 
                  cl_khr_gl_event 
                  cl_khr_global_int32_base_atomics 
                  cl_khr_global_int32_extended_atomics 
                  cl_khr_local_int32_base_atomics            
                  cl_khr_local_int32_extended_atomics 
                  cl_khr_byte_addressable_store 
                  cl_khr_image2d_from_buffer 
                  cl_khr_depth_images 
                  cl_APPLE_command_queue_priority 
                  cl_APPLE_command_queue_select_compute_units 
                  cl_khr_fp64
m_offset        : 0
m_chunk         : 731392

我知道发生了什么。我需要添加显式调用来跨队列使用事件。各国的文件:

若要将引用命令队列中排队的命令的事件对象用作事件对象,以供其他命令队列中排队的命令等待,应用程序必须调用clFlush或任何执行命令队列隐式刷新的阻塞命令,其中引用这些事件对象的命令已排队


关于
cl\u事件

的文档中不清楚您的GPU是否支持cl\u khr\u fp64扩展?我添加了有关设备扩展的信息。nc\u get\u var()在做什么?这是一个从netcdf文件加载数组的netcdf例程。我认为您缺少(auto&ev:last_events)clWaitForEvents(1,&ev)的
为gpu和cpu设备执行所有
clEnqueueNDRangeKernel()
之后。在将新数据设置为temp_-ar、temp_-ap和temp_-az之前,您需要确保内核完成了对以前数据的处理。顺便说一句,您在
temp\u ar
上呼叫了3次
clEnqueueUnmapMemObject()。
Platform Name   : Apple
Device Name     : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device Type     : CPU
Supports Double : True
Address Bits    : 64
Max Work Size   : 1
Extensions      : cl_APPLE_SetMemObjectDestructor  
                  cl_APPLE_ContextLoggingFunctions 
                  cl_APPLE_clut 
                  cl_APPLE_query_kernel_names 
                  cl_APPLE_gl_sharing 
                  cl_khr_gl_event 
                  cl_khr_fp64             
                  cl_khr_global_int32_base_atomics 
                  cl_khr_global_int32_extended_atomics 
                  cl_khr_local_int32_base_atomics  
                  cl_khr_local_int32_extended_atomics 
                  cl_khr_byte_addressable_store 
                  cl_khr_int64_base_atomics 
                  cl_khr_int64_extended_atomics 
                  cl_khr_3d_image_writes 
                  cl_khr_image2d_from_buffer 
                  cl_APPLE_fp64_basic_ops 
                  cl_APPLE_fixed_alpha_channel_orders 
                  cl_APPLE_biased_fixed_point_image_formats 
                  cl_APPLE_command_queue_priority
m_offset        : 731392
m_chunk         : 3080

Platform Name   : Apple
Device Name     : AMD Radeon R9 M370X Compute Engine
Device Type     : GPU
Supports Double : True
Address Bits    : 32
Max Work Size   : 256
Extensions      : cl_APPLE_SetMemObjectDestructor 
                  cl_APPLE_ContextLoggingFunctions 
                  cl_APPLE_clut 
                  cl_APPLE_query_kernel_names 
                  cl_APPLE_gl_sharing 
                  cl_khr_gl_event 
                  cl_khr_global_int32_base_atomics 
                  cl_khr_global_int32_extended_atomics 
                  cl_khr_local_int32_base_atomics            
                  cl_khr_local_int32_extended_atomics 
                  cl_khr_byte_addressable_store 
                  cl_khr_image2d_from_buffer 
                  cl_khr_depth_images 
                  cl_APPLE_command_queue_priority 
                  cl_APPLE_command_queue_select_compute_units 
                  cl_khr_fp64
m_offset        : 0
m_chunk         : 731392