OpenCL中NDRangeKernel的高效重复调用
我编写了下面的代码。其中有一个循环,在 Red 和 Black 两个内核之间交替迭代。每次迭代我都会调用 clEnqueueReadBuffer,我认为这样做效率不高。有没有其他办法可以高效地重复调用内核?谢谢。(标签:opencl)
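#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <oclUtils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define DATA_SIZE (1048576)
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
#define LIMIT 100
#define BIG_RANGE LIMIT*4*100
#define EPS 1e-2
#define SQ 1024
#define A(i,j) A[i*SQ+j]
using namespace std;
cl_platform_id platforms;
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X;
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
char *kernel_names[] = {"Red","Black"};
float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();
float *A,*oldX,*newX,*b;
int main(int argc, char** argv) {
unsigned int count = DATA_SIZE;
int i,j;
clock_t start,end;
float *XX,*XXnew;
A = (float*)malloc(sizeof(float)*count);
newX = (float*)malloc(sizeof(float)*SQ);
oldX = (float*)malloc(sizeof(float)*SQ);
b = (float*)malloc(sizeof(float)*SQ);
XX = (float*)malloc(sizeof(float)*SQ);
float h=1.0f/SQ;
float xx[SQ];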
for (i=0;i<SQ;i++){ // ……(此处代码在页面提取时被截断,完整代码见下文)
你现在的做法效率很低。缓冲区只需写入一次,然后就可以用同一批缓冲区作为参数,把任意多个内核排入队列。当然,如果需要计算范数,就必须把数据读回来。我的建议如下:
为范数创建额外的缓冲区;在每个内核的开头检查范数是什么(只需读取其值);如果它小于阈值,则立即返回
创建一个新内核,它将为您计算范数
完成以下任务:
- 写入缓冲区
- 内核:{{red, black}*10, updateNorm}*10
- 读取缓冲区
计算内核每运行 10 次,就更新一次范数。如果范数已经满足要求,已经入队的计算内核会立即返回。队列执行完毕后,读回缓冲区并在 CPU 上检查范数;如果范数仍未满足要求,就再次把同一批内核入队。
最坏情况下,你会浪费 9 次真正执行的内核运行和 90 次立即返回的内核运行。
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <ocl
Utils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Number of floats in the system matrix A (1048576 == SQ * SQ). */
#define DATA_SIZE (1048576)
/* Value 1e6; not referenced in the visible code. */
#define NANO_TO_MILI 1e6
/* Not referenced in the visible code. */
#define MAX_ITER 1
/* Upper bound on the number of Red/Black sweeps performed by main(). */
#define LIMIT 100
/* Start value stored in newX[0] by main(). Parenthesized so the macro
 * behaves as a single operand inside larger expressions
 * (e.g. x / BIG_RANGE). */
#define BIG_RANGE (LIMIT*4*100)
/* main() keeps iterating while norm(oldX, newX, SQ) exceeds this value. */
#define EPS 1e-2
/* Row/column count of the square matrix and length of the vectors. */
#define SQ 1024
/* Row-major element access into the flat array A. Arguments are
 * parenthesized so expressions such as A(r - 1, c) index correctly. */
#define A(i,j) A[(i)*SQ+(j)]
// Makes every std:: name visible unqualified for the rest of the file.
using namespace std;
// OpenCL handles shared by the helpers below. CreateQueue() assigns
// platforms, device, context and command.
cl_platform_id platforms;
cl_device_id device;
cl_context context;
// program1 is built from "Red.cl" in CreateKernel(); program2 is not
// referenced in the visible part of the file.
cl_program program1, program2;
cl_command_queue command;
// Most recent OpenCL status code; the helper functions write to it.
cl_int err;
// Kernel objects created from program1 in CreateKernel().
cl_kernel kernel_red, kernel_black;
// NOTE(review): main() declares its own local i, so this file-scope i is not
// used by any visible code.
cl_int i;
// Device buffers created in CreateBuffer() from the host arrays A, b and oldX.
cl_mem input_A,input_b,in_out_X;
// Not referenced in the visible part of the file.
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;
// Compiler options passed to clBuildProgram() in CreateKernel().
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
// Kernel function names: index 0 is used for kernel_red, index 1 for kernel_black.
char *kernel_names[] = {"Red","Black"};
// Forward declarations. norm(), swap() and Shutdown() have no definition in
// the visible part of the file.
float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();
// Host-side arrays allocated and filled in main().
float *A,*oldX,*newX,*b;
/**
 * Builds the SQ x SQ tridiagonal system on the host, uploads it to the
 * device and alternates the Red and Black kernels until
 * norm(oldX, newX, SQ) drops to EPS or LIMIT sweeps have run.
 *
 * Fixes relative to the previous version:
 *  - the boundary rows no longer write to A[-1] (row 0) or A[SQ*SQ]
 *    (row SQ-1); both were outside the allocation;
 *  - newX is fully initialized before norm() reads it;
 *  - the never-assigned pointer XXnew is no longer passed to free();
 *  - host allocations are checked;
 *  - the blocking read between the Red and Black kernels is gone: the
 *    command queue is created in-order, so Black already runs after Red and
 *    the data read there was overwritten by the read at the end of the sweep.
 *
 * @return 0 on success; the helpers call exit(1) on any OpenCL failure.
 */
int main(int argc, char** argv) {
    unsigned int count = DATA_SIZE;
    int i, j;
    const float h = 1.0f / SQ;

    A    = (float*)malloc(sizeof(float) * count);
    newX = (float*)malloc(sizeof(float) * SQ);
    oldX = (float*)malloc(sizeof(float) * SQ);
    b    = (float*)malloc(sizeof(float) * SQ);
    if (!A || !newX || !oldX || !b) {
        printf("Error: Failed to allocate host memory!\n");
        exit(1);
    }

    for (i = 0; i < SQ; i++) {
        const float x = 0.0f + (i + 1) * h;

        oldX[i] = 0.0f;
        newX[i] = 0.0f;

        /* Right-hand side; the first row carries the extra boundary term. */
        if (i != 0) {
            b[i] = -2.0f * x;
        } else {
            b[i] = -2.0f * x - 1.0f / (h * h) + 1.0f / (2.0f * h);
        }

        /* Zero the whole row first, then fill in the three diagonals. */
        for (j = 0; j < SQ; j++) {
            A(i,j) = 0.0f;
        }
        A(i,i) = -2.0f / (h * h);
        /* The off-diagonal entries do not exist for the last / first row;
         * the row is already zeroed, so nothing is written there. */
        if (i != SQ - 1) {
            A(i,i+1) = 1.0f / (h * h) + 1.0f / (2.0f * h);
        }
        if (i != 0) {
            A(i,i-1) = 1.0f / (h * h) - 1.0f / (2.0f * h);
        }
    }
    /* Large start value so the first norm() test does not stop the loop. */
    newX[0] = BIG_RANGE;

    int cnt = 0;
    CreateQueue();
    CreateKernel();
    CreateBuffer(count);
    /* Arguments are bound once; they stay valid for every later enqueue. */
    Kernel_Arg_Set(kernel_red, count);
    Kernel_Arg_Set(kernel_black, count);
    Enqueue_Write_Buffer(count);

    while (norm(oldX, newX, SQ) > EPS && cnt < LIMIT) {
        Create_Work_Group(kernel_red, count);
        Create_Work_Group(kernel_black, count);
        cnt++;
        /* Single blocking read per sweep: refreshes oldX for the norm test. */
        Enqueue_Read_Buffer(count);
    }

    clFinish(command);
    Shutdown();

    /* NOTE(review): A and b are not freed here, as before; confirm whether
     * Shutdown() releases them before adding free() calls. */
    free(oldX);
    free(newX);
    return 0;
}
void CreateQueue(){
err = clGetPlatformIDs(1, &platforms, NULL);
if(err<0){
perror("no platform");getchar();exit(1);}
err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device,NULL);
if(err<0){
perror("no device");getchar();exit(1);}
context = clCreateContext(NULL, 1, &device,NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");exit(1);}
command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if (!command)
{
printf("Error: Failed to create a command commands!\n");
exit(1);
}
clEnqueueBarrier(command);
}
void CreateBuffer(unsigned int count){
input_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * count, A, NULL);
in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, oldX, NULL);
input_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, b, NULL);
if (!input_A || !input_b || !in_out_X)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
}
void CreateKernel(){
FILE *fp;
size_t program_size;
string kernel_src;
fp = fopen("Red.cl", "r");
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
kernel_src.resize(program_size + 1);
fseek(fp, 0, SEEK_SET);
fread(&kernel_src[0], program_size, 1, fp);
fclose(fp);
kernel_src[program_size] = '\0';
const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);
if (!program1)
{
printf("clCreateProgramWithSource failed\n");
exit(1);
}
err =clBuildProgram(program1, 1, &device, options, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2*2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
kernel_red = clCreateKernel(program1, kernel_names[0], &err);
if (!kernel_red || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
kernel_black = clCreateKernel(program1, kernel_names[1], &err);
if (!kernel_black || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
}
/**
 * Enqueues one 2-D launch of the given kernel on the shared command queue:
 * SQ x SQ work-items in work-groups of 32 x 32.
 *
 * @param kernel kernel object to launch.
 * @param count  not used by this function.
 *
 * Calls exit(1) if the enqueue fails.
 */
void Create_Work_Group(cl_kernel kernel, unsigned int count){
    const cl_uint work_dims = 2;
    /* Only the first work_dims entries are passed on as dimensions. */
    size_t global_size[] = {SQ, SQ, 0};
    size_t group_size[]  = {32, 32, 0};

    err = clEnqueueNDRangeKernel(command, kernel, work_dims,
                                 NULL, global_size, group_size,
                                 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to execute kernel!\n");
        exit(1);
    }
}
/**
 * Binds the three device buffers to a kernel: argument 0 is input_A,
 * argument 1 is in_out_X and argument 2 is input_b.
 *
 * @param kernel kernel whose arguments are set.
 * @param count  not used by this function.
 *
 * The status codes of the three calls are OR-ed together into err; exit(1)
 * is called if the combined value is not CL_SUCCESS.
 */
void Kernel_Arg_Set(cl_kernel kernel, unsigned int count){
    /* Position in this table is the kernel argument index. */
    cl_mem *const bound_buffers[] = { &input_A, &in_out_X, &input_b };
    const cl_uint arg_total = sizeof(bound_buffers) / sizeof(bound_buffers[0]);

    err = 0;
    for (cl_uint slot = 0; slot < arg_total; ++slot) {
        err |= clSetKernelArg(kernel, slot, sizeof(cl_mem), bound_buffers[slot]);
    }

    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
}
/**
 * Copies SQ floats from the device buffer in_out_X into the host array oldX.
 * The read is blocking (CL_TRUE), so oldX is filled when the call returns.
 *
 * @param count not used by this function.
 *
 * Calls exit(1) if the read fails.
 */
void Enqueue_Read_Buffer(unsigned int count){
    const size_t start_offset = 0;
    const size_t vector_bytes = sizeof(float) * SQ;

    err = clEnqueueReadBuffer(command, in_out_X, CL_TRUE,
                              start_offset, vector_bytes, oldX,
                              0, NULL, NULL);
    if (err == CL_SUCCESS) {
        return;
    }

    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
}
void Enqueue_Write_Buffer(unsigned int count){
err = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count, A, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ , b, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ ,oldX, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
}