OpenCL中NDRangeKernel的高效重复调用

OpenCL中NDRangeKernel的高效重复调用,opencl,Opencl,我已经编写了以下代码。我有一个循环,它在两个红色和黑色内核之间迭代。在每次迭代中,我都调用我认为效率不高的clenqueueredbuffer。有没有其他方法可以有效地重复调用内核? 谢谢 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #ifdef MAC #包括 #否则 #包括 #恩迪夫 #定义数据大小(1048576) #定义纳米到微米1e6 #定义最大值1 #定义限制100 #定义大范围限制*4*100 #定义EPS 1e-2 #定义SQ1024 #定义A(i,j)A[i*SQ+

我编写了下面的代码。其中有一个循环,在 Red 和 Black 两个内核之间交替迭代。每次迭代我都会调用 clEnqueueReadBuffer,我认为这样效率不高。有没有其他办法可以高效地重复调用内核? 谢谢

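#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <oclUtils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define DATA_SIZE (1048576)
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
#define LIMIT 100
#define BIG_RANGE LIMIT*4*100
#define EPS 1e-2
#define SQ 1024
#define A(i,j) A[i*SQ+j]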
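using namespace std;
cl_platform_id platforms;
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X;
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
char *kernel_names[] = {"Red","Black"};
float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();
float *A,*oldX,*newX,*b;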
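int main(int argc, char** argv) {
unsigned int count = DATA_SIZE;
int i,j;
clock_t start,end;
float *XX,*XXnew;
A = (float*)malloc(sizeof(float)*count);
newX = (float*)malloc(sizeof(float)*SQ);
oldX = (float*)malloc(sizeof(float)*SQ);
b = (float*)malloc(sizeof(float)*SQ);
XX = (float*)malloc(sizeof(float)*SQ);
float h=1.0f/SQ;
float xx[SQ];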

for (i=0;i<SQ;i++){ /* …… 此处代码在抓取时被截断,完整代码见下文 */

您现在的做法效率很低。您可以只写入一次缓冲区,然后以同一批缓冲区作为参数,将任意多个内核排入队列。当然,如果需要计算范数,就必须把数据读回主机。我建议这样做:

  • 为范数创建额外的缓冲区;在每个内核的开头检查范数是什么(只需读取其值);如果它小于阈值,则立即返回

  • 创建一个新内核,它将为您计算范数

  • 完成以下任务:

    • 写入缓冲区
    • 内核:{{red, black}*10, updateNorm}*10
    • 读取缓冲区
    计算内核会先运行 10 次,然后更新一次范数。如果范数已经满足要求,其余已排队的计算内核会立即返回。队列执行完毕后,读回缓冲区并在 CPU 上检查范数;如果范数仍不满足要求,则再次将同一批内核排入队列。

    在最坏的情况下,您只会浪费 9 次真正执行的内核运行和 90 次立即返回的内核运行。

  • #include <stdio.h>
    #include <stdlib.h> 
    #include <string>
    #include <iostream>
    #include <cmath>
    #include <ctime>
    #include <oclUtils.h>
    
    #ifdef MAC
    #include <OpenCL/cl.h>
    #else
    #include <CL/cl.h>
    #endif
    
    
    /* Number of floats in the matrix A; equals SQ * SQ (1024 * 1024). */
    #define DATA_SIZE (1048576)
    #define NANO_TO_MILI 1e6
    #define MAX_ITER 1
    /* Upper bound on the number of Red/Black sweeps performed by main(). */
    #define LIMIT 100
    /* Parenthesised so the product survives being embedded in a larger expression. */
    #define BIG_RANGE (LIMIT*4*100)
    
    /* Threshold that main() compares against the result of norm(). */
    #define EPS 1e-2
    /* Side length of the square matrix and length of the x / b vectors. */
    #define SQ 1024
    
    /*
     * Row-major element access into the global matrix A.
     * Both arguments are parenthesised so expressions such as A(r, c+1)
     * or A(r+1, c) index the intended element.
     */
    #define A(i,j) A[(i)*SQ+(j)]
    
    using namespace std;
    
    /* OpenCL handles shared by all helper functions in this file. */
    cl_platform_id platforms;
    cl_device_id device;
    cl_context context;
    cl_program program1, program2;
    cl_command_queue command;
    /* Status of the most recent OpenCL call made by the helpers. */
    cl_int err;
    cl_kernel kernel_red, kernel_black;
    /* File-scope index; main() declares its own local i that shadows this one. */
    cl_int i;
    /* Device buffers: matrix A, right-hand side b, and the solution vector x. */
    cl_mem input_A,input_b,in_out_X;
    cl_event timing_event;
    cl_ulong time_start, time_end,total_time = 0;
    
    
    /* Build options passed to clBuildProgram() in CreateKernel(). */
    const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
    /* Kernel entry points looked up in the built program; string literals are const. */
    const char *kernel_names[] = {"Red","Black"};
    
    float norm (float*,float*,int);
    void swap(float **in, float **out); 
    
    void CreateQueue(void);
    void CreateKernel(void);
    void CreateBuffer(unsigned int);
    void Enqueue_Write_Buffer(unsigned int);
    void Kernel_Arg_Set(cl_kernel, unsigned int);
    void Enqueue_Read_Buffer(unsigned int);
    void Create_Work_Group(cl_kernel, unsigned int);
    void Shutdown();
    
    /* Host-side copies of the matrix, the two iterate vectors, and the right-hand side. */
    float *A,*oldX,*newX,*b;
    
    /*
     * Builds a tridiagonal SQ x SQ system on the host, uploads it to the device,
     * then alternates the Red and Black kernels until norm(oldX, newX, SQ) is no
     * longer above EPS or LIMIT sweeps have been performed.
     *
     * Returns 0 on success and 1 if host memory cannot be allocated; the helper
     * functions terminate the process themselves on OpenCL failures.
     */
    int main(int argc, char** argv) {
        (void)argc;
        (void)argv;
    
        const unsigned int count = DATA_SIZE;
        const float h = 1.0f/SQ;
    
        A    = (float*)malloc(sizeof(float)*count);
        newX = (float*)malloc(sizeof(float)*SQ);
        oldX = (float*)malloc(sizeof(float)*SQ);
        b    = (float*)malloc(sizeof(float)*SQ);
        if (!A || !newX || !oldX || !b) {
            fprintf(stderr, "Error: Failed to allocate host memory!\n");
            free(A);
            free(newX);
            free(oldX);
            free(b);
            return 1;
        }
    
        for (int i = 0; i < SQ; i++) {
            const float x = 0.0f + (i+1)*h;
    
            oldX[i] = 0.0f;
            /* norm() receives newX with length SQ, so every element must be defined. */
            newX[i] = 0.0f;
    
            /* The first row additionally carries the boundary contribution. */
            if (i != 0) {
                b[i] = -2.0f*x;
            } else {
                b[i] = -2.0f*x - 1.0f/(h*h) + 1.0f/(2.0f*h);
            }
    
            for (int j = 0; j < SQ; j++) {
                A(i,j) = 0.0f;
            }
            A(i,i) = -2.0f/(h*h);
    
            /*
             * Off-diagonals exist only inside the matrix: the first row has no
             * sub-diagonal entry and the last row has no super-diagonal entry.
             * Writing them anyway would touch memory outside the allocation.
             */
            if (i != SQ-1) {
                A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h);
            }
            if (i != 0) {
                A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h);
            }
        }
    
        /* Large sentinel so the first norm() evaluation is above EPS. */
        newX[0] = BIG_RANGE;
    
        CreateQueue();
        CreateKernel();
        CreateBuffer(count);
    
        Kernel_Arg_Set(kernel_red  ,count);
        Kernel_Arg_Set(kernel_black,count);
    
        Enqueue_Write_Buffer(count);
    
        int cnt = 0;
        while (norm(oldX,newX,SQ) > EPS && cnt < LIMIT) {
            /*
             * The command queue is in-order, so Black starts only after Red has
             * finished. One blocking read per sweep is enough: a read between
             * the two kernels would be overwritten before norm() looks at oldX.
             */
            Create_Work_Group(kernel_red, count);
            Create_Work_Group(kernel_black, count);
            cnt++;
    
            Enqueue_Read_Buffer(count);
        }
    
        /* Also guarantees the non-blocking uploads are done before the host arrays are freed. */
        clFinish(command);
    
        Shutdown();
    
        free(A);
        free(b);
        free(oldX);
        free(newX);
        return 0;
    }
    
    
    
    
    /*
     * Picks the first OpenCL platform and its first GPU device, then creates the
     * global context and a profiling-enabled command queue for that device.
     *
     * On any failure a diagnostic including the OpenCL status code is printed and
     * the process exits with status 1. OpenCL calls do not set errno, so the
     * status code is reported directly instead of going through perror().
     */
    void CreateQueue(){
        err = clGetPlatformIDs(1, &platforms, NULL);
        if (err < 0) {
            fprintf(stderr, "no platform (OpenCL error %d)\n", err);
            getchar();
            exit(1);
        }
    
        err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
        if (err < 0) {
            fprintf(stderr, "no device (OpenCL error %d)\n", err);
            getchar();
            exit(1);
        }
    
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
        if (err < 0) {
            fprintf(stderr, "Couldn't create a context (OpenCL error %d)\n", err);
            exit(1);
        }
    
        command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
        if (!command || err < 0) {
            printf("Error: Failed to create a command commands!\n");
            exit(1);
        }
    
        /*
         * No barrier is enqueued here: the queue is empty and in-order, so a
         * barrier would have nothing to order.
         */
    }
    
    /*
     * Allocates the three device buffers and initialises each one from its host
     * array: the matrix (count floats, read-only), the solution vector (SQ floats,
     * read-write) and the right-hand side (SQ floats, read-only).
     * Exits the process if any buffer handle comes back null.
     */
    void CreateBuffer(unsigned int count){
        const size_t matrix_bytes = sizeof(float) * count;
        const size_t vector_bytes = sizeof(float) * SQ;
    
        input_A  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  matrix_bytes, A, NULL);
        in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR,
                                  vector_bytes, oldX, NULL);
        input_b  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  vector_bytes, b, NULL);
    
        if (input_A && input_b && in_out_X) {
            return;
        }
    
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
    
    
    void CreateKernel(){
    
        FILE *fp;
        size_t program_size;
        string kernel_src;
        fp = fopen("Red.cl", "r");
        fseek(fp, 0, SEEK_END);
        program_size = ftell(fp);
        kernel_src.resize(program_size + 1);
        fseek(fp, 0, SEEK_SET);
        fread(&kernel_src[0], program_size, 1, fp);
        fclose(fp);
        kernel_src[program_size] = '\0';
    
    
    const char *src = &kernel_src[0];
    program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);
    
    if (!program1)
       {
          printf("clCreateProgramWithSource failed\n");
          exit(1);
       }
    
    err =clBuildProgram(program1, 1, &device, options, NULL, NULL);
    
    if (err != CL_SUCCESS)
        {
            size_t len;
            char buffer[2*2048];
    
            printf("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
            printf("%s\n", buffer);
            exit(1);
        }
    
    
    
    kernel_red   = clCreateKernel(program1, kernel_names[0], &err);
    
    if (!kernel_red || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            exit(1);
        }
    
    
    kernel_black   = clCreateKernel(program1, kernel_names[1], &err);
    
    if (!kernel_black || err != CL_SUCCESS)
        {
            printf("Error: Failed to create compute kernel!\n");
            exit(1);
        }
    
    }
    
    /*
     * Enqueues one 2-D launch of `kernel` on the global command queue with an
     * SQ x SQ global range split into 32 x 32 work-groups. `count` is not used.
     * Exits the process if the enqueue call reports a non-zero status.
     */
    void Create_Work_Group(cl_kernel kernel, unsigned int count){
    
        /* Only the first two entries are read because work_dim is 2. */
        size_t global_range[3] = {SQ, SQ, 0};
        size_t group_range[3]  = {32, 32, 0};
    
        err = clEnqueueNDRangeKernel(command, kernel, 2, NULL,
                                     global_range, group_range,
                                     0, NULL, NULL);
        if (!err) {
            return;
        }
    
        printf("Error: Failed to execute kernel!\n");
        exit(1);
    }
    
    /*
     * Binds the three device buffers to `kernel`: argument 0 is the matrix,
     * argument 1 the solution vector, argument 2 the right-hand side.
     * `count` is not used.
     *
     * Each call is checked on its own so that the status printed on failure is a
     * genuine OpenCL error code rather than a bitwise mix of several codes.
     * Exits the process with status 1 on the first failure.
     */
    void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
        cl_mem *buffers[] = { &input_A, &in_out_X, &input_b };
        const unsigned int n_buffers = sizeof(buffers) / sizeof(buffers[0]);
    
        for (unsigned int idx = 0; idx < n_buffers; idx++) {
            err = clSetKernelArg(kernel, idx, sizeof(cl_mem), buffers[idx]);
            if (err != CL_SUCCESS)
            {
                printf("Error: Failed to set kernel arguments! %d\n", err);
                exit(1);
            }
        }
    }
    
    /*
     * Performs a blocking read of SQ floats from the device solution buffer into
     * the host array oldX. `count` is not used.
     * Exits the process, printing the status code, if the read fails.
     */
    void Enqueue_Read_Buffer(unsigned int count){
        const size_t vector_bytes = sizeof(float) * SQ;
    
        err = clEnqueueReadBuffer(command, in_out_X, CL_TRUE, 0,
                                  vector_bytes, oldX,
                                  0, NULL, NULL);
        if (err == CL_SUCCESS) {
            return;
        }
    
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    /*
     * Enqueues non-blocking uploads of the host matrix (count floats), the
     * right-hand side (SQ floats) and the current solution vector (SQ floats)
     * to their device buffers. All three uploads are enqueued before the
     * combined status is examined; the process exits if any of them failed.
     */
    void Enqueue_Write_Buffer(unsigned int count){
        const size_t matrix_bytes = sizeof(float) * count;
        const size_t vector_bytes = sizeof(float) * SQ;
    
        const int status_A = clEnqueueWriteBuffer(command, input_A, CL_FALSE, 0,
                                                  matrix_bytes, A, 0, NULL, NULL);
        const int status_b = clEnqueueWriteBuffer(command, input_b, CL_FALSE, 0,
                                                  vector_bytes, b, 0, NULL, NULL);
        const int status_X = clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0,
                                                  vector_bytes, oldX, 0, NULL, NULL);
    
        /* Same combined value the step-by-step OR accumulation produces. */
        err = status_A | status_b | status_X;
        if (err == CL_SUCCESS) {
            return;
        }
    
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }