Java 使用JOCL/OPENCL加速强度总和计算_Java_Opencl_Jocl

Java 使用JOCL/OPENCL加速强度总和计算

java opencl

Java 使用JOCL/OPENCL加速强度总和计算,java,opencl,jocl,Java,Opencl,Jocl,嗨，我是新来的JOCL（opencl）。我写这段代码是为了获取每张图像的强度之和。内核获取一个1D数组，该数组包含相互放置的所有图像的所有像素。一幅图像是300x300，因此每幅图像有90000像素。现在它比我按顺序做的时候慢我的代码 package PAR; /* * JOCL - Java bindings for OpenCL * * Copyright 2009 Marco Hutter - http://www.jocl.org/ */ import IMAGE_IO.I

嗨，我刚开始接触 JOCL（OpenCL）。我写这段代码是为了计算每张图像的强度（像素值）之和。内核接收一个一维数组，其中依次连续存放了所有图像的所有像素。每幅图像为 300x300，因此每幅图像有 90000 个像素。现在它比我的串行（顺序）实现还要慢。

我的代码

package PAR;

/*
 * JOCL - Java bindings for OpenCL
 * 
 * Copyright 2009 Marco Hutter - http://www.jocl.org/
 */
import IMAGE_IO.ImageReader;
import IMAGE_IO.Input_Folder;
import static org.jocl.CL.*;

import org.jocl.*;

/**
 * A small JOCL sample that sums the pixel intensities of a batch of images
 * with OpenCL.
 *
 * <p>The input is one flat {@code int} array in which the images are stored
 * back to back, {@link #PIXELS_PER_IMAGE} values per image. One work-item is
 * launched per image and writes that image's sum to the output buffer.
 */
public class IPPARA {

    /**
     * Number of pixels in a single image (300 x 300). Used both on the host
     * side and inside the kernel source so the two cannot drift apart.
     */
    private static final int PIXELS_PER_IMAGE = 90000;

    /**
     * The source code of the OpenCL program to execute.
     *
     * <p>Work-item {@code i} accumulates the values
     * {@code a[i * PIXELS_PER_IMAGE] .. a[(i + 1) * PIXELS_PER_IMAGE - 1]}
     * in a private variable and stores the total once in {@code c[i]}.
     */
    private static String programSource =
            "__kernel void "
            + "sampleKernel(__global const uint *a,"
            + "             __global uint *c)"
            + "{"
            + "      uint i = get_global_id(0);"
            + "      uint first = i * " + PIXELS_PER_IMAGE + "u;"
            + "      uint last = first + " + PIXELS_PER_IMAGE + "u;"
            + "      uint intensity_core = 0;"
            + "      for (uint j = first; j < last; j++) {"
            + "          intensity_core += a[j];"
            + "      }"
            + "      c[i] = intensity_core;"
            + "}";

    /**
     * The entry point of this sample. Reads the images, runs the kernel on a
     * CPU OpenCL device and prints the elapsed time in seconds (the time
     * includes context creation and program compilation, not image loading).
     *
     * @param args Not used
     */
    public static void main(String args[]) {
        ImageReader imagereader = new ImageReader();
        int srcArrayA[] = imagereader.readImages();

        long before = System.nanoTime();

        // Integer division: trailing pixels that do not form a complete
        // image are ignored, exactly as the kernel launch size implies.
        int imageCount = srcArrayA.length / PIXELS_PER_IMAGE;
        if (imageCount == 0) {
            // A zero-sized buffer / zero global work size is rejected by
            // OpenCL, so bail out before touching the runtime.
            System.out.println("No complete image in the input, nothing to do");
            return;
        }
        int dstArray[] = new int[imageCount];

        // Widen before multiplying so large inputs cannot overflow int.
        long srcBytes = (long) Sizeof.cl_uint * srcArrayA.length;
        long dstBytes = (long) Sizeof.cl_uint * imageCount;

        // Obtain the platform IDs and initialize the context properties
        System.out.println("Obtaining platform...");
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a CPU device. The original code retried
        // with the very same device type on failure, which could never
        // succeed, so the retry has been dropped.
        cl_context context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
        if (context == null) {
            System.out.println("Unable to create a context");
            return;
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        cl_command_queue commandQueue = null;
        cl_mem memObjects[] = new cl_mem[2];
        cl_program program = null;
        cl_kernel kernel = null;
        try {
            // Query the size of the device list associated with the context
            long numBytes[] = new long[1];
            clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

            // Obtain the cl_device_id for the first device
            int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
            cl_device_id devices[] = new cl_device_id[numDevices];
            clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
                    Pointer.to(devices), null);

            // Create a command-queue
            commandQueue = clCreateCommandQueue(context, devices[0], 0, null);

            // Allocate the memory objects for the input- and output data
            memObjects[0] = clCreateBuffer(context,
                    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    srcBytes, Pointer.to(srcArrayA), null);
            memObjects[1] = clCreateBuffer(context,
                    CL_MEM_READ_WRITE,
                    dstBytes, null, null);

            // Create and build the program from the source code
            program = clCreateProgramWithSource(context,
                    1, new String[]{programSource}, null, null);
            clBuildProgram(program, 0, null, null, null, null);

            // Create the kernel and set its arguments
            kernel = clCreateKernel(program, "sampleKernel", null);
            clSetKernelArg(kernel, 0,
                    Sizeof.cl_mem, Pointer.to(memObjects[0]));
            clSetKernelArg(kernel, 1,
                    Sizeof.cl_mem, Pointer.to(memObjects[1]));

            // One work-item per image. The local work size is left to the
            // OpenCL implementation (null) instead of being forced to 1,
            // which would put every work-item in its own work-group.
            long global_work_size[] = new long[]{imageCount};
            clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
                    global_work_size, null, 0, null, null);

            // Blocking read of the per-image sums; the size matches the
            // cl_uint buffer allocated above.
            clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE, 0,
                    dstBytes, Pointer.to(dstArray), 0, null, null);
        } finally {
            // Release whatever was created, even if a CL call threw.
            for (cl_mem memObject : memObjects) {
                if (memObject != null) {
                    clReleaseMemObject(memObject);
                }
            }
            if (kernel != null) {
                clReleaseKernel(kernel);
            }
            if (program != null) {
                clReleaseProgram(program);
            }
            if (commandQueue != null) {
                clReleaseCommandQueue(commandQueue);
            }
            clReleaseContext(context);
        }

        long after = System.nanoTime();

        System.out.println("Time: " + (after - before) / 1e9);

    }
}

代码>包面值； /* *JOCL-OpenCL的Java绑定 * *版权所有2009 Marco Hutter-http://www.jocl.org/ */ 导入图像\ IO.ImageReader；导入图像IO.Input文件夹；导入静态org.jocl.CL.*；导入org.jocl.*； /** *一个小样本。 */ 公共级伊帕拉{ /** *要执行的OpenCL程序的源代码 */ 私有静态字符串程序源= “_内核无效” +sampleKernel（uu全局uint*a +“uu全局uint*c）” + "{" +“\uuuu私有单元强度\u核心=0；” +“uint i=get_global_id（0）；” +“对于（uint j=i*90000；j<（i+1）*90000；j++）{” +“强度μcore+=a[j]；” + " }" +“c[i]=强度_核心；” + "}"; /** *此示例的入口点 * *@param参数未使用 */ 公共静态void main（字符串参数[]）{ long numBytes[]=新长[1]； ImageReader ImageReader=新的ImageReader（）； int srcArrayA[]=imagereader.readImages（）；整数大小[]=新整数[1]；大小[0]=srcArrayA.length；很久以前=System.nanoTime（）； int-dstArray[]=新的int[size[0]/90000]；指针srcA=Pointer.to（srcArrayA）；指针dst=指针指向（DSTARRY）； //获取平台ID并初始化上下文属性 System.out.println（“获取平台…”）； cl_平台_id平台[]=新cl_平台_id[1]； clGetPlatformIDs（platforms.length，platforms，null）； cl_context_properties contextProperties=新的cl_context_properties（）； addProperty（CL_CONTEXT_PLATFORM，platforms[0]）； //在GPU设备上创建OpenCL上下文 cl_context context=clCreateContextFromType( contextProperties，CL_设备_类型_CPU，null，null，null）； if（上下文==null）{ //如果无法创建GPU设备的上下文， //尝试为CPU设备创建一个。 context=clCreateContextFromType( contextProperties，CL_设备_类型_CPU，null，null，null）； if（上下文==null）{ System.out.println（“无法创建上下文”）；返回； } } //在此示例中启用异常并随后忽略错误检查 CL.setExceptionsEnabled（真）； //获取与上下文关联的GPU设备列表 clGetContextInfo（上下文，上下文设备，0，null，numBytes）； //获取第一个设备的cl_设备id int numDevices=（int）numBytes[0]/Sizeof.cl\u设备id； cl_device_id devices[]=新cl_device_id[numDevices]； clGetContextInfo（上下文，上下文设备，数量[0]，指向（设备）的指针，null）； //创建命令队列命令队列命令队列= clCreateCommandQueue（上下文，设备[0]，0，null）； //为输入和输出数据分配内存对象 cl_mem MemoObjects[]=新cl_mem[2]； MemoObjects[0]=clCreateBuffer（上下文， CL_MEM_只读| CL_MEM_副本(主机)PTR， Sizeof.cl_uint*srcArrayA.length，srcA，null）； MemoObjects[1]=clCreateBuffer（上下文， CL_MEM_READ_WRITE， Sizeof.cl_uint*（srcArrayA.length/90000），空，空； //从源代码创建程序 cl_program=clCreateProgramWithSource（上下文， 1，新字符串[]{programSource}，null，null）； //构建程序 clBuildProgram（程序，0，null，null，null，null）； //创建内核 cl_kernel 
kernel=clCreateKernel（程序，“sampleKernel”，null）； //设置内核的参数 clSetKernelArg（内核，0， Sizeof.cl_mem，指针指向（memObjects[0]）； clSetKernelArg（内核，1， Sizeof.cl_mem，指针指向（memObjects[1]）； //设置工作项维度长本地工作大小[]=新长[]{1}；长全局工作大小[]=新长[]{（srcArrayA.length/90000）*本地工作大小[0]}； //执行内核 clEnqueueNDRangeKernel（commandQueue，kernel，1，null，全局工作大小、本地工作大小、0、null、null）； //读取输出数据 clenqueueredbuffer（commandQueue，memObjects[1]，CL_TRUE，0，（srcArrayA.length/90000）*Sizeof.cl_float，dst，0，null，null）； //释放内核、程序和内存对象 clreleasemobject（memObjects[0]）； clreleasemobject（memObjects[1]）； clreleaseernel（内核）； clReleaseProgram（program）； clReleaseCommandQueue（commandQueue）； clReleaseContext（上下文）； long after=System.nanoTime（）； System.out.println（“时间：”+（之后-之前）/1e9）； } } 根据答案中的建议，通过CPU的并行代码几乎与顺序代码一样快。还有什么可以改进的吗

 for(uint j=i*90000; j < (i+1)*90000; j++){ "
        + "              c[i] += a[j];"

在您的程序中：

 // Obtain the cl_device_id for the first device
    int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);

这样获取到的第一个设备可能是 GPU。

每幅 300x300 的图像都应该由一个完整的工作组来处理。这有助于让 GPU 的各个核心满负荷运行，并且可以使用本地内存（local memory）。内核还应当能够同时处理与设备上计算单元（compute unit）数量一样多的图像。

下面的内核分三个步骤完成归约（reduction）：

将这些值读入一个

 // Obtain a device ID 
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
    cl_device_id device = devices[deviceIndex];
 //one of devices[] element must be your HD3000.Example: devices[0]->gpu devices[1]->cpu 
 //devices[2]-->HD3000

 // Obtain the cl_device_id for the first device
    int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);