Java 使用JOCL/OPENCL加速强度总和计算
嗨,我是JOCL(OpenCL)的新手。我写这段代码是为了计算每张图像的强度之和。内核接收一个一维数组,其中依次存放了所有图像的所有像素。每幅图像为300x300,因此每幅图像有90000个像素。目前它比我按顺序计算时还要慢。(标签:java, opencl, jocl)我的代码如下:
package PAR;
/*
* JOCL - Java bindings for OpenCL
*
* Copyright 2009 Marco Hutter - http://www.jocl.org/
*/
import IMAGE_IO.ImageReader;
import IMAGE_IO.Input_Folder;
import static org.jocl.CL.*;
import org.jocl.*;
/**
 * Computes the sum of pixel intensities of every image in a batch with
 * OpenCL (through the JOCL bindings).
 *
 * <p>The input is one flat {@code int} array in which the pixels of all
 * images are stored back to back; each image contributes
 * {@link #PIXELS_PER_IMAGE} consecutive entries. One OpenCL work-item sums
 * one image and writes the result to the output buffer at the image's index.
 */
public class IPPARA {

    /** Number of pixels in a single image (300 x 300). */
    private static final int PIXELS_PER_IMAGE = 300 * 300;

    /**
     * The source code of the OpenCL program to execute. Work-item {@code i}
     * accumulates the pixels of image {@code i} in a private variable and
     * stores the total in {@code c[i]}.
     */
    private static final String programSource =
        "__kernel void "
        + "sampleKernel(__global uint *a,"
        + " __global uint *c)"
        + "{"
        + "__private uint intensity_core=0;"
        + " uint i = get_global_id(0);"
        + " for(uint j=i*" + PIXELS_PER_IMAGE
        + "; j < (i+1)*" + PIXELS_PER_IMAGE + "; j++){ "
        + " intensity_core += a[j];"
        + " }"
        + "c[i]=intensity_core;"
        + "}";

    /**
     * The entry point of this sample: reads the images, runs the kernel on a
     * CPU OpenCL device and prints the elapsed time in seconds.
     *
     * @param args Not used
     */
    public static void main(String[] args) {
        ImageReader imagereader = new ImageReader();
        int[] srcArrayA = imagereader.readImages();

        // Only complete images are processed; trailing pixels that do not
        // fill a whole image are ignored, as the kernel never indexes them.
        int numImages = srcArrayA.length / PIXELS_PER_IMAGE;
        if (numImages == 0) {
            // A zero-sized buffer or a zero global work size is rejected by
            // OpenCL, so there is nothing useful to launch.
            System.out.println("No complete image to process");
            return;
        }

        // NOTE: the measured interval covers platform setup and the kernel
        // compilation as well, not only the kernel execution.
        long before = System.nanoTime();
        int[] dstArray = new int[numImages];
        Pointer srcA = Pointer.to(srcArrayA);
        Pointer dst = Pointer.to(dstArray);

        // Obtain the platform IDs and initialize the context properties
        System.out.println("Obtaining platform...");
        cl_platform_id[] platforms = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a CPU device. This happens before
        // exceptions are enabled so that a failure can be detected through
        // the null check below.
        cl_context context = clCreateContextFromType(
            contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
        if (context == null) {
            System.out.println("Unable to create a context");
            return;
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        cl_command_queue commandQueue = null;
        cl_mem inputBuffer = null;
        cl_mem outputBuffer = null;
        cl_program program = null;
        cl_kernel kernel = null;
        try {
            // Query the size of the device list associated with the context
            long[] numBytes = new long[1];
            clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

            // Obtain the cl_device_id for the first device
            int numDevices = (int) (numBytes[0] / Sizeof.cl_device_id);
            cl_device_id[] devices = new cl_device_id[numDevices];
            clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
                Pointer.to(devices), null);

            // Create a command-queue
            commandQueue = clCreateCommandQueue(context, devices[0], 0, null);

            // Allocate the memory objects for the input- and output data
            inputBuffer = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                (long) Sizeof.cl_uint * srcArrayA.length, srcA, null);
            outputBuffer = clCreateBuffer(context,
                CL_MEM_READ_WRITE,
                (long) Sizeof.cl_uint * numImages, null, null);

            // Create and build the program from the source code
            program = clCreateProgramWithSource(context,
                1, new String[]{programSource}, null, null);
            clBuildProgram(program, 0, null, null, null, null);

            // Create the kernel and set its arguments
            kernel = clCreateKernel(program, "sampleKernel", null);
            clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(inputBuffer));
            clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputBuffer));

            // One work-item per image. The local work size is left to the
            // OpenCL implementation (null) instead of forcing work-groups of
            // a single item, which the kernel does not require.
            long[] globalWorkSize = new long[]{numImages};
            clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
                globalWorkSize, null, 0, null, null);

            // Read the output data (blocking); the buffer holds uint values
            clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0,
                (long) numImages * Sizeof.cl_uint, dst, 0, null, null);
        } finally {
            // Release memory objects, kernel, program, queue and context even
            // when one of the calls above threw.
            if (inputBuffer != null) {
                clReleaseMemObject(inputBuffer);
            }
            if (outputBuffer != null) {
                clReleaseMemObject(outputBuffer);
            }
            if (kernel != null) {
                clReleaseKernel(kernel);
            }
            if (program != null) {
                clReleaseProgram(program);
            }
            if (commandQueue != null) {
                clReleaseCommandQueue(commandQueue);
            }
            clReleaseContext(context);
        }

        long after = System.nanoTime();
        System.out.println("Time: " + (after - before) / 1e9);
    }
}
代码>包面值;
/*
*JOCL-OpenCL的Java绑定
*
*版权所有2009 Marco Hutter-http://www.jocl.org/
*/
导入图像\ IO.ImageReader;
导入图像IO.Input文件夹;
导入静态org.jocl.CL.*;
导入org.jocl.*;
/**
*一个小样本。
*/
公共级伊帕拉{
/**
*要执行的OpenCL程序的源代码
*/
私有静态字符串程序源=
“_内核无效”
+sampleKernel(uu全局uint*a
+“uu全局uint*c)”
+ "{"
+“\uuuu私有单元强度\u核心=0;”
+“uint i=get_global_id(0);”
+“对于(uint j=i*90000;j<(i+1)*90000;j++){”
+“强度μcore+=a[j];”
+ " }"
+“c[i]=强度_核心;”
+ "}";
/**
*此示例的入口点
*
*@param参数未使用
*/
公共静态void main(字符串参数[]){
long numBytes[]=新长[1];
ImageReader ImageReader=新的ImageReader();
int srcArrayA[]=imagereader.readImages();
整数大小[]=新整数[1];
大小[0]=srcArrayA.length;
很久以前=System.nanoTime();
int-dstArray[]=新的int[size[0]/90000];
指针srcA=Pointer.to(srcArrayA);
指针dst=指针指向(DSTARRY);
//获取平台ID并初始化上下文属性
System.out.println(“获取平台…”);
cl_平台_id平台[]=新cl_平台_id[1];
clGetPlatformIDs(platforms.length,platforms,null);
cl_context_properties contextProperties=新的cl_context_properties();
addProperty(CL_CONTEXT_PLATFORM,platforms[0]);
//在GPU设备上创建OpenCL上下文
cl_context context=clCreateContextFromType(
contextProperties,CL_设备_类型_CPU,null,null,null);
if(上下文==null){
//如果无法创建GPU设备的上下文,
//尝试为CPU设备创建一个。
context=clCreateContextFromType(
contextProperties,CL_设备_类型_CPU,null,null,null);
if(上下文==null){
System.out.println(“无法创建上下文”);
返回;
}
}
//在此示例中启用异常并随后忽略错误检查
CL.setExceptionsEnabled(真);
//获取与上下文关联的GPU设备列表
clGetContextInfo(上下文,上下文设备,0,null,numBytes);
//获取第一个设备的cl_设备id
int numDevices=(int)numBytes[0]/Sizeof.cl\u设备id;
cl_device_id devices[]=新cl_device_id[numDevices];
clGetContextInfo(上下文,上下文设备,数量[0],
指向(设备)的指针,null);
//创建命令队列
命令队列命令队列=
clCreateCommandQueue(上下文,设备[0],0,null);
//为输入和输出数据分配内存对象
cl_mem MemoObjects[]=新cl_mem[2];
MemoObjects[0]=clCreateBuffer(上下文,
CL_MEM_只读| CL_MEM_副本(主机)PTR,
Sizeof.cl_uint*srcArrayA.length,srcA,null);
MemoObjects[1]=clCreateBuffer(上下文,
CL_MEM_READ_WRITE,
Sizeof.cl_uint*(srcArrayA.length/90000),空,空;
//从源代码创建程序
cl_program=clCreateProgramWithSource(上下文,
1,新字符串[]{programSource},null,null);
//构建程序
clBuildProgram(程序,0,null,null,null,null);
//创建内核
cl_kernel kernel=clCreateKernel(程序,“sampleKernel”,null);
//设置内核的参数
clSetKernelArg(内核,0,
Sizeof.cl_mem,指针指向(memObjects[0]);
clSetKernelArg(内核,1,
Sizeof.cl_mem,指针指向(memObjects[1]);
//设置工作项维度
长本地工作大小[]=新长[]{1};
长全局工作大小[]=新长[]{(srcArrayA.length/90000)*本地工作大小[0]};
//执行内核
clEnqueueNDRangeKernel(commandQueue,kernel,1,null,
全局工作大小、本地工作大小、0、null、null);
//读取输出数据
clenqueueredbuffer(commandQueue,memObjects[1],CL_TRUE,0,
(srcArrayA.length/90000)*Sizeof.cl_float,dst,0,null,null);
//释放内核、程序和内存对象
clreleasemobject(memObjects[0]);
clreleasemobject(memObjects[1]);
clreleaseernel(内核);
clReleaseProgram(program);
clReleaseCommandQueue(commandQueue);
clReleaseContext(上下文);
long after=System.nanoTime();
System.out.println(“时间:”+(之后-之前)/1e9);
}
}
根据答案中的建议修改后,在CPU上运行的并行代码几乎与顺序代码一样快。还有什么可以改进的吗?
for(uint j=i*90000; j < (i+1)*90000; j++){ "
+ " c[i] += a[j];"
在您的程序中:
// Obtain the cl_device_id for the first device
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
Pointer.to(devices), null);
这样获取到的第一个设备可能是GPU。

每幅300x300的图像应该使用一整个工作组。这将有助于使GPU的计算核心饱和,并允许您使用本地内存。内核还应该能够同时处理与设备上计算单元数量一样多的图像。下面的内核通过三个步骤来完成归约:
// Obtain a device ID
cl_device_id devices[] = new cl_device_id[numDevices];
clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
cl_device_id device = devices[deviceIndex];
//one of devices[] element must be your HD3000.Example: devices[0]->gpu devices[1]->cpu
//devices[2]-->HD3000
// Obtain the cl_device_id for the first device
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
Pointer.to(devices), null);