Java jocl神经网络_Java_Neural Network_Gpgpu_Jocl

Java jocl神经网络

java neural-network

Java jocl神经网络,java,neural-network,gpgpu,jocl,Java,Neural Network,Gpgpu,Jocl,我用java编写了一个神经网络，在gpu上进行性能计算似乎是个好主意。我的问题是它太慢了。。。我曾经用jocl这样做。我现在不知道它是不是内核这里有一些代码： private static String programSource = "__kernel void " + "sampleKernel(__constant float *input," + " __global float *weights," + "

我用 Java 编写了一个神经网络，把计算放到 GPU 上执行以提升性能似乎是个好主意。我的问题是它太慢了……我是用 JOCL 来实现的。我不确定问题是否出在内核上。下面是一些代码：

/**
 * OpenCL C source of the per-layer kernel {@code sampleKernel}.
 *
 * <p>Work-item {@code gid} computes one output value:
 * {@code dst[gid] = tanh(sum over i in [0, length[0]) of weights[gid * length[0] + i] * input[i])}.
 * The weight buffer is therefore expected to hold one contiguous row of
 * {@code length[0]} values per output element.
 *
 * <p>The accumulation reads {@code weights}, the name of the kernel parameter;
 * an identifier that is not declared in the kernel would make the program fail to build.
 * The sum is kept in a private variable and written to {@code dst} exactly once.
 */
private static String programSource = "__kernel void "
        + "sampleKernel(__constant float *input,"
        + "             __global float *weights,"
        + "             __constant int *length,"
        + "             __global float *dst)" + "               {"
        + "    __private int gid = get_global_id(0);"
        + "    __private int l = length[0];"
        + "    __private int pos = gid * l;"
        + "    __private float tmp = 0;"
        + "    for(int i = 0; i < l; i++){"
        + "         tmp += weights[pos+i]*input[i];"
        + "    }"
        + "   dst[gid] = tanh(tmp);" + "}";

题外话：你确定要把内核程序写成字符串常量吗？如果把它放在资源文件中，维护起来会简单得多。

/**
 * Creates the device-side buffers for a layered network from host weights.
 *
 * <p>Layer sizes are derived from {@code gew}: the size of layer {@code i} is
 * {@code gew[i][0].length}, and the size of the last layer is
 * {@code gew[gew.length - 1].length}. For every transition {@code i} three
 * buffers are created: the flattened weights (element {@code gew[i][j][k]} is
 * stored at index {@code j * layersize[i] + k}), a single int holding
 * {@code layersize[i]}, and an uninitialised float buffer of
 * {@code layersize[i + 1]} elements for that layer's results.
 *
 * @param gew          weights indexed as {@code [transition][target][source]}
 * @param context      OpenCL context used to create all buffers
 * @param commandQueue command queue stored for later use
 * @throws Exception if {@code context} or {@code commandQueue} is {@code null}
 */
public OpenClNetz(float[][][] gew, cl_context context,
        cl_command_queue commandQueue) throws Exception {
    if (context == null) {
        throw new Exception("context == null, Konstruktor schlug fehl");
    }
    if (commandQueue == null) {
        throw new Exception("commandQueue == null, Konstruktor schlug fehl");
    }

    // One more layer than there are weight matrices.
    final int layerCount = gew.length + 1;
    this.layersize = new int[layerCount];
    for (int layer = 0; layer < layerCount - 1; layer++) {
        this.layersize[layer] = gew[layer][0].length;
    }
    this.layersize[layerCount - 1] = gew[gew.length - 1].length;

    this.context = context;
    builded = false;
    this.commandQueue = commandQueue;
    this.output = new float[layersize[layerCount - 1]];

    final int transitions = layerCount - 1;
    gewichte = new cl_mem[transitions];
    tmp = new cl_mem[transitions];
    lengths = new cl_mem[transitions];
    input = new cl_mem();

    for (int layer = 0; layer < gewichte.length; layer++) {
        final int fanIn = layersize[layer];
        final int fanOut = layersize[layer + 1];

        // Flatten the 2-D weight matrix row by row into one contiguous array.
        final float[] flat = new float[fanIn * fanOut];
        for (int target = 0; target < fanOut; target++) {
            final int rowStart = target * fanIn;
            for (int source = 0; source < fanIn; source++) {
                flat[rowStart + source] = gew[layer][target][source];
            }
        }
        final int[] rowLength = { fanIn };

        gewichte[layer] = clCreateBuffer(context,
                CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                Sizeof.cl_float * flat.length, Pointer.to(flat), null);
        lengths[layer] = clCreateBuffer(context,
                CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                Sizeof.cl_int, Pointer.to(rowLength), null);
        tmp[layer] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                Sizeof.cl_float * fanOut, null, null);
    }
}


/** True once {@code input} refers to a buffer that {@link #setInput(float[])} created. */
private boolean inputBufferCreated = false;

/**
 * Uploads a new input vector and binds it as argument 0 of the first kernel.
 *
 * <p>A fresh device buffer is created from {@code in}. The buffer created by a
 * previous call is released afterwards, so repeated calls do not accumulate
 * device memory. If the kernels have not been created yet, the argument is not
 * set here; {@code buildProgramm()} binds the current {@code input} buffer itself.
 *
 * <p>If {@code in.length} differs from the size of the first layer, a message
 * is printed and nothing is changed.
 *
 * @param in input values, one per element of the first layer
 */
public void setInput(float[] in) {
    if (in.length != layersize[0]) {
        System.out
                .println("array Länge entspricht nicht der Inputsize, setInput schlug fehl");
        return;
    }

    final cl_mem previous = input;
    input = clCreateBuffer(context, CL_MEM_READ_WRITE
            | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * layersize[0],
            Pointer.to(in), null);

    // Only touch the kernel if it exists; before that there is nothing to bind to.
    if (kernel != null && kernel[0] != null) {
        clSetKernelArg(kernel[0], 0, Sizeof.cl_mem, Pointer.to(input));
    }

    // Release the old buffer only after the kernel no longer refers to it, and
    // only if it was a real buffer created by an earlier call.
    if (inputBufferCreated) {
        clReleaseMemObject(previous);
    }
    inputBufferCreated = true;
}

/**
 * Compiles {@code programSource} and creates one {@code sampleKernel} instance
 * per layer transition.
 *
 * <p>Kernel {@code i} is wired as: argument 0 = {@code input} for the first
 * kernel and {@code tmp[i - 1]} for every later one, argument 1 =
 * {@code gewichte[i]}, argument 2 = {@code lengths[i]}, argument 3 =
 * {@code tmp[i]}. Each kernel therefore reads the buffer the previous kernel writes.
 *
 * <p>The {@code kernel} array and the {@code builded} flag are only updated
 * after every kernel has been created and configured, so a failure part-way
 * through does not leave the object marked as built.
 */
public void buildProgramm() {
    program = clCreateProgramWithSource(context, 1,
            new String[] { programSource }, null, null);
    clBuildProgram(program, 0, null, null, null, null);

    final cl_kernel[] created = new cl_kernel[gewichte.length];
    for (int i = 0; i < created.length; i++) {
        // The first layer reads the network input, later layers read the
        // previous layer's result buffer.
        final cl_mem source = (i == 0) ? input : tmp[i - 1];

        created[i] = clCreateKernel(program, "sampleKernel", null);
        clSetKernelArg(created[i], 0, Sizeof.cl_mem, Pointer.to(source));
        clSetKernelArg(created[i], 1, Sizeof.cl_mem, Pointer.to(gewichte[i]));
        clSetKernelArg(created[i], 2, Sizeof.cl_mem, Pointer.to(lengths[i]));
        clSetKernelArg(created[i], 3, Sizeof.cl_mem, Pointer.to(tmp[i]));
    }

    kernel = created;
    builded = true;
}


/**
 * Enqueues the kernels of all layers, in order, on the command queue.
 *
 * <p>Kernel {@code i} is launched with a one-dimensional global size of
 * {@code layersize[i + 1]}. The preferred work-group size is 8; it is only
 * passed to OpenCL when it evenly divides the global size. Otherwise
 * {@code null} is passed so the OpenCL implementation chooses a work-group
 * size itself, instead of the launch failing with an invalid work-group size.
 *
 * <p>This method only enqueues work; it does not wait for completion.
 *
 * @throws Exception if {@code buildProgramm()} has not been called successfully
 */
public void run() throws Exception {
    if (!builded) {
        throw new Exception(
                "buildProgramm muss zuerst aufgerufen werden, run schlug fehl");
    }
    this.local_work_size = new long[] { 8 };

    for (int i = 0; i < gewichte.length; i++) {
        final long[] globalWorkSize = new long[] { layersize[i + 1] };

        // An explicit local size must divide the global size evenly.
        final long[] localWorkSize =
                (globalWorkSize[0] % local_work_size[0] == 0) ? local_work_size : null;

        // Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel[i], 1, null,
                globalWorkSize, localWorkSize, 0, null, null);
    }
}

/**
 * Benchmark driver: creates an OpenCL context and command queue, then times
 * 10000 calls to {@code OpenClNetz.run()} against 10000 calls to
 * {@code Netz.start()} and prints both results.
 *
 * NOTE(review): {@code Netz} is defined elsewhere; presumably it is the plain
 * Java reference implementation (its timing is printed as "time normal").
 * The closing braces of this class are not part of this excerpt.
 */
public class TEST{
public static void main(String args[]) throws Exception
{
    // The platform, device type and device number
    // that will be used
    // NOTE(review): platform 0 with CL_DEVICE_TYPE_DEFAULT is not necessarily
    // a GPU - confirm which device is actually being benchmarked.
    final int platformIndex = 0;
    final long deviceType = CL_DEVICE_TYPE_DEFAULT;
    final int deviceIndex = 0;

    // Enable exceptions and subsequently omit error checks in this sample
    CL.setExceptionsEnabled(true);

    // Obtain the number of platforms
    int numPlatformsArray[] = new int[1];
    clGetPlatformIDs(0, null, numPlatformsArray);
    int numPlatforms = numPlatformsArray[0];

    // Obtain a platform ID
    cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
    clGetPlatformIDs(platforms.length, platforms, null);
    cl_platform_id platform = platforms[platformIndex];

    // Initialize the context properties
    cl_context_properties contextProperties = new cl_context_properties();
    contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);

    // Obtain the number of devices for the platform
    int numDevicesArray[] = new int[1];
    clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
    int numDevices = numDevicesArray[0];

    // Obtain a device ID 
    cl_device_id devices[] = new cl_device_id[numDevices];
    clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
    cl_device_id device = devices[deviceIndex];

    // Create a context for the selected device
    cl_context context = clCreateContext(
        contextProperties, 1, new cl_device_id[]{device}, 
        null, null, null);
    // Create a command-queue for the selected device
    cl_command_queue commandQueue = 
        clCreateCommandQueue(context, device, 0, null);



    // Three entries of 512 are handed to Netz (presumably the layer sizes);
    // the input holds 512 random values in [-0.7, 0.7).
    int[] layersize = {512,512,512};
    float[] in = new float[512];
    for(int i = 0; i < 512; i++){
        in[i] = (float) (Math.random()*1.4 -0.7);
    }
    // Both implementations get the same input; the OpenCL network is built
    // from the weights of the Netz instance.
    Netz net = new Netz(layersize);
    net.set_Input(in);
    OpenClNetz netz= new OpenClNetz(net.gewichte,context,commandQueue);
    netz.buildProgramm();
    netz.setInput(in);
    // Timed region for OpenCL: 10000 calls to run(). The elapsed time is taken
    // after retrieveOutput() and the first println below, so both are included.
    double time = System.currentTimeMillis();
    for(int i = 0; i < 10000; i++){
        netz.run();
    }
    System.out.println(Arrays.toString(netz.retrieveOutput()));
    System.out.println("time OpenCl: " + (System.currentTimeMillis()-time));

    // Timed region for the other implementation: 10000 calls to net.start().
    time = System.currentTimeMillis();

    for(int i = 0; i < 10000; i++){
        net.start();
    }

    System.out.println("time normal: " + (System.currentTimeMillis()-time));
    // Print both results so they can be compared by eye.
    System.out.println(Arrays.toString(netz.retrieveOutput()));
    System.out.println(Arrays.toString(net.start()));

    netz.destroy();






    // Release the command queue and the context
    clReleaseCommandQueue(commandQueue);
    clReleaseContext(context);

normal (running on CPU) : 6475ms

running on GPU (local worksize = 1) : 19110ms
running on GPU (local worksize = 2) : 11778ms
running on GPU (local worksize = 4) : 8985ms
running on GPU (local worksize = 8) : 6880ms
running on GPU (local worksize = 16) : 8237ms              (it becomes slower ?! O.o)
running on GPU (local worksize = 32) : 9298ms              (I'm kinda new to JOCL)
running on GPU (local worksize = 64) : 10062ms