OpenCL 与 Java:奇怪的性能结果

OpenCL 与 Java:奇怪的性能结果,java,opencl,nvidia,gpgpu,jocl,Java,Opencl,Nvidia,Gpgpu,Jocl,我正在尝试使用OpenCL来提高一些Java代码的性能。我一直在浏览他们网站上提供的示例,并用它们组合出一个快速程序,将其性能与正常运行的程序进行比较。不过,我得到的结果有点出乎意料,我担心我可能做错了什么 首先,我使用的是jocl0.1.9,因为我有一个不支持OpenCL/jocl2.0的NVIDIA卡。我的电脑有一个Intel Core i7 CPU、一个Intel HD Graphics 530卡和一个NVIDIA Quadro M2000M 我写的程序是基于JOCL样本的;它将两个数字数

我正在尝试使用OpenCL来提高一些Java代码的性能。我一直在浏览他们网站上提供的示例,并用它们组合出一个快速程序,将其性能与正常运行的程序进行比较。不过,我得到的结果有点出乎意料,我担心我可能做错了什么

首先,我使用的是 JOCL 0.1.9,因为我的 NVIDIA 显卡不支持 OpenCL 2.0(因此无法使用 JOCL 2.0)。我的电脑有一个 Intel Core i7 CPU、一块 Intel HD Graphics 530 集成显卡和一块 NVIDIA Quadro M2000M 显卡

我写的程序是基于JOCL样本的;它将两个数字数组相乘,将结果放入第三个数组。我使用Java的nanoTime()方法大致跟踪Java观察到的执行时间

/**
 * Rough benchmark that multiplies two float arrays element-wise, once in plain Java and once
 * through OpenCL (JOCL) on each of three known devices: an NVIDIA Quadro GPU, an Intel Core CPU
 * and an Intel HD Graphics GPU. Each run is timed with {@link System#nanoTime()} and the elapsed
 * times are printed to standard output.
 */
public class PerformanceComparison {

    public static final int ARRAY_SIZE = 1000000;

    // OpenCL kernel code
    private static final String programSource = "__kernel void " + "sampleKernel(__global const float *a,"
            + "             __global const float *b," + "             __global float *c)" + "{"
            + "    int gid = get_global_id(0);" + "    c[gid] = a[gid] * b[gid];" + "}";

    /**
     * Builds the input arrays, locates the three expected devices by name, runs the multiplication
     * in the JVM and on every device that was found, and prints the timings.
     *
     * @param args ignored
     */
    public static final void main(String[] args) {
        // build arrays
        float[] sourceA = new float[ARRAY_SIZE];
        float[] sourceB = new float[ARRAY_SIZE];
        float[] nvidiaResult = new float[ARRAY_SIZE];
        float[] intelCPUResult = new float[ARRAY_SIZE];
        float[] intelGPUResult = new float[ARRAY_SIZE];
        float[] javaResult = new float[ARRAY_SIZE];

        for (int i = 0; i < ARRAY_SIZE; i++) {
            sourceA[i] = i;
            sourceB[i] = i;
        }

        // Ask the runtime how many platforms exist instead of assuming there are exactly two;
        // a fixed-size array leaves null entries (or drops platforms) on other machines.
        int[] platformCount = new int[1];
        clGetPlatformIDs(0, null, platformCount);
        cl_platform_id[] platforms = new cl_platform_id[platformCount[0]];
        if (platforms.length > 0) {
            clGetPlatformIDs(platforms.length, platforms, null);
        }

        // I know what devices I have, so declare variables for each of them
        cl_context intelCPUContext = null;
        cl_context intelGPUContext = null;
        cl_context nvidiaContext = null;
        cl_device_id intelCPUDevice = null;
        cl_device_id intelGPUDevice = null;
        cl_device_id nvidiaDevice = null;

        // get all devices on all platforms
        for (cl_platform_id platform : platforms) {
            cl_context_properties properties = new cl_context_properties();
            properties.addProperty(CL_CONTEXT_PLATFORM, platform);

            // Same two-step query for devices: count first, then fetch.
            int[] numDevices = new int[1];
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, null, numDevices);
            if (numDevices[0] <= 0) {
                continue;
            }
            cl_device_id[] devices = new cl_device_id[numDevices[0]];
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices.length, devices, null);

            // get devices and build contexts
            for (cl_device_id device : devices) {
                String deviceName = readDeviceName(device);

                cl_context context = clCreateContext(properties, 1, new cl_device_id[] { device }, null, null, null);

                // save based on the device name; else-if so one context is never stored (and later
                // released) under two different slots
                if (deviceName.contains("Quadro")) {
                    nvidiaContext = context;
                    nvidiaDevice = device;
                } else if (deviceName.contains("Core(TM)")) {
                    intelCPUContext = context;
                    // Fixed: this used to assign intelGPUDevice, leaving the CPU device null.
                    intelCPUDevice = device;
                } else if (deviceName.contains("HD Graphics")) {
                    intelGPUContext = context;
                    intelGPUDevice = device;
                } else {
                    // Not one of the devices being benchmarked: do not leak its context.
                    clReleaseContext(context);
                }
            }
        }

        // multiply the arrays using Java and on each of the devices
        long jvmElapsed = runInJVM(sourceA, sourceB, javaResult);
        long intelCPUElapsed = runIfPresent(intelCPUContext, intelCPUDevice, sourceA, sourceB, intelCPUResult);
        long intelGPUElapsed = runIfPresent(intelGPUContext, intelGPUDevice, sourceA, sourceB, intelGPUResult);
        long nvidiaElapsed = runIfPresent(nvidiaContext, nvidiaDevice, sourceA, sourceB, nvidiaResult);

        // results
        System.out.println("Standard Java Runtime: " + jvmElapsed + " ns");
        System.out.println("Intel CPU Runtime: " + formatElapsed(intelCPUElapsed));
        System.out.println("Intel GPU Runtime: " + formatElapsed(intelGPUElapsed));
        System.out.println("NVIDIA GPU Runtime: " + formatElapsed(nvidiaElapsed));
    }

    /**
     * Reads the {@code CL_DEVICE_NAME} of a device.
     *
     * @param device the device to query
     * @return the device name without its final reported byte (the string terminator)
     */
    private static String readDeviceName(cl_device_id device) {
        long[] length = new long[1];
        byte[] buffer = new byte[2000];
        clGetDeviceInfo(device, CL_DEVICE_NAME, buffer.length, Pointer.to(buffer), length);

        // Guard against a zero reported length, which would otherwise yield a negative count.
        int nameLength = (int) Math.max(0L, length[0] - 1);
        return new String(buffer, 0, nameLength);
    }

    /**
     * Runs the OpenCL benchmark only when the device was actually found.
     *
     * @return the elapsed nanoseconds, or {@code -1} when the context or device is missing
     */
    private static long runIfPresent(cl_context context, cl_device_id device, float[] sourceA, float[] sourceB,
            float[] result) {
        if (context == null || device == null) {
            return -1;
        }
        return runInJOCL(context, device, sourceA, sourceB, result);
    }

    /**
     * Formats an elapsed time for printing.
     *
     * @param elapsedNanos nanoseconds, or a negative value meaning "device not found"
     * @return {@code "<n> ns"}, or a short notice when no measurement exists
     */
    private static String formatElapsed(long elapsedNanos) {
        return elapsedNanos < 0 ? "n/a (device not found)" : elapsedNanos + " ns";
    }

    /**
     * The basic Java approach - loop through the arrays, and save their results into the third array
     * 
     * @param sourceA multiplicand
     * @param sourceB multiplier
     * @param result product
     * @return the (rough) execution time in nanoseconds
     */
    private static long runInJVM(float[] sourceA, float[] sourceB, float[] result) {
        long startTime = System.nanoTime();
        for (int i = 0; i < ARRAY_SIZE; i++) {
            result[i] = sourceA[i] * sourceB[i];
        }
        long endTime = System.nanoTime();
        return endTime - startTime;
    }

    /**
     * Run a more-or-less equivalent program in OpenCL on the specified device.
     * <p>
     * The measured interval covers enqueuing the kernel and the blocking read of the result
     * buffer; buffer creation and program compilation happen before the timer starts. This method
     * releases {@code context} before returning, so the caller must not reuse it.
     * 
     * @param context JOCL context
     * @param device JOCL device
     * @param sourceA multiplicand
     * @param sourceB multiplier
     * @param result product
     * @return the (rough) execution time in nanoseconds
     */
    private static long runInJOCL(cl_context context, cl_device_id device, float[] sourceA, float[] sourceB,
            float[] result) {
        // create command queue. Properties 0 selects an in-order queue: the kernel and the read
        // below are enqueued without any event dependency, so with out-of-order execution the read
        // would be allowed to start before the kernel has finished.
        cl_command_queue commandQueue = clCreateCommandQueue(context, device, 0, null);

        // allocate memory
        cl_mem memObjects[] = new cl_mem[3];
        memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * ARRAY_SIZE,
                Pointer.to(sourceA), null);
        memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * ARRAY_SIZE,
                Pointer.to(sourceB), null);
        memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, Sizeof.cl_float * ARRAY_SIZE, null, null);

        // build program and set arguments
        cl_program program = clCreateProgramWithSource(context, 1, new String[] { programSource }, null, null);

        clBuildProgram(program, 0, null, null, null, null);

        cl_kernel kernel = clCreateKernel(program, "sampleKernel", null);

        clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(memObjects[0]));
        clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(memObjects[1]));
        clSetKernelArg(kernel, 2, Sizeof.cl_mem, Pointer.to(memObjects[2]));

        long global_work_size[] = new long[]{ARRAY_SIZE};
        // NOTE(review): a local work size of 1 puts a single work-item in every work-group.
        // Passing null instead lets the OpenCL implementation choose the work-group size; it is
        // kept at 1 here because that is the configuration this benchmark was written to measure.
        long local_work_size[] = new long[]{1};

        // Execute the kernel
        long startTime = System.nanoTime();
        clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
            global_work_size, local_work_size, 0, null, null);

        // Read the output data (CL_TRUE makes this call block until the copy is complete)
        clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0,
            ARRAY_SIZE * Sizeof.cl_float, Pointer.to(result), 0, null, null);
        long endTime = System.nanoTime();

        // Release kernel, program, and memory objects
        clReleaseMemObject(memObjects[0]);
        clReleaseMemObject(memObjects[1]);
        clReleaseMemObject(memObjects[2]);
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(commandQueue);
        clReleaseContext(context);

        return endTime - startTime;
    }
}
有两件事让我困惑:

  • 当使用OpenCL时,为什么程序在CPU上运行得更快?JVM将使用相同的设备;我知道Java比OpenCL这样的低级语言慢,但我认为它没有那么慢
  • NVIDIA 显卡出了什么问题?我知道由于他们主推自家的 CUDA 框架,对 OpenCL 的支持算不上出色,但我仍然期望它至少比普通 Java 运行得更快。而实际上,那块作为备用的("万一你把真正的显卡弄坏了,还有它可用")Intel 集成 GPU 却远远快过它。我担心我做错了什么,或者至少遗漏了一些能让它充分发挥潜力的东西。非常欢迎任何建议

    另外,我知道,由于我有一张NVIDIA卡,CUDA可能是我更好/更快的选择;但是在这种情况下,我更喜欢OpenCL的灵活性

    更新:我发现我做错了一件事;仅依靠 Java 来报告运行时间是不明智的。我使用 OpenCL 的性能分析(profiling)功能编写了一个新的测试,得到了更合理的结果:

    代码:

    这似乎表明功能更强大的NVIDIA卡实际上比Intel卡的性能更好,正如我所预期的那样。但是

  • 为什么CPU速度更快
  • 为什么普通Java突然变得如此之快

  • 我仍在摸索,试图理解这一点,但我会在这里发布一个实际的答案,以帮助像我这样的无知新手。希望那些不那么无知的人很快会来纠正我的错误,但至少其他无知的新手可以看到我的工作经历并从中学习

    正如我在编辑问题时所指出的,部分奇怪的结果是因为我依赖 Java 来告诉我程序运行得有多快。我认为这种做法并非完全错误,只是我误读了数据。Java 测得的运行时间包含了把所有数据移入、移出 GPU 内存所需的时间,而 OpenCL 报告的运行时间只包含内核本身运行所需的时间;毕竟,OpenCL 并不知道、也不关心是谁在调用它。启用 OpenCL 性能分析并使用事件来跟踪运行时间,帮助我弄清了这一点。这也解释了为什么在 CPU 上两种运行时间的差距非常小:它实际上并没有切换设备,所以没有发生内存传输

    我还注意到我上面的代码确实有一个严重的缺陷。将内核命令排入队列时,CL.clEnqueueNDRangeKernel 接受九个参数。第六个参数称为"本地工作大小"(local_work_size);它似乎指定了希望 OpenCL 用于运行代码的"工作组"的数量(更准确地说,它指定的是每个工作组中工作项的数量)。我能想到的与 Java 最接近的类比是线程;更多的线程(通常)意味着可以同时完成更多的工作(直到某个限度)。在上面的代码中,我照着示例的做法,告诉 OpenCL 使用单个工作组;基本上就是在一个线程中运行所有内容。我的理解是,对 GPU 来说这恰恰是错误的做法;使用 GPU 的全部意义在于,它一次可以处理比 CPU 多得多的计算。强制 GPU 一次只执行一个计算就违背了这一初衷。这里最好的做法似乎是将第六个参数留空(传入 null);这会让 OpenCL 自行创建它认为必要的工作组。您也可以指定一个数字,但允许的最大值因设备而异(您可以使用 CL.clGetDeviceInfo 获取设备的 CL_DEVICE_MAX_WORK_GROUP_SIZE 属性以确定绝对最大值,但如果使用多个维度,情况会变得更复杂)

    短版

  • OpenCL的评测将为您提供比Java更好的计时统计数据(但是使用两者将有助于显示CPU和GPU之间切换所需的延迟)
  • 在调用CL.clEnqueueNDRangeKernel时不要指定本地工作大小-这让OpenCL自动处理“多线程”
  • 新结果:

    Information for Quadro M2000M
        GPU Runtime: 35.88192 ms
        Java Runtime: 438.165651 ms
    Information for Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
        GPU Runtime: 166.278112 ms
        Java Runtime: 167.128259 ms
    Information for Intel(R) HD Graphics 530
        GPU Runtime: 90.985728 ms
        Java Runtime: 239.230354 ms
    JVM Benchmark: 177.824372 ms
    

    您可以尝试一下 LWJGL 的 OpenCL 实现吗?该库提供了许多用于本机内存分配的功能(在动手实现之前请先阅读库文档)。无论如何,GPU 版本更可能把大部分时间花在主机内存与显存之间的数据传输上。此外,性能分析(profile)可以显示哪些函数/代码块是瓶颈。是的,我认为 CPU 和 GPU 之间的来回传输正是 GPU 运行时间与 Java 运行时间之间存在巨大差距的原因。我会去看看您提到的 LWJGL 库,谢谢。可以采用分而治之(divide and conquer)的方式把数据传输与计算流水线化,使二者相互重叠。旁注(有点晚了,抱歉):在许多情况下,您只需将
    null
    作为
    local_work_size
    传递即可。这样,OpenCL实现将自动确定“适当”的本地工作大小。但是,您应该考虑到全局工作大小必须被本地工作大小整除(因此,我猜想全局工作大小不应该是素数)。超过
    /**
     * Second take on the benchmark: runs a kernel that writes the sum, difference, product and
     * quotient of two float arrays on every OpenCL device of every platform, printing both the
     * OpenCL profiling time of the kernel and the wall-clock time seen from Java. It then times
     * the same four operations in a plain Java loop.
     */
    public class PerformanceComparisonTakeTwo {

        //@formatter:off
        private static final String PROFILE_TEST = 
                "__kernel void " 
                + "sampleKernel(__global const float *a,"
                + "             __global const float *b,"
                + "             __global float *c,"
                + "             __global float *d,"
                + "             __global float *e,"
                + "             __global float *f)" 
                + "{"
                + "    int gid = get_global_id(0);" 
                + "    c[gid] = a[gid] + b[gid];"
                + "    d[gid] = a[gid] - b[gid];"
                + "    e[gid] = a[gid] * b[gid];"
                + "    f[gid] = a[gid] / b[gid];"
                + "}";
        //@formatter:on
        private static final int ARRAY_SIZE = 100000000;

        /**
         * Program entry point; simply starts the benchmark.
         *
         * @param args ignored
         */
        public static final void main(String[] args) {
            initialize();
        }

        /** Discovers every platform and device, then runs the profiling test on all of them. */
        public static void initialize() {
            // identify all platforms, map each device to its platform, then benchmark
            performProfilingTest(getDevices(getPlatforms()));
        }

        /**
         * Queries the number of available platforms and then fetches them.
         *
         * @return one entry per reported platform
         */
        private static cl_platform_id[] getPlatforms() {
            int[] count = new int[1];
            clGetPlatformIDs(0, null, count);

            cl_platform_id[] found = new cl_platform_id[count[0]];
            clGetPlatformIDs(found.length, found, count);
            return found;
        }

        /**
         * Collects every device (of any type) of the given platforms.
         *
         * @param platforms the platforms to enumerate
         * @return a map from each device to the platform it was reported by
         */
        private static Map<cl_device_id, cl_platform_id> getDevices(cl_platform_id[] platforms) {
            Map<cl_device_id, cl_platform_id> platformByDevice = new HashMap<>();

            for (cl_platform_id platform : platforms) {
                // first call obtains the count, second call fills the array
                int[] count = new int[1];
                clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, null, count);

                cl_device_id[] found = new cl_device_id[count[0]];
                clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, found.length, found, null);

                for (cl_device_id device : found) {
                    platformByDevice.put(device, platform);
                }
            }

            return platformByDevice;
        }

        /**
         * Fills two input arrays with their own indices, profiles the kernel on each device in the
         * map and finally times the equivalent loop in the JVM.
         *
         * @param deviceMap devices to benchmark, each mapped to its platform
         */
        private static void performProfilingTest(Map<cl_device_id, cl_platform_id> deviceMap) {
            float[] sourceA = new float[ARRAY_SIZE];
            float[] sourceB = new float[ARRAY_SIZE];

            int index = 0;
            while (index < ARRAY_SIZE) {
                sourceA[index] = index;
                sourceB[index] = index;
                index++;
            }

            for (Entry<cl_device_id, cl_platform_id> entry : deviceMap.entrySet()) {
                profileOnDevice(entry.getKey(), entry.getValue(), sourceA, sourceB);
            }

            benchmarkInJvm(sourceA, sourceB);
        }

        /**
         * Builds a context, queue, buffers and kernel for one device, runs the kernel once and
         * prints the device name with the profiled and the wall-clock run time. All OpenCL objects
         * created here are released before returning.
         */
        private static void profileOnDevice(cl_device_id device, cl_platform_id platform, float[] sourceA,
                float[] sourceB) {
            cl_context_properties contextProperties = new cl_context_properties();
            contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);

            cl_context context = clCreateContext(contextProperties, 1, new cl_device_id[] { device }, null, null,
                    null);

            // profiling must be enabled on the queue for the event timestamps read in getRuntime
            cl_command_queue queue = clCreateCommandQueue(context, device,
                    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, null);

            // two read-only inputs copied from the host, followed by four output buffers
            cl_mem[] buffers = new cl_mem[6];
            buffers[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_float * ARRAY_SIZE, Pointer.to(sourceA), null);
            buffers[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_float * ARRAY_SIZE, Pointer.to(sourceB), null);
            for (int slot = 2; slot < buffers.length; slot++) {
                buffers[slot] = clCreateBuffer(context, CL_MEM_READ_WRITE, Sizeof.cl_float * ARRAY_SIZE, null, null);
            }

            cl_program program = clCreateProgramWithSource(context, 1, new String[] { PROFILE_TEST }, null, null);
            clBuildProgram(program, 0, null, null, null, null);
            cl_kernel kernel = clCreateKernel(program, "sampleKernel", null);

            // kernel argument i is buffer i
            int argIndex = 0;
            for (cl_mem buffer : buffers) {
                clSetKernelArg(kernel, argIndex, Sizeof.cl_mem, Pointer.to(buffer));
                argIndex++;
            }

            cl_event kernelEvent = new cl_event();
            long[] globalSize = { ARRAY_SIZE };
            long[] localSize = { 1 };

            // wall-clock time covers enqueueing the kernel and waiting for it to complete
            long wallStart = System.nanoTime();
            clEnqueueNDRangeKernel(queue, kernel, 1, null, globalSize, localSize, 0, null, kernelEvent);
            clWaitForEvents(1, new cl_event[] { kernelEvent });
            long wallEnd = System.nanoTime();

            System.out.println("Information for " + getDeviceInfoString(device, CL_DEVICE_NAME));
            System.out.println("\tGPU Runtime: " + getRuntime(kernelEvent));
            System.out.println("\tJava Runtime: " + ((wallEnd - wallStart) / 1e6) + " ms");

            clReleaseEvent(kernelEvent);
            for (cl_mem buffer : buffers) {
                clReleaseMemObject(buffer);
            }
            clReleaseKernel(kernel);
            clReleaseProgram(program);
            clReleaseCommandQueue(queue);
            clReleaseContext(context);
        }

        /** Times the same four element-wise operations in a plain Java loop and prints the result. */
        private static void benchmarkInJvm(float[] sourceA, float[] sourceB) {
            float[] sums = new float[ARRAY_SIZE];
            float[] differences = new float[ARRAY_SIZE];
            float[] products = new float[ARRAY_SIZE];
            float[] quotients = new float[ARRAY_SIZE];

            long begin = System.nanoTime();
            for (int k = 0; k < ARRAY_SIZE; k++) {
                sums[k] = sourceA[k] + sourceB[k];
                differences[k] = sourceA[k] - sourceB[k];
                products[k] = sourceA[k] * sourceB[k];
                quotients[k] = sourceA[k] / sourceB[k];
            }
            long finish = System.nanoTime();

            System.out.println("JVM Benchmark: " + ((finish - begin) / 1e6) + " ms");
        }

        /**
         * Reads a string-valued device property.
         *
         * @param device the device to query
         * @param parameter the {@code clGetDeviceInfo} parameter name
         * @return the reported bytes, minus the last one, decoded as a string
         */
        private static String getDeviceInfoString(cl_device_id device, int parameter) {
            // ask for the required size first, then fetch the value into a buffer of that size
            long[] size = new long[1];
            clGetDeviceInfo(device, parameter, 0, null, size);

            byte[] raw = new byte[(int) size[0]];
            clGetDeviceInfo(device, parameter, size[0], Pointer.to(raw), null);

            int textLength = raw.length - 1;
            return new String(raw, 0, textLength);
        }

        /**
         * Computes how long the command behind an event ran, from its profiling timestamps.
         *
         * @param event a completed event from a profiling-enabled queue
         * @return the difference between the END and START timestamps, formatted as milliseconds
         */
        private static String getRuntime(cl_event event) {
            long[] startedAt = new long[1];
            long[] endedAt = new long[1];

            clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, Sizeof.cl_ulong, Pointer.to(startedAt), null);
            clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, Sizeof.cl_ulong, Pointer.to(endedAt), null);

            long elapsedNanos = endedAt[0] - startedAt[0];
            return (elapsedNanos / 1e6) + " ms";
        }

    }
    
    Information for Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
        GPU Runtime: 639.986906 ms
        Java Runtime: 641.590764 ms
    Information for Quadro M2000M
        GPU Runtime: 794.972 ms
        Java Runtime: 1191.357248 ms
    Information for Intel(R) HD Graphics 530
        GPU Runtime: 1897.876624 ms
        Java Runtime: 2065.011125 ms
    JVM Benchmark: 192.680669 ms
    
    Information for Quadro M2000M
        GPU Runtime: 35.88192 ms
        Java Runtime: 438.165651 ms
    Information for Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
        GPU Runtime: 166.278112 ms
        Java Runtime: 167.128259 ms
    Information for Intel(R) HD Graphics 530
        GPU Runtime: 90.985728 ms
        Java Runtime: 239.230354 ms
    JVM Benchmark: 177.824372 ms