rootbeer CUDA示例代码量化吞吐量增益_Cuda_Rootbeer

rootbeer CUDA示例代码量化吞吐量增益

cuda

rootbeer CUDA示例代码量化吞吐量增益,cuda,rootbeer,Cuda,Rootbeer,下面是我在笔记本电脑上运行的Nvidia CUDA的rootbeer示例代码，Ubuntu12.04（精确版），bumblebee和optirun。这款笔记本电脑采用了Nvidia Optimus，因此推出了optirun。GPU恰好是Nvidia GeFielgt 540M，英伟达网站称它有96个核心。我几乎没有获得吞吐量增益。有什么问题 package com.random.test; import java.util.ArrayList; import java.util.Formatt

下面是我在笔记本电脑上运行的Nvidia CUDA的rootbeer示例代码，Ubuntu12.04（精确版），bumblebee和optirun。这款笔记本电脑采用了Nvidia Optimus，因此推出了optirun。GPU恰好是Nvidia GeFielgt 540M，英伟达网站称它有96个核心。我几乎没有获得吞吐量增益。有什么问题

package com.random.test;

import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;

import edu.syr.pcpratts.rootbeer.runtime.Kernel;
import edu.syr.pcpratts.rootbeer.runtime.Rootbeer;

public class ArraySumApp {
    final static int numberOfJobs = 1024; // 1024 in the original example
    final static int sizeOfArray = 512; // 512 in the original example
    final static int theAnswer = 130816;

    public int[] sumArrays(List<int[]> arrays) {

        List<Kernel> jobs = new ArrayList<Kernel>();
        int[] ret = new int[arrays.size()];
        for (int i = 0; i < arrays.size(); ++i) {
            jobs.add(new ArraySum(arrays.get(i), ret, i));
        }

        Rootbeer rootbeer = new Rootbeer();
        rootbeer.runAll(jobs);
        return ret;
    }

    private static long measureOneJob() {

        int[] source = new int[ArraySumApp.sizeOfArray];
        int[] destination = new int[1];
        for (int i = 0; i < ArraySumApp.sizeOfArray; i++)
            source[i] = i;
        Kernel job = new ArraySum(source, destination, 0);

        ElapsedTimer et = new ElapsedTimer();
        job.gpuMethod();
        long timeInMs = et.stopInMilliseconds();
        System.out.println("measureOneJob " + et.stringInMilliseconds());

        assert destination[0] == ArraySumApp.theAnswer : "cosmic rays";
        return timeInMs;
    }

    public static void main(String[] args) {

        Helper.assertAssertionEnabled();

        // measure the time to do one job
        ArraySumApp.measureOneJob();
        long oneJob = ArraySumApp.measureOneJob();

        ArraySumApp app = new ArraySumApp();
        List<int[]> arrays = new ArrayList<int[]>();

        // you want 1000s of threads to run on the GPU all at once for speedups
        for (int i = 0; i < ArraySumApp.numberOfJobs; ++i) {
            int[] array = new int[ArraySumApp.sizeOfArray];
            for (int j = 0; j < array.length; ++j) {
                array[j] = j;
            }
            arrays.add(array);
        }

        ElapsedTimer et = new ElapsedTimer();
        int[] sums = app.sumArrays(arrays);
        long allJobs = et.stopInMilliseconds();
        System.out.println("measureAllJobs " + et.stringInMilliseconds());

        double gainFactor = ((double) ArraySumApp.numberOfJobs) * oneJob
                / allJobs;
        System.out.println(String.format(
                "throughput gain factor %.1f\nthroughput gain %.1f\n",
                gainFactor, gainFactor - 1.0d));

        // check the number of answers is correct
        assert sums.length == ArraySumApp.numberOfJobs : "cosmic rays";

        // check they all have the answer
        for (int i = 0; i < ArraySumApp.numberOfJobs; i++)
            assert sums[i] == ArraySumApp.theAnswer : "cosmic rays";
    }
}

class ArraySum implements Kernel {

    final static int repetitionFactor = 100000;

    private int[] source;
    private int[] ret;
    private int index;

    public ArraySum(int[] src, int[] dst, int i) {
        source = src;
        ret = dst;
        index = i;
    }

    public void gpuMethod() {
        for (int repetition = 0; repetition < ArraySum.repetitionFactor; repetition++) {
            int sum = 0;
            for (int i = 0; i < source.length; ++i) {
                sum += source[i];
            }
            ret[index] = sum;
        }
    }
}

class Helper {
    private Helper() {
    }

    static void assertAssertionEnabled() {
        try {
            assert false;
        } catch (AssertionError e) {
            return;
        }
        Helper.noteCosmicRays();
    }

    static void noteCosmicRays() // programmer design or logic error
    {
        throw new RuntimeException("cosmic rays");
    }
}

class ElapsedTimer {
    private org.joda.time.DateTime t0;
    private long savedStopInMilliseconds;

    public ElapsedTimer() {
        this.t0 = new org.joda.time.DateTime();
    }

    public long stopInMilliseconds() {
        return stop();
    }

    public String stringInMilliseconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        f.format("%d ms", this.savedStopInMilliseconds);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInMilliseconds() {
        stop();
        return stringInMilliseconds();
    }

    public String stringInSecondsAndMilliseconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        f.format("%5.3f s", this.savedStopInMilliseconds / 1000.0d);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInSecondsAndMilliseconds() {
        stop();
        return stringInSecondsAndMilliseconds();
    }

    public long stopInSeconds() {
        return (stop() + 500L) / 1000L; // rounding
    }

    public String stringInSeconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        long elapsed = (this.savedStopInMilliseconds + 500L) / 1000L; // rounding
        f.format("%d s", elapsed);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInSeconds() {
        stop();
        return stringInSeconds();
    }

    /**
     * This is private. Use the stopInMilliseconds method if this is what you
     * need.
     */
    private long stop() {
        org.joda.time.DateTime t1 = new org.joda.time.DateTime();
        savedStopInMilliseconds = t1.getMillis() - this.t0.getMillis();
        return savedStopInMilliseconds;
    }
}

rootbeer开发人员表示，获取数组元素之和的示例代码不是最好的示例，另一个示例将显示吞吐量增益。

您可以看到：

这是2013年NVIDIA GTC会议的一个例子。我在使用转置的4核Java矩阵乘法上获得了20倍的加速

该示例是使用GPU上的共享内存的平铺矩阵乘法。从英伟达文学看，使用共享内存是获得良好加速的最重要的保证之一。要使用共享内存，请让块中的每个线程将值加载到共享数组中。然后您必须多次重用这些共享值。这样可以节省从全局内存中提取的时间

在Tesla 2.0体系结构上，从全局内存获取大约需要200-300个时钟周期，从共享内存获取大约需要2-3个时钟周期。

我认为您需要向rootbeer的作者询问这一点。

measureOneJob 110 ms
measureOneJob 26 ms
CudaRuntime2 ctor: elapsedTimeMillis: 609
measureAllJobs 24341 ms
throughput gain factor 1.1
throughput gain 0.1