Alea GPU: how to make Alea faster?
After implementing various ML algorithms in Alea, I tried benchmarking some simple but important routines. I was surprised to find that Alea takes roughly 3x longer than the equivalent cuBLAS call to sgeam doing the same thing. If I were doing something more complex, such as matrix multiplication where I have to deal with shared memory, this would be understandable, but the following is just a simple array transformation:
let dmat = createRandomUniformMatrix 100 1000 1.0f 0.0f
let dmat2 = createRandomUniformMatrix 100 1000 1.0f 0.0f
let rmat = createEmptyMatrixLike dmat

let m = new DeviceUnaryTransformModule<float32> <@ fun x -> x*2.0f @>

#time
//4.85s/100k
for i=1 to 100000 do
    m.Apply(dmat, rmat) |> ignore
#time

#time
//1.8s/100k
for i=1 to 100000 do
    sgeam2 nT nT 2.0f dmat 0.0f dmat2 rmat |> ignore
#time
The kernel of the DeviceUnaryTransformModule is the same as in the basic transform example; the only difference is that afterwards it does not gather the result back to the host, but keeps the data on the device.
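For reference, here is a minimal sketch of what such a module might look like. This is an assumption, since the question does not show the module: it follows the same GPUModule pattern as the MapModule in the answer below, is restricted to float32 for simplicity (the asker's version is generic), and the dMatrix type with its dArray member is a hypothetical stand-in for the asker's matrix wrapper.

// Hypothetical sketch of the asker's module (not shown in the question).
// Assumes a dMatrix wrapper exposing a DeviceMemory<float32> via .dArray.
type DeviceUnaryTransformModule(target, op:Expr<float32 -> float32>) =
    inherit GPUModule(target)

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (n:int) (x:deviceptr<float32>) (y:deviceptr<float32>) =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        let mutable i = start
        while i < n do
            y.[i] <- __eval(op) x.[i]   // apply the quoted op elementwise
            i <- i + stride

    member this.Apply(x:dMatrix, y:dMatrix) =
        let n = x.dArray.Length
        let lp = LaunchParam(64, 256)
        this.GPULaunch <@ this.Kernel @> lp n x.dArray.Ptr y.dArray.Ptr
        y   // the result stays on the device; nothing is gathered to the host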
Also, Unbound's reduce performs very poorly for me, so poorly in fact that there must be something wrong with how I am using it. It is roughly 20x slower than summing the matrix by calling sgeamv twice:
let makeReduce (op:Expr<'T -> 'T -> 'T>) =
    let compileReductionKernel (op:Expr<'T -> 'T -> 'T>) =
        worker.LoadProgram(
            DeviceReduceImpl.DeviceReduce(op, worker.Device.Arch, PlatformUtil.Instance.ProcessBitness).Template
            )

    let prog = compileReductionKernel op

    let runReduceProgram (sumProg : Program<DeviceReduceImpl.IDeviceReduceFactory<'A>>) (x: DeviceMemory<'A>) =
        sumProg.Entry.Create(blob, x.Length)
            .Reduce(None, x.Ptr, x.Length)

    let reduceProg (x: DeviceMemory<'T>) = runReduceProgram prog x
    reduceProg

let sumReduce: DeviceMemory<float32> -> float32 = makeReduce <@ fun (a:float32) b -> a + b @>

#time
//3.5s/10k
for i=1 to 10000 do
    sumReduce dmat.dArray |> ignore
#time
I think there are a few problems in your test code:
1. In your map module, you should pre-load the GPUModule. A GPUModule is JIT-compiled when it is first launched, so your timing measurement actually includes the GPU code compilation time.
2. In the map module, for both the Alea code and the cuBLAS code, you should synchronize the worker (i.e. synchronize the CUDA context). CUDA programming is asynchronous in style: launching a kernel returns immediately without waiting for the kernel to complete. If you do not synchronize the worker, you are really measuring kernel launch time rather than kernel execution time. Alea GPU's launch time will be slower than native C code because it does some marshalling of the kernel arguments. There are some further issues related to kernel launch time, which I will show you in the example code below.
3. Your reduce test actually loads the reduce module on every call! In other words, every time you do a reduce, the measured time includes the GPU compilation time. It is recommended that you make instances of GPU modules or programs long-lived, since they represent compiled GPU code. (A sketch applying these fixes to your original loop follows right after this list.)
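As a condensed illustration only, here is a sketch of how the original timing loop might look with those three fixes applied. It assumes the asker's DeviceUnaryTransformModule inherits GPUModule (so GPUForceLoad is available) and that the worker object is in scope; both are assumptions about code not shown in the question.

// create the module once and keep it alive; force the JIT compilation up front
let m = new DeviceUnaryTransformModule<float32> <@ fun x -> x*2.0f @>
m.GPUForceLoad()   // assumption: inherited from GPUModule, compiles the GPU code now

#time
for i = 1 to 100000 do
    m.Apply(dmat, rmat) |> ignore
// kernel launches are asynchronous, so synchronize before stopping the clock;
// otherwise the loop above only measures launch overhead
worker.Synchronize()
#time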
So I made a test following your usage. Here I first list the complete test code:
#r @"packages\Alea.CUDA.2.1.2.3274\lib\net40\Alea.CUDA.dll"
#r @"packages\Alea.CUDA.IL.2.1.2.3274\lib\net40\Alea.CUDA.IL.dll"
#r @"packages\Alea.CUDA.Unbound.2.1.2.3274\lib\net40\Alea.CUDA.Unbound.dll"
#r "System.Configuration"
open System.IO
Alea.CUDA.Settings.Instance.Resource.AssemblyPath <- Path.Combine(@"packages\Alea.CUDA.2.1.2.3274", "private")
Alea.CUDA.Settings.Instance.Resource.Path <- Path.GetTempPath()
open Alea.CUDA
open Alea.CUDA.Utilities
open Alea.CUDA.CULib
open Alea.CUDA.Unbound
open Microsoft.FSharp.Quotations
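// The same elementwise kernel is implemented twice below: MapModule uses the
// class-based GPUModule API, while mapTemplate uses the cuda workflow (template) API.
// test1-test5 time these two variants under different conditions, and
// test6/test7 time the equivalent cuBLAS Sgeam call.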
type MapModule(target, op:Expr<float32 -> float32>) =
    inherit GPUModule(target)

    [<Kernel;ReflectedDefinition>]
    member this.Kernel (C:deviceptr<float32>) (A:deviceptr<float32>) (B:deviceptr<float32>) (n:int) =
        let start = blockIdx.x * blockDim.x + threadIdx.x
        let stride = gridDim.x * blockDim.x
        let mutable i = start
        while i < n do
            C.[i] <- __eval(op) A.[i] + __eval(op) B.[i]
            i <- i + stride

    member this.Apply(C:deviceptr<float32>, A:deviceptr<float32>, B:deviceptr<float32>, n:int) =
        let lp = LaunchParam(64, 256)
        this.GPULaunch <@ this.Kernel @> lp C A B n

let inline mapTemplate (op:Expr<'T -> 'T>) = cuda {
    let! kernel =
        <@ fun (C:deviceptr<'T>) (A:deviceptr<'T>) (B:deviceptr<'T>) (n:int) ->
            let start = blockIdx.x * blockDim.x + threadIdx.x
            let stride = gridDim.x * blockDim.x
            let mutable i = start
            while i < n do
                C.[i] <- (%op) A.[i] + (%op) B.[i]
                i <- i + stride @>
        |> Compiler.DefineKernel

    return Entry(fun program ->
        let worker = program.Worker
        let kernel = program.Apply kernel
        let lp = LaunchParam(64, 256)
        let run C A B n =
            kernel.Launch lp C A B n
        run ) }
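// test1: module created but not pre-loaded, so the first Apply call also pays
// the JIT compilation cost of the GPU code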
let test1 (worker:Worker) m n sync iters =
    let n = m * n
    use m = new MapModule(GPUModuleTarget.Worker(worker), <@ fun x -> x * 2.0f @>)
    let rng = System.Random(42)
    use A = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use B = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use C = worker.Malloc<float32>(n)
    let timer = System.Diagnostics.Stopwatch.StartNew()
    for i = 1 to iters do
        m.Apply(C.Ptr, A.Ptr, B.Ptr, n)
    if sync then worker.Synchronize()
    timer.Stop()
    printfn "%f ms / %d %s (no pre-load module)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test2 (worker:Worker) m n sync iters =
    let n = m * n
    use m = new MapModule(GPUModuleTarget.Worker(worker), <@ fun x -> x * 2.0f @>)
    // we pre-load the module, this will JIT compile the GPU code
    m.GPUForceLoad()
    let rng = System.Random(42)
    use A = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use B = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use C = worker.Malloc<float32>(n)
    let timer = System.Diagnostics.Stopwatch.StartNew()
    for i = 1 to iters do
        m.Apply(C.Ptr, A.Ptr, B.Ptr, n)
    if sync then worker.Synchronize()
    timer.Stop()
    printfn "%f ms / %d %s (pre-loaded module)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test3 (worker:Worker) m n sync iters =
    let n = m * n
    use m = new MapModule(GPUModuleTarget.Worker(worker), <@ fun x -> x * 2.0f @>)
    // we pre-load the module, this will JIT compile the GPU code
    m.GPUForceLoad()
    let rng = System.Random(42)
    use A = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use B = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use C = worker.Malloc<float32>(n)
    // since the worker is running in a background thread
    // each cuda api will switch to that thread
    // use eval() to avoid the many thread switching
    worker.Eval <| fun _ ->
        let timer = System.Diagnostics.Stopwatch.StartNew()
        for i = 1 to iters do
            m.Apply(C.Ptr, A.Ptr, B.Ptr, n)
        if sync then worker.Synchronize()
        timer.Stop()
        printfn "%f ms / %d %s (pre-loaded module + worker.eval)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test4 (worker:Worker) m n sync iters =
    use program = worker.LoadProgram(mapTemplate <@ fun x -> x * 2.0f @>)
    let n = m * n
    let rng = System.Random(42)
    use A = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use B = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use C = worker.Malloc<float32>(n)
    let timer = System.Diagnostics.Stopwatch.StartNew()
    for i = 1 to iters do
        program.Run C.Ptr A.Ptr B.Ptr n
    if sync then worker.Synchronize()
    timer.Stop()
    printfn "%f ms / %d %s (template usage)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test5 (worker:Worker) m n sync iters =
    use program = worker.LoadProgram(mapTemplate <@ fun x -> x * 2.0f @>)
    let n = m * n
    let rng = System.Random(42)
    use A = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use B = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use C = worker.Malloc<float32>(n)
    worker.Eval <| fun _ ->
        let timer = System.Diagnostics.Stopwatch.StartNew()
        for i = 1 to iters do
            program.Run C.Ptr A.Ptr B.Ptr n
        if sync then worker.Synchronize()
        timer.Stop()
        printfn "%f ms / %d %s (template usage + worker.Eval)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test6 (worker:Worker) m n sync iters =
    use cublas = new CUBLAS(worker)
    let rng = System.Random(42)
    use dmat1 = worker.Malloc(Array.init (m * n) (fun _ -> rng.NextDouble() |> float32))
    use dmat2 = worker.Malloc(Array.init (m * n) (fun _ -> rng.NextDouble() |> float32))
    use dmatr = worker.Malloc<float32>(m * n)
    let timer = System.Diagnostics.Stopwatch.StartNew()
    for i = 1 to iters do
        cublas.Sgeam(cublasOperation_t.CUBLAS_OP_N, cublasOperation_t.CUBLAS_OP_N, m, n, 2.0f, dmat1.Ptr, m, 2.0f, dmat2.Ptr, m, dmatr.Ptr, m)
    if sync then worker.Synchronize()
    timer.Stop()
    printfn "%f ms / %d %s (cublas)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test7 (worker:Worker) m n sync iters =
    use cublas = new CUBLAS(worker)
    let rng = System.Random(42)
    use dmat1 = worker.Malloc(Array.init (m * n) (fun _ -> rng.NextDouble() |> float32))
    use dmat2 = worker.Malloc(Array.init (m * n) (fun _ -> rng.NextDouble() |> float32))
    use dmatr = worker.Malloc<float32>(m * n)
    worker.Eval <| fun _ ->
        let timer = System.Diagnostics.Stopwatch.StartNew()
        for i = 1 to iters do
            cublas.Sgeam(cublasOperation_t.CUBLAS_OP_N, cublasOperation_t.CUBLAS_OP_N, m, n, 2.0f, dmat1.Ptr, m, 2.0f, dmat2.Ptr, m, dmatr.Ptr, m)
        if sync then worker.Synchronize()
        timer.Stop()
        printfn "%f ms / %d %s (cublas + worker.eval)" timer.Elapsed.TotalMilliseconds iters (if sync then "sync" else "nosync")
let test worker m n sync iters =
    test6 worker m n sync iters
    test7 worker m n sync iters
    test1 worker m n sync iters
    test2 worker m n sync iters
    test3 worker m n sync iters
    test4 worker m n sync iters
    test5 worker m n sync iters
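// testReduce1/testReduce2: reduction via Unbound's DeviceReduceModule; the module
// is created and JIT-compiled once, and the reducer instance is reused across iterations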
let testReduce1 (worker:Worker) n iters =
    let rng = System.Random(42)
    use input = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use reduceModule = new DeviceReduceModule<float32>(GPUModuleTarget.Worker(worker), <@ (+) @>)
    // JIT compile and load GPU code for this module
    reduceModule.GPUForceLoad()
    // create a reducer which will allocate temp memory for maxNum=n
    let reduce = reduceModule.Create(n)
    let timer = System.Diagnostics.Stopwatch.StartNew()
    for i = 1 to 10000 do
        reduce.Reduce(input.Ptr, n) |> ignore
    timer.Stop()
    printfn "%f ms / %d (pre-load gpu code)" timer.Elapsed.TotalMilliseconds iters
let testReduce2 (worker:Worker) n iters =
    let rng = System.Random(42)
    use input = worker.Malloc(Array.init n (fun _ -> rng.NextDouble() |> float32))
    use reduceModule = new DeviceReduceModule<float32>(GPUModuleTarget.Worker(worker), <@ (+) @>)
    // JIT compile and load GPU code for this module
    reduceModule.GPUForceLoad()
    // create a reducer which will allocate temp memory for maxNum=n
    let reduce = reduceModule.Create(n)
    worker.Eval <| fun _ ->
        let timer = System.Diagnostics.Stopwatch.StartNew()
        for i = 1 to 10000 do
            reduce.Reduce(input.Ptr, n) |> ignore
        timer.Stop()
        printfn "%f ms / %d (pre-load gpu code and avoid thread switching)" timer.Elapsed.TotalMilliseconds iters
let testReduce worker n iters =
    testReduce1 worker n iters
    testReduce2 worker n iters

let workerDefault = Worker.Default
let workerNoThread = Worker.CreateOnCurrentThread(Device.Default)
Also, you may notice that I ran this test twice. The first time, the test without a pre-loaded module took 304 ms, but the second time the same test took only 29 ms. The reason is that we use LLVM P/Invoke to compile the kernel, and those P/Invoke functions are lazy, so some initialization happens on first use; after that it becomes faster.
Now we synchronize the worker, which means we actually measure the real kernel execution time, and now the numbers are similar. The kernel I created here is quite simple, but it operates on both matrices A and B:
> test workerDefault 10000 10000 true 100;;
843.695000 ms / 100 sync (cublas)
841.452400 ms / 100 sync (cublas + worker.eval)
919.244900 ms / 100 sync (no pre-load module)
912.348000 ms / 100 sync (pre-loaded module)
908.909000 ms / 100 sync (pre-loaded module + worker.eval)
914.834100 ms / 100 sync (template usage)
914.170100 ms / 100 sync (template usage + worker.Eval)
Now, if we test them on a thread-less worker, they are a bit faster because there is no thread switching:
> test workerNoThread 10000 10000 true 100;;
842.132100 ms / 100 sync (cublas)
841.627200 ms / 100 sync (cublas + worker.eval)
918.007800 ms / 100 sync (no pre-load module)
908.575900 ms / 100 sync (pre-loaded module)
908.770100 ms / 100 sync (pre-loaded module + worker.eval)
913.405300 ms / 100 sync (template usage)
913.942600 ms / 100 sync (template usage + worker.Eval)
And here is the test for the reduce:
> testReduce workerDefault 10000000 100;;
7691.335300 ms / 100 (pre-load gpu code)
6448.782500 ms / 100 (pre-load gpu code and avoid thread switching)
val it : unit = ()
> testReduce workerNoThread 10000000 100;;
6467.105300 ms / 100 (pre-load gpu code)
6426.296900 ms / 100 (pre-load gpu code and avoid thread switching)
val it : unit = ()
Please note that this reduce test performs one memory copy (memcpyDtoH) per reduce, to copy the result from device to host. This memory copy API call automatically synchronizes the worker, because the value would be meaningless if the kernel had not finished. So if you want to compare the performance against C code, you should also copy the result scalar from device to host there. Although it is just a single CUDA API call, if you do many iterations (100 in this example) it accumulates some time.
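In other words, every iteration of the reduce timing loop already includes that device-to-host copy of a single float32. The step breakdown below is an informal reading of the behaviour described above, not a quote of the library internals:

// what each call in the timing loop amounts to, conceptually:
//   1. launch the reduction kernel(s) on the worker          (asynchronous)
//   2. memcpyDtoH of the single float32 result               (synchronizes the worker)
let total : float32 = reduce.Reduce(input.Ptr, n)   // the reduced value, already on the host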
Hope this answers your question. By the way, the current Alea GPU uses the NVIDIA LLVM compiler from CUDA 6.0. We will publish a new version that upgrades it to CUDA 7.5 once CUDA 7.5 is released; from what I have observed so far, that upgrade improves the performance of the generated kernels.

That is a great answer. I actually knew that CUDA kernel launches are asynchronous by default in C++, but I assumed it was different in Alea. That is because I did not realize that asynchronous kernel execution is a bit like lazy evaluation in functional languages; I thought it was something else entirely. Thanks.