Parallel statistics along a dimension of Julia arrays


What is the best practice in Julia for computing statistics of an array along a given dimension in parallel? I have many large arrays and am looking for something like mean(array, 1), but parallel (and returning a quantile instead of a mean). I cannot process whole arrays in parallel because I do not have enough RAM.
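
To make the target operation concrete, here is a minimal serial sketch on a toy array (the 0.2 quantile level matches the benchmark below; the variable names are just illustrative):

toy = rand(100, 4, 4, 2)
q = mapslices(f -> quantile(f, 0.2), toy, 1)   # size (1, 4, 4, 2): one quantile per first-dimension column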

I wrote a crude benchmark that also illustrates the approaches I have tried so far: mapslices, and @parallel loops over SharedArrays and DArrays (see below). Parallelization does not seem to speed things up much: adding 7 workers yields a 1.8x speedup with SharedArrays and a 2.3x speedup with DArrays. I am new to Julia. Is this to be expected? Am I doing something wrong?

Thanks for your help. Below is the output of my script, followed by the script itself.


Script output:

WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
mapslices on Array
 38.152894 seconds (218.71 M allocations: 14.435 GB, 3.33% gc time)
 37.985577 seconds (218.10 M allocations: 14.406 GB, 3.23% gc time)
loop over Array using CartesianRange
  9.161392 seconds (25.27 M allocations: 9.005 GB, 4.41% gc time)
  9.118627 seconds (25.17 M allocations: 9.000 GB, 4.40% gc time)
@parallel loop over SharedArray
  9.092477 seconds (322.23 k allocations: 14.190 MB, 0.05% gc time)
  4.945648 seconds (18.90 k allocations: 1.405 MB)
@parallel loop over DArray
  5.615429 seconds (496.26 k allocations: 21.535 MB, 0.08% gc time)
  3.932704 seconds (15.63 k allocations: 1.178 MB)
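
(The speedups quoted above are measured against the second run of the serial CartesianRange loop: 9.12 s versus 4.95 s for the SharedArray loop, about 1.8x, and 9.12 s versus 3.93 s for the DArray loop, about 2.3x.)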

Script:

# add one worker per remaining CPU core
procs_added = addprocs(CPU_CORES - 1)
@everywhere using DistributedArrays

function benchmark_array(dtype, dims)
    data = rand(dtype, dims...)

    println("mapslices on Array")
    @time out = mapslices(f->quantile(f, 0.2), data, 1)
    @time out = mapslices(f->quantile(f, 0.2), data, 1)

    println("loop over Array using CartesianRange")
    out = Array(Float32, size(data)[2:end])
    @time loop_over_array!(out, data)
    @time loop_over_array!(out, data)
end

# serial baseline: 0.2 quantile over the first dimension (assumes 4-D data)
function loop_over_array!(out::Array, data::Array)
    for I in CartesianRange(size(out))
        # explicit indexing, since [:, I...] didn't work
        out[I] = quantile(data[:, I[1], I[2], I[3]], 0.2)
    end
end

function benchmark_shared_array(dtype, dims)
    # note: unlike the Array and DArray benchmarks, data is not filled with
    # random values here
    data = SharedArray(dtype, (dims...), pids=workers())

    println("@parallel loop over SharedArray")
    out = SharedArray(Float32, size(data)[2:end], pids=workers())
    @time parallel_loop_over_shared_array!(out, data)
    @time parallel_loop_over_shared_array!(out, data)
end

function parallel_loop_over_shared_array!(out::SharedArray, data::SharedArray)
    # @parallel for I in CartesianRange(size(out)) does not seem to work
    # (so only the outermost dimension is split across workers)
    @sync @parallel for i in 1:size(out)[end]
        for I in CartesianRange(size(out)[1:end-1])
            out[I[1], I[2], i] = quantile(data[:, I[1], I[2], i], 0.2)
        end
    end
end

function benchmark_distributed_array(dtype, dims)
    # distribute data along its last dimension, one chunk per worker
    data = drand(dtype, (dims...), workers(),
        [i == length(dims) ? nworkers() : 1 for i in 1:length(dims)])

    println("@parallel loop over DArray")
    # distribute out the same way so its local parts line up with data's
    out = dzeros(Float32, size(data)[2:end], workers(),
        [i == ndims(data) ? nworkers() : 1 for i in 2:ndims(data)])
    @time parallel_loop_over_distributed_array!(out, data)
    @time parallel_loop_over_distributed_array!(out, data)
end

function parallel_loop_over_distributed_array!(out::DArray, data::DArray)
    # each worker computes the quantiles for its own local chunk
    @sync for pid in workers()
        @spawnat pid begin
            inchunk = localpart(data)
            outchunk = localpart(out)
            for I in CartesianRange(size(outchunk))
                outchunk[I] = quantile(inchunk[:, I[1], I[2], I[3]], 0.2)
            end
        end
    end
end

function benchmark_all(dtype, dims)
    benchmark_array(dtype, dims)
    benchmark_shared_array(dtype, dims)
    benchmark_distributed_array(dtype, dims)
end

const dtype = Int
const dims = [128,256,256,64]    # 2^29 elements, i.e. 4 GiB as 64-bit Ints
benchmark_all(dtype, dims)
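
For reference, here is a rough, unbenchmarked sketch of another way the same work could be split: pmap over the last dimension of the SharedArray, with each call returning one slab of quantiles (quantile_slabs is just an illustrative name, not part of the benchmark above):

# Rough sketch only: hand one last-dimension slab at a time to the workers via
# pmap; data is assumed to be a SharedArray visible to all workers, as above.
function quantile_slabs(data::SharedArray, p)
    function slab(i)
        s = Array(Float64, size(data, 2), size(data, 3))
        for k in 1:size(data, 3), j in 1:size(data, 2)
            s[j, k] = quantile(data[:, j, k, i], p)
        end
        s
    end
    # reassemble the per-slab results into a single 3-D array on the master
    cat(3, pmap(slab, 1:size(data, 4))...)
end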