Parallel processing CUDA-从3D阵列中提取图层_Parallel Processing_Cuda_Gpu_Gpgpu_Pycuda

Parallel processing CUDA：从3D数组中提取图层

parallel-processing cuda

Parallel processing CUDA-从3D阵列中提取图层,parallel-processing,cuda,gpu,gpgpu,pycuda,Parallel Processing,Cuda,Gpu,Gpgpu,Pycuda,我有一个3D矩阵，其中x-y平面表示图像，z平面表示图像层。问题是，当我尝试使用idz提取第一层或其他层时，我没有得到预期的结果。看起来，一旦在CUDA中，数组的x、y或z索引与我在pycuda中预期的不同。我通过下面的结果数组看到了这一点。下面是这个小例子的一步一步的过程，我使用通用的整数来表示我的图像，以保存上传的图像和整个代码！这里我导入库并定义图像大小和图层 import pycuda.driver as cuda import pycuda.autoinit from pycud

我有一个3D矩阵，其中x-y平面表示图像，z轴表示图像层。问题是，当我尝试用idz提取第一层（或其他层）时，得不到预期的结果。看起来数组一旦进入CUDA，其x、y、z索引的顺序就与我在pycuda（numpy）中预期的不同，这一点可以从下面的结果数组看出。下面是这个小例子的逐步过程；为了省去上传图像和完整代码，我用普通整数来代表图像。首先，导入库并定义图像大小和层数：

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
from pycuda.gpuarray import to_gpu

row = 10     # size of numpy axis 0 of arrayA (and of arrayB)
column = 10  # size of numpy axis 1 of arrayA (and of arrayB)
depth = 5    # size of numpy axis 2 of arrayA: the number of image layers

然后我定义我的输入3D数组和输出2D数组

#--==== Input 3D Array ====---
# Shape is (row, column, depth): the layer index is the LAST numpy axis.
arrayA = numpy.full((row, column, depth), 0)

#populate each layer with fixed values
# Layer i, i.e. arrayA[:, :, i], is filled with the constant i + 1.
for i in range(depth):
    arrayA[:,:,i] = i + 1

# NOTE(review): the data is converted to 16-bit unsigned integers here, while
# the getLayer kernel of this example declares both arguments as `int *`.
arrayA = arrayA.astype(numpy.uint16)
# Allocate device memory of the same byte size and upload the host array.
arrayA_gpu = cuda.mem_alloc(arrayA.nbytes)
cuda.memcpy_htod(arrayA_gpu, arrayA)
# Host buffer with arrayA's shape and dtype; not used again in the code shown.
arrayA_Answer = numpy.empty_like(arrayA)

#--==== Output 2D array container ====---
# Zero-filled (row, column) uint16 array that is meant to receive one layer.
arrayB = numpy.zeros([row, column], dtype = numpy.uint16)
arrayB_gpu = cuda.mem_alloc(arrayB.nbytes)
cuda.memcpy_htod(arrayB_gpu, arrayB)
# Host buffer intended to hold the result copied back from arrayB_gpu.
arrayB_Answer = numpy.empty_like(arrayB)

接下来，我在pycuda中定义CUDA内核和函数

# Compile the CUDA kernel. Each thread of getLayer copies one element from
# arrayA to arrayB; the linear offset uses idx (thread x) as the
# fastest-varying term and blockDim * gridDim as the array widths, with the
# layer index idz hard-coded to 0.
mod = SourceModule("""
    __global__ void getLayer(int *arrayA, int *arrayB)
    {
        int idx = threadIdx.x + (blockIdx.x * blockDim.x); // x coordinate (numpy axis 2) 
        int idy = threadIdx.y + (blockIdx.y * blockDim.y); // y coordinate (numpy axis 1)
        int idz = 0; //The first layer, this can set in range from 0-4 
        int x_width = (blockDim.x * gridDim.x); 
        int y_width = (blockDim.y * gridDim.y); 

        arrayB[idx + (x_width * idy)] = arrayA[idx + (x_width * idy) + (x_width * y_width) * idz];
    }
    """)

# Fetch the compiled kernel and launch it as a single block of
# row x column x 1 threads.
# NOTE(review): no memcpy_dtoh of arrayB_gpu appears in this excerpt;
# presumably the copy back into arrayB_Answer was omitted from the page.
func = mod.get_function("getLayer")
func(arrayA_gpu, arrayB_gpu, block=(row, column, 1), grid=(1,1))

使用标准的pycuda命令取回结果后，我得到的结果与预期不同：arrayA[:, :, 0] = 10x10矩阵，全部填充为1（正确值）

arrayB[:, :] = 10x10矩阵，填充了下面的错误值，而预期它应等于arrayA[:, :, 0]

如前所述，numpy 3D数组的存储顺序是：在内存中线性前进时，z（即第三个索引）是变化最快的索引；而您的代码假定第一个索引x是变化最快的索引。

由于您的内核已经为高效的合并加载/存储行为进行了组织，您可以通过在numpy中重新排序图像/层/切片的存储来解决这个问题。以下是一个成功的例子：

$ cat t10.py
from __future__ import print_function
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
from pycuda.gpuarray import to_gpu

row = 5
column = 10
depth = 10

#--==== Input 3D Array ====---
arrayA = numpy.full((row, column, depth), 0)
my_slice=numpy.int32(3)  # choose the layer
#populate each layer with fixed values
for i in range(row):
    arrayA[i,:,:] = i + 1

arrayA = arrayA.astype(numpy.int32)
arrayA_gpu = cuda.mem_alloc(arrayA.nbytes)
cuda.memcpy_htod(arrayA_gpu, arrayA)
arrayA_Answer = numpy.empty_like(arrayA)

#--==== Output 2D array container ====---
arrayB = numpy.zeros([column, depth], dtype = numpy.int32)
arrayB_gpu = cuda.mem_alloc(arrayB.nbytes)
cuda.memcpy_htod(arrayB_gpu, arrayB)
arrayB_Answer = numpy.empty_like(arrayB)

mod = SourceModule("""
    __global__ void getLayer(int *arrayA, int *arrayB, int slice)
    {
        int idx = threadIdx.x + (blockIdx.x * blockDim.x); // x coordinate (numpy axis 2)
        int idy = threadIdx.y + (blockIdx.y * blockDim.y); // y coordinate (numpy axis 1)
        int idz = slice; //The "layer"
        int x_width = (blockDim.x * gridDim.x);
        int y_width = (blockDim.y * gridDim.y);

        arrayB[idx + (x_width * idy)] = arrayA[idx + (x_width * idy) + (x_width * y_width) * idz];
    }
    """)

func = mod.get_function("getLayer")
func(arrayA_gpu, arrayB_gpu, my_slice, block=(depth, column, 1), grid=(1,1))
cuda.memcpy_dtoh(arrayB_Answer,arrayB_gpu)

print(arrayA[my_slice,:,:])

print(arrayB_Answer[:,:])
$ python t10.py
[[4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]]
[[4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]]
$

请注意，我还将您对uint16的使用更改为int32，以匹配内核类型int。

你好，Robert Crovella，感谢您的及时回复。正如我所怀疑、也经您确认的那样，这是一个存储顺序（重新排序）的问题。我已按下面的详细信息更新了程序，现在可以正常工作了。再次感谢！

# NOTE(review): presumably the unexpected result produced by the question's
# original layer-last script (each row cycles through the layer values 1..5);
# its placement on this page is ambiguous - confirm against the original post.
print(arrayB_Answer)
[[1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]
 [1 2 3 4 5 1 2 3 4 5]]

$ cat t10.py
from __future__ import print_function
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
from pycuda.gpuarray import to_gpu

row = 5
column = 10
depth = 10

#--==== Input 3D Array ====---
arrayA = numpy.full((row, column, depth), 0)
my_slice=numpy.int32(3)  # choose the layer
#populate each layer with fixed values
for i in range(row):
    arrayA[i,:,:] = i + 1

arrayA = arrayA.astype(numpy.int32)
arrayA_gpu = cuda.mem_alloc(arrayA.nbytes)
cuda.memcpy_htod(arrayA_gpu, arrayA)
arrayA_Answer = numpy.empty_like(arrayA)

#--==== Output 2D array container ====---
arrayB = numpy.zeros([column, depth], dtype = numpy.int32)
arrayB_gpu = cuda.mem_alloc(arrayB.nbytes)
cuda.memcpy_htod(arrayB_gpu, arrayB)
arrayB_Answer = numpy.empty_like(arrayB)

mod = SourceModule("""
    __global__ void getLayer(int *arrayA, int *arrayB, int slice)
    {
        int idx = threadIdx.x + (blockIdx.x * blockDim.x); // x coordinate (numpy axis 2)
        int idy = threadIdx.y + (blockIdx.y * blockDim.y); // y coordinate (numpy axis 1)
        int idz = slice; //The "layer"
        int x_width = (blockDim.x * gridDim.x);
        int y_width = (blockDim.y * gridDim.y);

        arrayB[idx + (x_width * idy)] = arrayA[idx + (x_width * idy) + (x_width * y_width) * idz];
    }
    """)

func = mod.get_function("getLayer")
func(arrayA_gpu, arrayB_gpu, my_slice, block=(depth, column, 1), grid=(1,1))
cuda.memcpy_dtoh(arrayB_Answer,arrayB_gpu)

print(arrayA[my_slice,:,:])

print(arrayB_Answer[:,:])
$ python t10.py
[[4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]]
[[4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]
 [4 4 4 4 4 4 4 4 4 4]]
$