将数组传递到PyCuda卷积内核产生意外行为

将数组传递到PyCuda卷积内核产生意外行为,cuda,pycuda,Cuda,Pycuda,我正在尝试使用PyCuda将高斯滤波器与图像进行卷积。我从PyCuda文档中获取了一些代码,从在线页面获取了一个Cuda卷积内核。由于某种原因,生成的图像显示为完全黑色。我相信图像数组和高斯滤波器数组被错误地传入了-当我尝试使用printf从内核中打印值时,图像的值只是“0.00…”,而滤波器的值是非常大的数字,如“125529009160192000.000000” 我尝试过将数组展平并显式地设置为C顺序,但这似乎没有帮助。我也尝试过使用PyCuda GPUarrays,但没有成功 谢谢你看

我正在尝试使用PyCuda将高斯滤波器与图像进行卷积。我从PyCuda文档中获取了一些代码,从在线页面获取了一个Cuda卷积内核。由于某种原因,生成的图像显示为完全黑色。我相信图像数组和高斯滤波器数组被错误地传入了-当我尝试使用printf从内核中打印值时,图像的值只是“0.00…”,而滤波器的值是非常大的数字,如“125529009160192000.000000”

我尝试过将数组展平并显式地设置为C顺序,但这似乎没有帮助。我也尝试过使用PyCuda GPUarrays,但没有成功

谢谢你看

这是我的密码:

import pycuda.driver as cuda
import pycuda.autoinit
import math
from pycuda.compiler import SourceModule
from timeit import default_timer as timer
from PIL import Image
import numpy as np

def make_k(sig):
    s = 65
    out = np.zeros((s,s))
    for x in range(s):
        for y in range(s):
            X = x-(s-1)/2
            Y = y-(s-1)/2
            gauss = 1/(2*np.pi*sig**2) * np.exp(-(X**2 + Y**2)/(2*sig**2))
            out[x,y] = gauss
    a = np.sum(out)
    kernel = out/a
    return kernel

def replication_pad(img, W, H, S, paddedW, paddedH):
    output = np.zeros((paddedH, paddedW))
    output[:S, S:W+S] = img[0:1,:]
    output[S:H+S, :S] = img[:, 0:1]
    output[H+S:, S:W+S] = img[-1:,:]
    output[S:H+S, W+S:] = img[:, -1:]

    output[:S, :S] = img[0, 0]
    output[:S, paddedW-S:] = img[0, -1]
    output[paddedH-S:, :S] = img[-1, 0]
    output[paddedH-S:, paddedW-S:] = img[-1, -1]

    output[S:H+S, S:W+S] = img
    return output


#d_f is the padded image
#d_g is the filter
#d_h is the filtering result

mod = SourceModule("""
__global__ void convolution( const float *d_f, const unsigned int paddedW, const unsigned int paddedH,
                                      const float *d_g, const int S,
                                      float *d_h, const unsigned int W, const unsigned int H )
{   
    // Set the padding size and filter size
    unsigned int paddingSize = S;
    unsigned int filterSize = 2 * S + 1;

    // Set the pixel coordinate
    const unsigned int j = blockIdx.x * blockDim.x + threadIdx.x + paddingSize;
    const unsigned int i = blockIdx.y * blockDim.y + threadIdx.y + paddingSize;

    // Print for debugging (on the first thread)
    if( i==paddingSize && j==paddingSize) {
        //printf("%lf", d_g[50]);
        printf("%lf", d_f[100400]);
    }

    // The multiply-add operation for the pixel coordinate ( j, i )
    if( j >= paddingSize && j < paddedW - paddingSize && i >= paddingSize && i < paddedH - paddingSize ) {
        unsigned int oPixelPos = ( i - paddingSize ) * W + ( j - paddingSize );
        d_h[oPixelPos] = 0.0;
        for( int k = -S; k <=S; k++ ) {
            for( int l = -S; l <= S; l++ ) {
                unsigned int iPixelPos = ( i + k ) * paddedW + ( j + l );
                unsigned int coefPos = ( k + S ) * filterSize + ( l + S );
                d_h[oPixelPos] += d_f[iPixelPos] * d_g[coefPos];
            }
        }
    }

}
""")

image = Image.open('spooky.jpg').convert('L')
img_full = np.asarray(image, dtype='float')
img = img_full[:1080,:1920] # 1080p resolution
W = 1920
H = 1080

S = 32
paddedW = W + 2*S
paddedH = H + 2*S

img_padded = replication_pad(img, W, H, S, paddedW, paddedH)

kernel = make_k(10)
ker_cont = np.ascontiguousarray(kernel, dtype="float")
ker_gpu = cuda.mem_alloc(ker_cont.nbytes)
cuda.memcpy_htod(ker_gpu, ker_cont)

img_cont = np.ascontiguousarray(img_padded)
img_gpu = cuda.mem_alloc(img_cont.nbytes)
cuda.memcpy_htod(img_gpu, img_cont)

img_og = np.ascontiguousarray(img)
result_gpu = cuda.mem_alloc(img_og.nbytes)

blockW = 32
blockH = 32
gridW = math.ceil(W/blockW)
gridH = math.ceil(H/blockH)

func = mod.get_function("convolution")
func(img_gpu, np.int_(paddedW), np.int_(paddedH), ker_gpu, np.int_(S), result_gpu, np.int_(W), np.int_(H), block = (blockW, blockH, 1), grid=(gridW, gridH))

host_output = np.empty_like(img_og)
cuda.memcpy_dtoh(host_output, result_gpu)

Image.fromarray(host_output).show()

下面是我使用的图像:

我需要将输入映像和输入内核的数据类型从float64更改为float32。还需要参考float32数组为适当的nbytes分配输出数组。这看起来像:

ker_cont = np.float32(ker_cont)

img_cont = np.float32(img_cont)

img_og = np.float32(img_og)
result_gpu = cuda.mem_alloc(img_og.nbytes)

你至少有两个问题。1.提示:尝试将printimg_cont.dtype放在代码中的适当位置。2.你对np.int的使用。。。对于32位整数,内核参数可能不正确。np.int_uu至少在linux平台上是64位类型。试试np.int32。。。另外,不需要使用%lf作为printf格式说明符。只需使用%f,无论是打印浮点数还是双字符。@RobertCrovella非常感谢您!我不认为我自己会抓到这个。我不知道数据类型有这么重要。我是新手,但我确实认为PyCuda以强制类型正确性而闻名。我很惊讶我成功地在没有错误信息的情况下解决了这个错误。无论如何,再次非常感谢你帮助我。