Python CUDA Histogram2d 不工作
由于CUDA似乎缺少一个像样的2D直方图实现（我没能找到现成的——欢迎提供指引），我正在尝试用 PyCUDA 自己实现一个。下面是目标直方图的样子（用 Numpy 生成），以下是我到目前为止写出的代码（标签：python, numpy, cuda, pycuda, histogram2d）：
# CUDA kernel source template for a per-block weighted 2D histogram.
# The {placeholders} ({xres}, {yres}, {num_chans}, {length}, {xmin}, {xptp},
# {ymin}, {yptp}) are substituted by str.format(**args) below, so literal
# braces in the CUDA code must be doubled ({{ }}).
# Each block writes into its own (yres, xres, num_chans) region of `out`
# (offset {xres}*{yres}*{num_chans}*blockIdx.x) so atomicAdds from different
# blocks never contend; the host is expected to sum over blocks afterwards.
code = '''
__global__ void histogram2d(const float *in_x, const float *in_y, const float *in_w, float *out) {{
int start = blockIdx.x * blockDim.x + threadIdx.x;
float *block_out = &out[{xres} * {yres} * {num_chans} * blockIdx.x];
for(int i = 0; i < {length}; i++) {{
float x = in_x[start + i];
float y = in_y[start + i];
int w_idx = (start + i) * {num_chans};
int xbin = (int) (((x - {xmin}) / {xptp}) * {xres});
int ybin = (int) (((y - {ymin}) / {yptp}) * {yres});
if (0 <= xbin && xbin < {xres} && 0 <= ybin && ybin < {yres}) {{
for(int c = 0; c < {num_chans}; c++) {{
atomicAdd(&block_out[(ybin * {xres} + xbin) * {num_chans} + c], in_w[w_idx + c]);
}}
}}
}}
}}
'''.format(**args)
# NOTE(review): each thread reads in_x[start + i] for i in [0, {length}), so
# consecutive threads overlap on {length}-1 samples and there is no guard that
# start + i stays inside the input arrays — confirm this indexing is intended.
# NOTE(review): {xmin}/{xptp}/... render as double literals (e.g. 20.0),
# promoting the bin arithmetic to double inside a float kernel — consider
# formatting them as float32 literals (e.g. 20.0f).
------
/*
 * Weighted 2D histogram kernel (rendered from the Python template above with
 * xres = yres = 50, num_chans = 4, length = 100, range [-10, 10) on both axes).
 *
 * Layout: `out` holds one private (yres, xres, num_chans) = 50*50*4 float
 * region per block, so atomicAdds from different blocks never contend; the
 * host sums over the block axis to obtain the final histogram.
 *
 * Fix: the bin arithmetic used double literals (-10.0, 20.0), which silently
 * promoted every subtraction/division to double precision inside a float
 * kernel; float literals keep the math in float32, matching the float inputs.
 *
 * NOTE(review): each thread reads in_x[start + i] for i in [0, 100), so
 * consecutive threads overlap on 99 samples and nothing guards start + i
 * against the end of the input arrays — confirm against the host launch.
 */
__global__ void histogram2d(const float *in_x, const float *in_y, const float *in_w, float *out) {
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    // This block's private output region: regions are 50*50*4 floats apart.
    float *block_out = &out[50 * 50 * 4 * blockIdx.x];
    for (int i = 0; i < 100; i++) {
        float x = in_x[start + i];
        float y = in_y[start + i];
        int w_idx = (start + i) * 4;  // 4 weight channels per sample
        // Map [-10, 10) onto bins [0, 50); x + 10.0f == x - (-10.0f).
        int xbin = (int) (((x + 10.0f) / 20.0f) * 50);
        int ybin = (int) (((y + 10.0f) / 20.0f) * 50);
        // Samples outside the histogram range are dropped.
        if (0 <= xbin && xbin < 50 && 0 <= ybin && ybin < 50) {
            for (int c = 0; c < 4; c++) {
                // Atomic: threads within the block share block_out.
                atomicAdd(&block_out[(ybin * 50 + xbin) * 4 + c], in_w[w_idx + c]);
            }
        }
    }
}
code = '''
__global__ void histogram2d(const float *in_x, const float *in_y, const float *in_w, float *out) {{
int start = blockIdx.x * blockDim.x + threadIdx.x;
float *block_out = &out[{xres} * {yres} * {num_chans} * blockIdx.x];
for(int i = 0; i < {length}; i++) {{
float x = in_x[start + i];
float y = in_y[start + i];
int w_idx = (start + i) * {num_chans};
int xbin = (int) (((x - {xmin}) / {xptp}) * {xres});
int ybin = (int) (((y - {ymin}) / {yptp}) * {yres});
if (0 <= xbin && xbin < {xres} && 0 <= ybin && ybin < {yres}) {{
for(int c = 0; c < {num_chans}; c++) {{
atomicAdd(&block_out[(ybin * {xres} + xbin) * {num_chans} + c], in_w[w_idx + c]);
}}
}}
}}
}}
'''.format(**args)
问题原因：为CUDA部分的输出分配的数组使用了 Numpy 默认的 float64 而不是 float32，因此占用的内存是预期的两倍。下面是修正后的直方图输出：
我仍然非常感谢能帮助解释为什么这两个直方图彼此差异如此之大的评论或回答。对于 n 个样本，in_x 和 in_y 的形状为 (n,)，in_w 的形状为 (n, 4)，out 的形状为 (num_blocks, yres, xres, 4)。我的目标是为每个 CUDA 块分配一块独立的 (yres, xres, 4) 写入区域（这样原子加法就不会相互阻塞），然后在轴 0 上求和得到最终的直方图。
def slow_hist(in_x, in_y, in_w, out, blockx, blockdimx, threadx, hist_args=None):
    """CPU reference for the work of one CUDA thread of the histogram2d kernel.

    Replays the thread at (blockx, threadx) with block width blockdimx,
    accumulating weighted counts into the flat ``out`` array at index
    ``(ybin * xres + xbin) * num_chans + c``. Samples falling outside the
    histogram range are skipped, matching the kernel's bounds check.

    Args:
        in_x, in_y: flat sample coordinate sequences.
        in_w: flat weight sequence, ``num_chans`` weights per sample.
        out: flat mutable output histogram, accumulated in place.
        blockx, blockdimx, threadx: CUDA block index, block size, thread index.
        hist_args: dict with keys ``xres``, ``yres``, ``num_chans``, ``length``,
            ``xmin``, ``xptp``, ``ymin``, ``yptp``. Defaults to the module-level
            ``args`` for backward compatibility.
    """
    a = hist_args if hist_args is not None else args
    start = blockx * blockdimx + threadx
    # Fix: the original wrote `a['xres'] * a['yres'], a['num_chans'] * blockx`,
    # which built a 2-tuple instead of the per-block offset product.
    # NOTE(review): this mirrors `block_out` in the CUDA kernel but is never
    # applied to `out` below — confirm callers pass in the block's own slice.
    block_out_addr = a['xres'] * a['yres'] * a['num_chans'] * blockx
    for i in range(a['length']):
        x = in_x[start + i]
        y = in_y[start + i]
        w_idx = (start + i) * a['num_chans']
        xbin = int(((x - a['xmin']) / a['xptp']) * a['xres'])
        ybin = int(((y - a['ymin']) / a['yptp']) * a['yres'])
        if 0 <= xbin < a['xres'] and 0 <= ybin < a['yres']:
            for c in range(a['num_chans']):
                out[(ybin * a['xres'] + xbin) * a['num_chans'] + c] += in_w[w_idx + c]