Warning: file_get_contents(/data/phpspider/zhask/data//catemap/1/typo3/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ CUDA - 3D block & 网格维度混乱-另一个_C++_Cuda - Fatal编程技术网

C++ CUDA - 3D block & 网格维度混乱-另一个

C++ Cuda-3D block&;网格维度混乱-另一个,c++,cuda,C++,Cuda,在下面的简单示例中,我使用cudamaloc3d在设备上分配内存,并将3D数据的每个体素增加一个,只要我使用对称3D体积,效果就很好 主机代码如下所示: int main(void) { typedef float PixelType; // Set up test data dim3 image_dimensions = dim3(32, 32, 32); size_t num_elements = image_dimensions.x * image_di

在下面的简单示例中,我使用
cudaMalloc3D
在设备上分配内存,并将3D数据的每个体素的值加一。只要我使用各维度大小相同的对称3D体数据,效果就很好

主机代码如下所示:

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_call_succeeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Round-trips a 3D float volume through pitched device memory:
// fills the host buffer with 0, 1, 2, ..., uploads it, launches
// extract_patches_from_image_data over it, downloads the result and prints
// every element. Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
    typedef float PixelType;

    // Set up test data
    dim3  image_dimensions = dim3(32, 32, 32);
    size_t num_elements = (size_t)image_dimensions.x * image_dimensions.y * image_dimensions.z;
    PixelType *image_data = new PixelType[num_elements];
    for(size_t i = 0; i < num_elements; ++i)
    {
        image_data[i] = PixelType(i);
    }

    // Allocate 3D memory on the device. The extent width is in bytes, height and depth in elements.
    cudaExtent volumeSizeBytes = make_cudaExtent(sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
    cudaPitchedPtr devicePitchedPointer = {0};
    bool ok = cuda_call_succeeded(cudaMalloc3D(&devicePitchedPointer, volumeSizeBytes), "cudaMalloc3D");

    // cudaMemset3D fills bytes (like memset), so it cannot set floats to 1.0f.
    // Zero the allocation instead; the real data is uploaded right below.
    ok = ok && cuda_call_succeeded(cudaMemset3D(devicePitchedPointer, 0, volumeSizeBytes), "cudaMemset3D");

    // Host-side view of the tightly packed buffer. The last two arguments of
    // make_cudaPitchedPtr are the logical width and height (x, y), not (y, z);
    // passing (y, z) only happens to work for cubic volumes.
    cudaPitchedPtr hostPitchedPointer = make_cudaPitchedPtr((void *)image_data, sizeof(PixelType) * image_dimensions.x, image_dimensions.x, image_dimensions.y);

    // Copy image data from the host to the device
    if(ok)
    {
        cudaMemcpy3DParms copy_params_host_to_device = {0};
        copy_params_host_to_device.srcPtr = hostPitchedPointer;
        copy_params_host_to_device.dstPtr = devicePitchedPointer;
        copy_params_host_to_device.extent = volumeSizeBytes;
        copy_params_host_to_device.kind   = cudaMemcpyHostToDevice;
        ok = cuda_call_succeeded(cudaMemcpy3D(&copy_params_host_to_device), "cudaMemcpy3D (host to device)");
    }

    // Kernel Launch Configuration: ceil-divide each dimension so the grid covers the whole volume
    if(ok)
    {
        dim3 threads_per_block = dim3(8, 8, 8);
        dim3 blocks_per_grid = dim3((image_dimensions.x + threads_per_block.x - 1) / threads_per_block.x,
                                    (image_dimensions.y + threads_per_block.y - 1) / threads_per_block.y,
                                    (image_dimensions.z + threads_per_block.z - 1) / threads_per_block.z);
        extract_patches_from_image_data<<<blocks_per_grid, threads_per_block>>>(devicePitchedPointer, image_dimensions);

        // Launch-configuration errors show up via cudaGetLastError();
        // faults inside the kernel show up at the next synchronizing call.
        ok = cuda_call_succeeded(cudaGetLastError(), "kernel launch");
        ok = ok && cuda_call_succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    // Copy image data back from the device to the host
    if(ok)
    {
        cudaMemcpy3DParms copy_params_device_to_host = {0};
        copy_params_device_to_host.srcPtr = devicePitchedPointer;
        copy_params_device_to_host.dstPtr = hostPitchedPointer;
        copy_params_device_to_host.extent = volumeSizeBytes;
        copy_params_device_to_host.kind   = cudaMemcpyDeviceToHost;
        ok = cuda_call_succeeded(cudaMemcpy3D(&copy_params_device_to_host), "cudaMemcpy3D (device to host)");
    }

    // Check image data
    if(ok)
    {
        for(size_t i = 0; i < num_elements; ++i)
        {
            std::cout << "Element: " << i << " - " << image_data[i] << std::endl;
        }
    }

    // Free Memory (cudaFree on a null pointer is a no-op, so this is safe after a failed allocation)
    cudaFree(devicePitchedPointer.ptr);

    delete [] image_data;

    return ok ? 0 : 1;
}
int main(void)
{
    typedef float PixelType;
    // Set up test data
    dim3 image_dimensions = dim3(32, 32, 32);
    size_t num_elements = image_dimensions.x * image_dimensions.y * image_dimensions.z;
    PixelType *image_data = new float[num_elements];
    for(int i = 0; i < num_elements; ++i)
    ...

我建议您在 CUDA 代码出现问题的任何时候都进行完善的 CUDA 错误检查,尽管在本例中这并不能解决问题。
  • 您正在将一个
    float
    传递给
    cudaMemset3D
    。如果您打算将每个 float 元素都设置为该值,这是行不通的。cudaMemset3D 的工作方式与主机端的
    memset
    函数类似:它接受一个
    unsigned char
    值,并按字节设置
    unsigned char
    数据。因此无法用这种方法把
    float
    值正确初始化为 1.0f。不过这并不是问题的症结所在。
  • 您没有正确使用
    make_cudaPitchedPtr
    函数,请查阅其文档。最后两个参数应分别为
    x
    和
    y
    维度,而不是
    y
    和
    z
    。您的代码中有两处这样的调用。
  • 通过修改
    make_cudaPitchedPtr

    // Adds 1.0f to every voxel of a pitched 3D volume, one thread per voxel.
    //
    // devicePitchedPointer: base pointer and row pitch (in bytes) of the volume.
    // image_dimensions:     volume size in elements along x, y and z.
    //
    // Launch with a 3D grid that covers image_dimensions; threads that fall
    // outside the volume return without touching memory.
    __global__ void extract_patches_from_image_data(cudaPitchedPtr devicePitchedPointer, dim3 image_dimensions)
    {
        // Global voxel coordinates of this thread
        int col   = blockIdx.x * blockDim.x + threadIdx.x;
        int row   = blockIdx.y * blockDim.y + threadIdx.y;
        int slice = blockIdx.z * blockDim.z + threadIdx.z;

        // Guard: the grid is rounded up, so some threads lie outside the volume
        if(!(slice < image_dimensions.z) || !(row < image_dimensions.y) || !(col < image_dimensions.x))
        {
            return;
        }

        // Rows are `pitch` bytes apart and slices are `pitch * height` bytes apart,
        // so the address arithmetic is done on a byte pointer.
        char   *volume_base = (char *)devicePitchedPointer.ptr;
        size_t  row_pitch   = devicePitchedPointer.pitch;
        size_t  slice_pitch = row_pitch * image_dimensions.y;

        char      *slice_start = volume_base + slice * slice_pitch;
        PixelType *voxel_row   = (PixelType *)(slice_start + row * row_pitch);

        voxel_row[col] += 1.0f;

        // Placeholder: neighbor values would be gathered here
    }