Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/opencv/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
CUDA仅处理OpenCV 16位灰度矩阵中总列数的一半_Opencv_Cuda_16 Bit - Fatal编程技术网

CUDA仅处理OpenCV 16位灰度矩阵中总列数的一半

CUDA仅处理OpenCV 16位灰度矩阵中总列数的一半,opencv,cuda,16-bit,Opencv,Cuda,16 Bit,我正在制作一个初级CUDA程序,它基本上使用OpenCV对输入的灰度图像进行降采样。经测试,它在8位灰度图像上运行良好,但在16位灰度图像作为输入时,会给出一个带噪的降采样图像,图像的右半部分为空白。下面是我写的代码 提供了示例输入和输出图像 及 将图像加载到Mat中的My main.cpp代码: int main() { cv::Mat im1 = cv::imread("test.png", -1); std::string output_file = "resultou

我正在制作一个初级CUDA程序,它基本上使用OpenCV对输入的灰度图像进行降采样。经测试,它在8位灰度图像上运行良好,但在16位灰度图像作为输入时,会给出一个带噪的降采样图像,图像的右半部分为空白。下面是我写的代码

提供了示例输入和输出图像

我的 main.cpp 中将图像加载到 Mat 的代码:

// Entry point: loads "test.png" and runs the binning filter, which writes
// its result to "resultout.png".
// Returns 0 on success and 1 when the input image could not be loaded.
int main()
{
    // Flag -1 is OpenCV's IMREAD_UNCHANGED: the image is loaded as stored,
    // so a 16-bit greyscale PNG stays 16-bit instead of being converted.
    cv::Mat im1 = cv::imread("test.png", -1);

    // imread reports failure (missing file, unsupported format, ...) by
    // returning an empty Mat rather than throwing; bail out here instead of
    // handing a zero-sized image to the GPU code.
    if (im1.empty())
    {
        return 1;
    }

    std::string output_file = "resultout.png";
    binFilter(im1, output_file);

    return 0;
}
我的CUDA内核代码:

// Binning kernel: each thread produces one output pixel by averaging a
// binDim x binDim block of input pixels.
//
// Launch layout: the x/y block and thread indices are used as the output
// pixel coordinates, so the grid must cover outputWidth x outputHeight
// threads; surplus threads do nothing.
//
// Parameters:
//   input, output            - pixel buffers, indexed as unsigned char elements
//   binDim                   - side length of the square bin
//   outputWidth/outputHeight - output size in pixels
//   inputWstep/outputWstep   - row stride of each buffer, used here as a
//                              count of unsigned char elements
//   nChannels                - channel count; only the value 1 does any work
//
// NOTE(review): both buffers are addressed one byte per element. With
// 16-bit pixel data this would index bytes rather than pixels, which looks
// like the data-type mismatch the surrounding question describes.
__global__ void binCUDAKernel(unsigned char *input, unsigned char *output, int binDim, int outputWidth, int outputHeight, int inputWstep, int outputWstep, int nChannels)
    {
        // Output pixel coordinates handled by this thread
        int outXind = blockIdx.x * blockDim.x + threadIdx.x;
        int outYind = blockIdx.y * blockDim.y + threadIdx.y;
        if ((outXind < outputWidth) && (outYind < outputHeight)) // Only run threads in output image coordinate range
        {
            if (nChannels == 1) // Test only for greyscale images
            {
                // Calculate x & y index of input binned pixels corresponding to current output pixel
                int inXstart = outXind * binDim;
                int inYstart = outYind * binDim;

                // Perform binning on identified input pixels
                float sum = 0;
                for (int binY = inYstart; binY < (inYstart + binDim); binY++) {
                    for (int binX = inXstart; binX < (inXstart + binDim); binX++) {
                        // Flat index: row * stride + column, in unsigned char elements
                        int input_tid = binY * inputWstep + binX;
                        sum += input[input_tid];
                    }
                }

                // Establish output thread index in current output pixel index
                int output_tid = outYind * outputWstep + outXind;

                // Assign binned pixel value to output pixel.
                // The mean is cast to unsigned short and then narrowed again
                // when stored into the unsigned char output element.
                output[output_tid] = static_cast<unsigned short>(sum / (binDim*binDim));
            }
        }
    }
__global__ void binCUDAKernel(unsigned char *input, unsigned char *output, int binDim, int outputWidth, int outputHeight, int inputWstep, int outputWstep, int nChannels)
{
int outXind = blockIdx.x * blockDim.x + threadIdx.x;
int outYind = blockIdx.y * blockDim.y + threadIdx.y;
if ((outXind
我的CPU代码:

// Downsamples `input` by 2x2 binning on the GPU and writes the result to
// `output_file`.
//
// Steps: size the output, copy the input pixels to the device, launch
// binCUDAKernel with one thread per output pixel, copy the result back and
// save it with cv::imwrite.
//
// gpuErrchk is not defined in this excerpt; it wraps every CUDA runtime call.
//
// NOTE(review): the kernel is launched with unsigned char pointers and with
// input.step / output.step as strides, whatever input.depth() is. For pixels
// wider than one byte this looks like the mismatch the surrounding question
// describes.
void binFilter(const cv::Mat input, std::string output_file)
{
    // 2X2 binning
    int binDim = 2;

    // Create blank output image & calculate size of input and output
    // (integer division: a trailing odd row/column is dropped)
    cv::Size outsize(input.size().width / binDim, input.size().height / binDim);
    cv::Mat output(outsize, input.type());
    // Buffer sizes computed as row stride times number of rows
    const int inputBytes = input.step * input.rows;
    const int outputBytes = output.step * output.rows;

    // Allocate memory in device
    unsigned char *d_input, *d_output;
    gpuErrchk(cudaMalloc<unsigned char>(&d_input, inputBytes));
    gpuErrchk(cudaMalloc<unsigned char>(&d_output, outputBytes));

    // Copy input image to device
    gpuErrchk(cudaMemcpy(d_input, input.ptr(), inputBytes, cudaMemcpyHostToDevice));

    // Configure size of block and grid
    const dim3 block(16, 16);
    const dim3 grid((output.cols + block.x - 1) / block.x, (output.rows + block.y - 1) / block.y); // Additional block for rounding up

    // Execute kernel
    binCUDAKernel <<<grid, block>>> (d_input, d_output, binDim, output.cols, output.rows, input.step, output.step, input.channels());
    // Check the last CUDA error right after the launch
    gpuErrchk(cudaPeekAtLastError());

    // Wait for all threads to finish
    //gpuErrchk(cudaDeviceSynchronize());

    // Copy output image from device back to host (cudaMemcpy is a blocking instruction)
    gpuErrchk(cudaMemcpy(output.ptr(), d_output, outputBytes, cudaMemcpyDeviceToHost));

    // Free device memory
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));

    // Write image to specified output_file path
    cv::imwrite(output_file, output);
}
void binFilter(const cv::Mat input, std::string output_file)
{
// 2X2 装箱(binning)
int binDim = 2;
// 创建空白输出图像并计算输入和输出的大小
cv::Size outsize(input.size().width / binDim, input.size().height / binDim);
cv::Mat output(outsize, input.type());
const int inputBytes = input.step * input.rows;
const int outputBytes = output.step * output.rows;
// 在设备中分配内存
unsigned char *d_input, *d_output;
gpuErrchk(cudaMalloc<unsigned char>(&d_input, inputBytes));
gpuErrchk(cudaMalloc<unsigned char>(&d_output, outputBytes));
// 将输入图像复制到设备
gpuErrchk(cudaMemcpy(d_input, input.ptr(), inputBytes, cudaMemcpyHostToDevice));
// 配置块和网格的大小
const dim3 block(16, 16);
const dim3 grid((output.cols + block.x - 1) / block.x, (output.rows + block.y - 1) / block.y); // 为向上取整而多加的块
// 执行内核
binCUDAKernel <<<grid, block>>> (d_input, d_output, binDim, output.cols, output.rows, input.step, output.step, input.channels());
gpuErrchk(cudaPeekAtLastError());
// 等待所有线程完成
//gpuErrchk(cudaDeviceSynchronize());
// 将输出图像从设备复制回主机(cudaMemcpy 是一条阻塞指令)
gpuErrchk(cudaMemcpy(output.ptr(), d_output, outputBytes, cudaMemcpyDeviceToHost));
// 释放设备内存
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output));
// 将图像写入指定的 output_file 路径
cv::imwrite(output_file, output);
}

我怀疑这可能是某种类型的数据类型不匹配,但我无法理解

首先,要处理 16 位图像,必须把像素数据解释为 16 位宽的数据类型,例如 unsigned short。请注意,这里只是把图像数据“解释”为 unsigned short 类型,而不是对像素值做类型转换。为此,我们只需把图像数据指针转换为所需的类型,如下例所示:

unsigned short* ptr16 = reinterpret_cast<unsigned short*>(im1.ptr());

在 getPixelAddress 这种先按字节偏移定位到行、再按元素类型定位到列的方法中(函数见文末),传入的步长是以字节为单位的原始值,不需要做任何除法。

在我看来,您把数据当作 unsigned char(8 位)而不是 unsigned short(16 位)来读取,然后又试图把 unsigned short 值赋给 unsigned char 数组……可能需要对 input/output 使用 reinterpret_cast。

@api55 您的意思是我应该在下面这一处把 input/output 改为 unsigned short 吗? // Allocate memory in device  unsigned short *d_input, *d_output; gpuErrchk(cudaMalloc(&d_input, inputBytes)); gpuErrchk(cudaMalloc(&d_output, outputBytes));

非常感谢 @sgarizvi 提供的解决方案!您能给我指一个参考页面吗,其中更详细地解释如何用自定义 CUDA 内核处理 OpenCV Mat——这种内核要求把图像步长除以数据类型的大小(以字节为单位)?

请查看我答案中的更新。
/**
 * Binning kernel: downsamples a single-channel image by averaging each
 * binDim x binDim block of input pixels into one output pixel.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel. The grid
 * must cover at least outputWidth x outputHeight threads; surplus threads
 * return immediately. No shared memory is used.
 *
 * Preconditions: `input` must hold at least outputHeight * binDim rows of
 * outputWidth * binDim pixels, because every output pixel reads a full bin.
 *
 * @tparam T            pixel element type (e.g. unsigned char, unsigned short)
 * @param input         source pixels, dereferenced on the device
 * @param output        destination pixels, dereferenced on the device
 * @param binDim        side length of the square bin
 * @param outputWidth   output width in pixels
 * @param outputHeight  output height in pixels
 * @param inputWstep    input row stride in elements of T (not bytes)
 * @param outputWstep   output row stride in elements of T (not bytes)
 * @param nChannels     channel count; only 1 is processed, any other value
 *                      leaves `output` untouched
 */
template<typename T>
__global__ void binCUDAKernel(T *input, T *output, int binDim, int outputWidth, int outputHeight, int inputWstep, int outputWstep, int nChannels)
{
    const int outXind = blockIdx.x * blockDim.x + threadIdx.x;
    const int outYind = blockIdx.y * blockDim.y + threadIdx.y;

    // Only run threads in output image coordinate range. Every column from
    // 0 to outputWidth - 1 is processed; restricting the range to columns
    // above outputWidth / 2 would leave the left half of the output unwritten.
    if ((outXind >= outputWidth) || (outYind >= outputHeight))
    {
        return;
    }

    // Only greyscale images are handled
    if (nChannels != 1)
    {
        return;
    }

    // Top-left input pixel of the bin that maps to this output pixel
    const int inXstart = outXind * binDim;
    const int inYstart = outYind * binDim;

    // Accumulate the bin in float so the sum cannot wrap around in T
    float sum = 0.0f;
    for (int binY = inYstart; binY < (inYstart + binDim); binY++) {
        for (int binX = inXstart; binX < (inXstart + binDim); binX++) {
            // Flat index computed in size_t so row * stride cannot overflow int
            const size_t input_tid = static_cast<size_t>(binY) * inputWstep + binX;
            sum += float(input[input_tid]);
        }
    }

    // Flat index of this thread's output pixel
    const size_t output_tid = static_cast<size_t>(outYind) * outputWstep + outXind;

    // Store the bin mean; the cast truncates the fractional part
    output[output_tid] = static_cast<T>(sum / (binDim*binDim));
}
/**
 * Downsamples a greyscale image by 2x2 binning on the GPU and writes the
 * result to `output_file`.
 *
 * The pixel data is uploaded as raw bytes. For 16-bit images the device
 * pointers are reinterpreted as unsigned short and the row strides are
 * converted from bytes to elements before binCUDAKernel is launched, so the
 * kernel indexes whole pixels.
 *
 * @param input        single-channel image of depth CV_8U or CV_16U, at
 *                     least 2x2 pixels
 * @param output_file  path handed to cv::imwrite for the binned image
 *
 * Throws cv::Exception (via CV_Assert) when the input is empty, is not
 * single-channel, has an unsupported depth, or is too small to produce any
 * output pixel. gpuErrchk is defined outside this block and wraps every CUDA
 * runtime call.
 */
void binFilter(const cv::Mat input, std::string output_file)
{
    // 2X2 binning
    const int binDim = 2;

    // Reject inputs the kernel cannot handle instead of launching on them:
    // an empty image yields a zero-sized grid, a multi-channel image is
    // skipped by the kernel (the output would be uninitialized memory), and
    // other depths would be indexed with the wrong element size.
    CV_Assert(!input.empty());
    CV_Assert(input.channels() == 1);
    CV_Assert(input.depth() == CV_8U || input.depth() == CV_16U);

    // The upload below copies step * rows bytes in one piece, which is only
    // valid when the rows are stored back to back. A non-continuous Mat
    // (e.g. a region of interest) is compacted first.
    cv::Mat src = input; // header copy, shares the pixel data
    if (!src.isContinuous())
    {
        src = src.clone();
    }

    // Create blank output image & calculate size of input and output
    // (integer division: a trailing odd row/column is dropped)
    cv::Size outsize(src.cols / binDim, src.rows / binDim);
    CV_Assert(outsize.width > 0 && outsize.height > 0);
    cv::Mat output(outsize, src.type());

    // Row strides in bytes; byte counts kept in size_t so large images do
    // not overflow int
    const size_t inputStep = src.step;
    const size_t outputStep = output.step;
    const size_t inputBytes = inputStep * src.rows;
    const size_t outputBytes = outputStep * output.rows;

    // Allocate memory in device
    unsigned char *d_input, *d_output;
    gpuErrchk(cudaMalloc<unsigned char>(&d_input, inputBytes));
    gpuErrchk(cudaMalloc<unsigned char>(&d_output, outputBytes));

    // Copy input image to device
    gpuErrchk(cudaMemcpy(d_input, src.ptr(), inputBytes, cudaMemcpyHostToDevice));

    // Configure size of block and grid
    const dim3 block(16, 16);
    const dim3 grid((output.cols + block.x - 1) / block.x, (output.rows + block.y - 1) / block.y); // Additional block for rounding up

    // Execute kernel, dispatching on the pixel depth
    if (src.depth() == CV_16U)
    {
        typedef unsigned short t16;

        // Reinterpret (not convert) the raw device bytes as 16-bit pixels
        t16* input16 = reinterpret_cast<t16*>(d_input);
        t16* output16 = reinterpret_cast<t16*>(d_output);

        // The kernel indexes elements of T, so strides go from bytes to elements
        const int inputStep16 = static_cast<int>(inputStep / sizeof(t16));
        const int outputStep16 = static_cast<int>(outputStep / sizeof(t16));

        binCUDAKernel <t16> <<<grid, block>>> (input16, output16, binDim, output.cols, output.rows, inputStep16, outputStep16, src.channels());
    }
    else
    {
        // CV_8U: one byte per pixel, so the byte stride is the element stride
        binCUDAKernel <unsigned char> <<<grid, block>>> (d_input, d_output, binDim, output.cols, output.rows, static_cast<int>(inputStep), static_cast<int>(outputStep), src.channels());
    }

    // Check the last CUDA error right after the launch
    gpuErrchk(cudaPeekAtLastError());

    // Wait for all threads to finish
    //gpuErrchk(cudaDeviceSynchronize());

    // Copy output image from device back to host (cudaMemcpy is a blocking instruction)
    gpuErrchk(cudaMemcpy(output.ptr(), d_output, outputBytes, cudaMemcpyDeviceToHost));

    // Free device memory
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));

    // Write image to specified output_file path
    cv::imwrite(output_file, output);
}
// Returns the address of pixel (x, y) in an image buffer whose rows are
// `step` bytes apart.
//
// The row offset is applied in bytes and the column offset in elements of T,
// so `step` is passed as a byte count and is not divided by sizeof(T).
//
//   data - first byte of the image buffer
//   x    - column index, in elements of T
//   y    - row index
//   step - distance between consecutive rows, in bytes
template<typename T>
T* getPixelAddress(unsigned char* data, int x, int y, int step)
{
    // Byte-wise jump to the start of row y
    unsigned char* rowStart = data + y * step;

    // View that row as elements of T, then advance x elements
    return reinterpret_cast<T*>(rowStart) + x;
}