C++ 如何为GpuMat编写内核？_C++_Opencv_Cuda

C++ 如何为GpuMat编写内核？

c++ opencv cuda

C++ 如何为GpuMat编写内核？ 标签：c++、opencv、cuda。我尝试使用以下代码迭代 cv::cuda::GpuMat： __global__ void kernel(uchar* src, int rows, int cols, size_t step) { int rowInd = blockIdx.y * blockDim.y + threadIdx.y; int colInd = blockIdx.x * blockDim.x + threadIdx.x; if ((rowInd < rows) && …

我尝试使用以下代码遍历 cv::cuda::GpuMat：

// Writes the value 255 to every element of a rows x cols byte image.
//
// Launch layout: 2D blocks in a 2D grid; the x dimension indexes columns and
// the y dimension indexes rows. Threads whose coordinates fall outside the
// rows x cols rectangle do nothing.
//
// src  - base pointer of the image buffer
// rows - number of rows to fill
// cols - number of columns to fill
// step - offset, in uchar elements, from the start of one row to the next
__global__ void kernel(uchar* src, int rows, int cols, size_t step)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: a rounded-up grid may contain threads past the image edge.
    if (!(y < rows) || !(x < cols))
        return;

    uchar* const line = src + (y * step);
    line[x] = 255;
}

// Launches `kernel` so that one thread covers each (row, col) of _img.
//
// A thread block may contain at most 1024 threads (the product of its x, y and
// z extents). The previous 50x50 = 2500 block violated that limit, so the
// launch failed with cudaErrorInvalidConfiguration and the kernel never ran.
// A 16x16 = 256 block is within the limit.
//
// _img - device image to fill; the kernel writes one byte per (row, col), so a
//        single-channel 8-bit image is expected (NOTE(review): not enforced
//        here — confirm callers only pass CV_8UC1).
void invoke_kernel(cv::cuda::GpuMat _img)
{
    // Nothing to fill; a grid with a zero dimension is itself an invalid launch.
    if (_img.empty())
        return;

    const int blockDim1D = 16;
    const dim3 tpb(blockDim1D, blockDim1D);
    // Ceil-division so partially covered edge tiles still get a block.
    const dim3 bpg((_img.cols + blockDim1D - 1) / blockDim1D,
                   (_img.rows + blockDim1D - 1) / blockDim1D);

    kernel<<<bpg, tpb>>> (_img.data, _img.rows, _img.cols, _img.step);

    // The launch statement does not report errors itself; a bad configuration
    // only becomes visible through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        std::cerr << "invoke_kernel: kernel launch failed: "
                  << cudaGetErrorString(launchStatus) << std::endl;
    }
}

// Demo driver: allocates a 500x500 CV_8UC1 image on the device, prints its
// dimensions, runs invoke_kernel on it, downloads the result to the host and
// displays it in a window named "test" until a key is pressed.
int main()
{
    const cv::Size extent(500, 500);

    cv::cuda::GpuMat deviceImage;
    deviceImage.create(extent, CV_8UC1);
    std::cout << deviceImage.rows << " " << deviceImage.cols << std::endl;

    invoke_kernel(deviceImage);

    // Copy the device buffer into a host matrix for display.
    cv::Mat hostImage;
    deviceImage.download(hostImage);

    cv::namedWindow("test");
    cv::imshow("test", hostImage);
    cv::waitKey(0);

    return 0;
}

__global__ void kernel(uchar* src, int rows, int cols, size_t step)
{
int rowInd = blockIdx.y * blockDim.y + threadIdx.y;
int colInd = blockIdx.x * blockDim.x + threadIdx.x;
if ((rowInd < rows) && (colInd < cols))
{
uchar* rowptr = src + (rowInd * step);
rowptr[colInd] = 255;
}
}
void invoke_kernel(cv::cuda::GpuMat _img)
{
dim3 tpb(50, 50);
dim3 bpg(((_img.cols + 49) / 50), ((_img.rows + 49) / 50));
kernel<<<bpg, tpb>>>(_img.data, _img.rows, _img.cols, _img.step);
}
int main()
{
cv::cuda::GpuMat mat;
mat.create(cv::Size(500, 500), CV_8UC1);
std::cout …

事实证明，问题在于块大小 50x50=2500 个线程太大了。每个线程块的线程数存在某种上限，我当时还没有弄清楚具体是多少，但正如 CUDA 文档中所述，16x16 是可以的。
因此，改成下面这样：
dim3 tpb(16, 16);
dim3 bpg(((_img.cols + 15) / 16), ((_img.rows + 15)/ 16));

在 invoke_kernel 中这样修改之后，问题就解决了。
孩子们，一定要阅读文档。
好吧，事实证明内核根本没有启动，我不知道为什么，但即使我在内核里放一个无限循环，也没有任何变化。

每当你在使用 CUDA 代码时遇到问题，使用 cuda-memcheck 来运行代码是一个很好的做法。

@RobertCrovella 是的，我就是这样找出原因的：cudaGetLastError 返回了 cudaErrorInvalidConfiguration，我在谷歌上搜索了一下，发现是块大小太大了。

限制是：CUDA 内核的每个线程块最多 1024 个线程，这个上限针对的是块尺寸 x、y、z 的乘积。这在 CUDA 编程指南中有说明，也可以通过 CUDA 示例代码 deviceQuery 查到，而且这是 cuda 标签下几十个问题的根源。