C++ GpuMat-访问自定义内核中的2通道浮点数据
据我所知,cv::cuda::PtrStep 用于将 GpuMat 数据直接传递到自定义内核。我找到了单通道访问的示例,但我的矩阵是双通道的(CV_32FC2)。在这种情况下,我试图计算复数的模平方,其中复数值的编码方式为:实部是给定 Mat 的第一个通道,虚部是第二个通道。
cv::cuda::PtrStep
用于将GpuMat
数据直接传递到自定义内核。我找到了单通道访问的示例,但我的示例是双通道mat(CV_32FC2
)。在这种情况下,我试图获得复数绝对平方值,其中复数值被编码为:实部是第一个平面,虚部是给定Mat
的第二个平面
我试过:
/* Compute the squared magnitude |z|^2 of each complex element.
 * input  : 2-channel float matrix, channel 0 = real part, channel 1 = imaginary part
 * output : 1-channel float matrix of the same size
 * Launch with a 2D grid covering at least input.cols x input.rows threads. */
__global__ void testKernel(const cv::cuda::PtrStepSz<cv::Vec2f> input, cv::cuda::PtrStepf output)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    /* Bounds guard: the grid may be larger than the matrix.
       (x and y are products/sums of unsigned built-ins, so they are never negative.) */
    if (x < input.cols && y < input.rows)
    {
        /* BUG FIX: PtrStep::operator() takes (row, col), so index as (y, x).
           The original (x, y) transposed the access and read out of bounds
           for non-square matrices. */
        float val_re = input(y, x)[0];
        float val_im = input(y, x)[1];
        output(y, x) = val_re * val_re + val_im * val_im;
    }
}
但是我想知道是否有没有“更干净”的解决方案,可以像 cv::Vec2f 那样按元素访问。如果您想给内核传入单个展平的输入,现在可以把 d_mat_flat 作为 PtrStepSzf 传递到内核。您也可以使用原始数据类型在自定义 CUDA 内核中访问 GpuMat 数据,例如 CUDA 运行时提供的 float2 类型可以部分替代 cv::Vec2f。下面是一个示例代码,演示如何使用原始数据类型访问 GpuMat 数据:
#include <iostream>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
using std::cout;
using std::endl;
/* Compute the per-element magnitude of a complex matrix.
 * src   : device pointer to complex input (interleaved re/im as float2)
 * dst   : device pointer to real-valued output
 * rows  : number of rows in both matrices
 * cols  : number of columns in both matrices
 * iStep : input row stride in ELEMENTS (float2), not bytes
 * oStep : output row stride in ELEMENTS (float), not bytes
 * Launch with a 2D grid covering at least cols x rows threads.
 * const/__restrict__ on the read-only input lets the compiler use the
 * read-only data cache (backward-compatible at every call site). */
__global__ void kernel_absolute(const float2* __restrict__ src, float* __restrict__ dst,
                                int rows, int cols, int iStep, int oStep)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y; //Row number
    int j = blockIdx.x * blockDim.x + threadIdx.x; //Column number
    if (i < rows && j < cols)
    {
        /* Compute linear element indices from 2D indices;
           steps are per-element row pitches, so no byte arithmetic needed */
        int tidIn = i * iStep + j;
        int tidOut = i * oStep + j;
        /* Read input value */
        float2 input = src[tidIn];
        /* |z| = sqrtf(re^2 + im^2); sqrtf keeps the math in float */
        float output = sqrtf(input.x * input.x + input.y * input.y);
        /* Write output value */
        dst[tidOut] = output;
    }
}
/* Example: compute the absolute value of each element of a complex matrix
 * on the GPU, using raw float2/float pointers into GpuMat data. */
int main(int argc, char** argv)
{
    int rows = 10;
    int cols = 10;
    int input_data_type = CV_32FC2;  //input is complex
    int output_data_type = CV_32FC1; //output is real

    /* Create input matrix on host; every element initialized to (1,1) */
    cv::Mat input = cv::Mat::zeros(rows, cols, input_data_type) + cv::Vec2f(1, 1);

    /* Display input */
    cout << input << endl;

    /* Create input matrix on device and copy from host */
    cv::cuda::GpuMat input_d;
    input_d.upload(input);

    /* Create output matrix on device */
    cv::cuda::GpuMat output_d(rows, cols, output_data_type);

    /* Convert each GpuMat's byte step into an element step for the kernel */
    int iStep = static_cast<int>(input_d.step / sizeof(float2));
    int oStep = static_cast<int>(output_d.step / sizeof(float));

    /* Choose appropriate block size */
    dim3 block(8, 8);

    /* Ceil-divide so partial tiles at the edges are still covered */
    dim3 grid((cols + block.x - 1) / block.x, (rows + block.y - 1) / block.y);

    /* Launch CUDA kernel to compute absolute value */
    kernel_absolute<<<grid, block>>>(reinterpret_cast<float2*>(input_d.data),
                                     reinterpret_cast<float*>(output_d.data),
                                     rows, cols, iStep, oStep);

    /* Check launch-configuration errors immediately after the launch... */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }
    /* ...and asynchronous execution errors at the sync point.
       (The original used assert(), which is compiled out under NDEBUG,
       silently dropping both the check AND the synchronization.) */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }

    /* Copy results from device to host and display them */
    cv::Mat output;
    output_d.download(output);
    cout << endl << output << endl;
    return 0;
}
(此处原为上文示例代码的机器翻译残留,与 L29 起的代码重复,已整理。)
评论:你可以使用 float2 代替 cv::Vec2f。另外,input(x, y) 应该写成 input(y, x),因为第一个参数是行、第二个参数是列。
评论:我喜欢这个方案——不需要用 cv::split 分离平面,也就没有数据复制。
评论:@michelson 如果这解决了问题,你可以考虑接受该答案 :)
__global__ void testKernel(const cv::cuda::PtrStepSzf re, const cv::cuda::PtrStepSzf im, cv::cuda::PtrStepf output)
// test image: 50x50, 2-channel float (complex), zero-initialized
Mat h_mat(Size(50,50),CV_32FC2,Scalar(0.0));
// Mat::reshape(cn, rows): cn=1 flattens the channels, rows=1 yields a single
// row, so the interleaved re/im floats become one contiguous 1-channel row
Mat h_mat_flat = h_mat.reshape(1,1);
// to upload to gpu: allocate a device mat of the same size/type...
GpuMat d_mat_flat(h_mat_flat.size(), h_mat_flat.type());
// ...then copy the flattened host data into it
d_mat_flat.upload(h_mat_flat);
#include <iostream>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
using std::cout;
using std::endl;
/* Compute the per-element magnitude of a complex matrix.
 * src   : device pointer to complex input (interleaved re/im as float2)
 * dst   : device pointer to real-valued output
 * rows  : number of rows in both matrices
 * cols  : number of columns in both matrices
 * iStep : input row stride in ELEMENTS (float2), not bytes
 * oStep : output row stride in ELEMENTS (float), not bytes
 * Launch with a 2D grid covering at least cols x rows threads.
 * const/__restrict__ on the read-only input lets the compiler use the
 * read-only data cache (backward-compatible at every call site). */
__global__ void kernel_absolute(const float2* __restrict__ src, float* __restrict__ dst,
                                int rows, int cols, int iStep, int oStep)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y; //Row number
    int j = blockIdx.x * blockDim.x + threadIdx.x; //Column number
    if (i < rows && j < cols)
    {
        /* Compute linear element indices from 2D indices;
           steps are per-element row pitches, so no byte arithmetic needed */
        int tidIn = i * iStep + j;
        int tidOut = i * oStep + j;
        /* Read input value */
        float2 input = src[tidIn];
        /* |z| = sqrtf(re^2 + im^2); sqrtf keeps the math in float */
        float output = sqrtf(input.x * input.x + input.y * input.y);
        /* Write output value */
        dst[tidOut] = output;
    }
}
/* Example: compute the absolute value of each element of a complex matrix
 * on the GPU, using raw float2/float pointers into GpuMat data. */
int main(int argc, char** argv)
{
    int rows = 10;
    int cols = 10;
    int input_data_type = CV_32FC2;  //input is complex
    int output_data_type = CV_32FC1; //output is real

    /* Create input matrix on host; every element initialized to (1,1) */
    cv::Mat input = cv::Mat::zeros(rows, cols, input_data_type) + cv::Vec2f(1, 1);

    /* Display input */
    cout << input << endl;

    /* Create input matrix on device and copy from host */
    cv::cuda::GpuMat input_d;
    input_d.upload(input);

    /* Create output matrix on device */
    cv::cuda::GpuMat output_d(rows, cols, output_data_type);

    /* Convert each GpuMat's byte step into an element step for the kernel */
    int iStep = static_cast<int>(input_d.step / sizeof(float2));
    int oStep = static_cast<int>(output_d.step / sizeof(float));

    /* Choose appropriate block size */
    dim3 block(8, 8);

    /* Ceil-divide so partial tiles at the edges are still covered */
    dim3 grid((cols + block.x - 1) / block.x, (rows + block.y - 1) / block.y);

    /* Launch CUDA kernel to compute absolute value */
    kernel_absolute<<<grid, block>>>(reinterpret_cast<float2*>(input_d.data),
                                     reinterpret_cast<float*>(output_d.data),
                                     rows, cols, iStep, oStep);

    /* Check launch-configuration errors immediately after the launch... */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }
    /* ...and asynchronous execution errors at the sync point.
       (The original used assert(), which is compiled out under NDEBUG,
       silently dropping both the check AND the synchronization.) */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }

    /* Copy results from device to host and display them */
    cv::Mat output;
    output_d.download(output);
    cout << endl << output << endl;
    return 0;
}