用于RGB图像的cuda nppiResize()
nVidia CUDA 的 nppiResize_32f_C1R 在处理 1 x 32f 灰度图像时工作正常,但 nppiResize_32f_C3R 返回的是垃圾数据。显然,一个解决办法是先将数据解交错为 R、G、B 三个平面,然后调用此例程 3 次,但我希望能够一次完成。nVidia 有很多用于单平面图像处理的示例代码,但针对交错颜色平面的示例代码很少,因此我转向 Stack Overflow 寻求帮助。我不清楚步幅(stride)是如何计算的,但我知道步幅等于图像宽度乘以每个像素的字节数。所以在我的例子中,在没有行填充的情况下,RGB 的步幅应该是 宽度 x 32f x 3。我在 cudaMemcpy2D() 中尝试了不同的步幅/行间距(pitch),仍然无法得到适用于彩色 RGB 代码的可行方案。代码可以编译并正常运行,没有报错。第一部分是灰度(工作正常),第二部分是 RGB(输出垃圾)。
标题:用于RGB图像的cuda nppiResize();标签:cuda,npp,Cuda,Npp;摘要:nVidia Cuda nppiResize_32f_C1R在灰度为1 x 32f时工作正常,但nppiResize_32f_C3R返回垃圾。显然,一个解决办法是通过首先将数据解交错为平面R、G、B来调用此例程3次,但我希望能够在一次过程中运行它。nVidia有很多用于单平面图像处理的示例代码,但对于交错颜色平面的示例代码很少,因此我转向stackoverflow寻求帮助。我不知道步幅是如何计算的,但我知道步幅是图像宽度乘以每列索引的字节数。所以在我的例子中,没有填充线的情况下,RGB的宽度应该……
//使用二维对齐的分配进行nppiResize
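#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << …(问题中的其余代码在网页抓取时被截断)

回答:您对 cudaMemcpy2D 的调用中存在多处错误(两处都在3通道代码中)。下面这段代码在我这里可以正常工作: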
$ cat t1521.cu
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#include <iostream>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Evaluates a CUDA runtime call. On failure it prints the CUDA error string
// together with the file and line of the call site to std::cerr and then
// executes `return(NULL)` in the *enclosing* function, so it may only be used
// inside functions whose return type accepts NULL. Nothing is cleaned up
// before that return.
#define CUDA_CALL(call)                                                     \
    do {                                                                    \
        cudaError_t cuda_error = call;                                      \
        if(cuda_error != cudaSuccess) {                                     \
            std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error)   \
                      << ", " << __FILE__ << ", line " << __LINE__          \
                      << std::endl;                                         \
            return(NULL);                                                   \
        }                                                                   \
    } while(0)
using namespace std;
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}
// Self-test: fills a 640x480 interleaved RGB float image with the constant
// colour (1, 2, 3), halves it in both dimensions with decimate_cuda and checks
// that every output pixel is still (1, 2, 3). Returns 0 on success and 1 when
// allocation, the resize or the comparison fails.
int main(){
    const uint32_t nSrcH = 480;
    const uint32_t nSrcW = 640;
    const uint8_t byteperpixel = 3;   // number of float channels per pixel
    const size_t srcPixels = (size_t)nSrcW * nSrcH;

    float *readbuff = (float *)malloc(srcPixels * byteperpixel * sizeof(float));
    if (readbuff == NULL) {
        std::cout << "error: unable to allocate source image" << std::endl;
        return 1;
    }
    for (size_t i = 0; i < srcPixels; i++){
        for (size_t c = 0; c < byteperpixel; c++){
            readbuff[i * byteperpixel + c] = (float)c + 1.0f;
        }
    }

    const uint32_t nDstW = nSrcW/2;
    const uint32_t nDstH = nSrcH/2;
    float *res = decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);

    int status = 0;
    if (res == NULL) {
        // decimate_cuda returns NULL on any failure; do not dereference it.
        std::cout << "error: decimate_cuda failed" << std::endl;
        status = 1;
    } else {
        const size_t dstValues = (size_t)nDstW * nDstH * byteperpixel;
        const float tolerance = 1e-5f;   // GPU filtering need not be bit-exact
        for (size_t i = 0; i < dstValues; i++){
            const float expected = (float)(i % byteperpixel) + 1.0f;
            const float diff = res[i] - expected;
            // Written as a negated range test so that NaN is reported too.
            if (!(diff >= -tolerance && diff <= tolerance)) {
                std::cout << "error at: " << i << std::endl;
                status = 1;
                break;
            }
        }
    }

    free(res);
    free(readbuff);
    return status;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
$cat t1521.cu
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#include …(头文件名在网页抓取时丢失)
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << …(其余代码在网页抓取时被截断)

评论:您最后一次 cudaMemcpy2D 调用看起来是错误的。对于3通道图像,nDstW*sizeof(Npp32f) 怎么可能是正确的?
评论:非常感谢。在本例中,关于 pitched 内存与线性内存之间的速度差异,您的看法是正确的。我之所以改回 cudaMallocPitch,是以为这样可以绕开使用 nppiMalloc_32f_C3 时遇到的同样问题;今天检查 dst 时才发现,我(对 RGB)用的是 nppiMalloc_32f_C1,而 src 是用 nppiMalloc_32f_C3 正确分配的,此外还有 cudaMemcpy2D 的错误……绝望导致鲁莽。我是 CUDA 新手。感谢您提供 main(),并让我知道它是必需的,将来调试时会派上用场。
评论:您好,是的,您说得对,当时情况确实很混乱。感谢您的查看和评论。
评论:您知道为什么 NPPI_INTER_CUBIC、NPPI_INTER_LANCZOS、NPPI_INTER_CUBIC2P_CATMULLROM、NPPI_INTER_CUBIC2P_B05C03、NPPI_INTER_LANCZOS3_ADVANCED、NPPI_INTER_CUBIC2P_BSPLINE、NPPI_SMOOTH_EDGE 在高度和宽度各缩小约4倍时看起来都一样吗?唯一看起来不像“双线性”的滤波器是 NPPI_INTER_SUPER。我很想看看 LANCZOS 的缩小结果。谢谢。
$ cat t1521.cu
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#include <iostream>
#include <stdint.h>
#include <stdio.h>
// Evaluates a CUDA runtime call. On failure it prints the CUDA error string
// together with the file and line of the call site to std::cerr and then
// executes `return(NULL)` in the *enclosing* function, so it may only be used
// inside functions whose return type accepts NULL. Nothing is cleaned up
// before that return.
#define CUDA_CALL(call)                                                     \
    do {                                                                    \
        cudaError_t cuda_error = call;                                      \
        if(cuda_error != cudaSuccess) {                                     \
            std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error)   \
                      << ", " << __FILE__ << ", line " << __LINE__          \
                      << std::endl;                                         \
            return(NULL);                                                   \
        }                                                                   \
    } while(0)
using namespace std;
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}
// Self-test: fills a 640x480 interleaved RGB float image with the constant
// colour (1, 2, 3), halves it in both dimensions with decimate_cuda and checks
// that every output pixel is still (1, 2, 3). Returns 0 on success and 1 when
// allocation, the resize or the comparison fails.
int main(){
    const uint32_t nSrcH = 480;
    const uint32_t nSrcW = 640;
    const uint8_t byteperpixel = 3;   // number of float channels per pixel
    const size_t srcPixels = (size_t)nSrcW * nSrcH;

    float *readbuff = (float *)malloc(srcPixels * byteperpixel * sizeof(float));
    if (readbuff == NULL) {
        std::cout << "error: unable to allocate source image" << std::endl;
        return 1;
    }
    for (size_t i = 0; i < srcPixels; i++){
        for (size_t c = 0; c < byteperpixel; c++){
            readbuff[i * byteperpixel + c] = (float)c + 1.0f;
        }
    }

    const uint32_t nDstW = nSrcW/2;
    const uint32_t nDstH = nSrcH/2;
    float *res = decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);

    int status = 0;
    if (res == NULL) {
        // decimate_cuda returns NULL on any failure; do not dereference it.
        std::cout << "error: decimate_cuda failed" << std::endl;
        status = 1;
    } else {
        const size_t dstValues = (size_t)nDstW * nDstH * byteperpixel;
        const float tolerance = 1e-5f;   // GPU filtering need not be bit-exact
        for (size_t i = 0; i < dstValues; i++){
            const float expected = (float)(i % byteperpixel) + 1.0f;
            const float diff = res[i] - expected;
            // Written as a negated range test so that NaN is reported too.
            if (!(diff >= -tolerance && diff <= tolerance)) {
                std::cout << "error at: " << i << std::endl;
                status = 1;
                break;
            }
        }
    }

    free(res);
    free(readbuff);
    return status;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$