cuda memcheck报告nppiFilterGauss_8u_C1R超出范围_Cuda_Npp

cuda memcheck报告nppiFilterGauss_8u_C1R超出范围

cuda

cuda memcheck报告nppiFilterGauss_8u_C1R超出范围,cuda,npp,Cuda,Npp,我想使用nppiFilterGauss_8u_C1R，但当我使用cuda memcheck时，它总是报告超出范围，以下是我的源代码： Npp8u* p1 = NULL; Npp8u* p2 = NULL; unsigned char* p3 = NULL; unsigned char* p4 = NULL; int step1 = 0; int step2 = 0; NppiSize roi; roi.width = 352*288; roi.height = 1; int ret = 0;

我想使用 nppiFilterGauss_8u_C1R，但在 cuda-memcheck 下运行时，它总是报告越界访问（out of bounds）。以下是我的源代码：

// Reproduction case from the question: runs nppiFilterGauss_8u_C1R with a 3x3
// mask over a 352*288-byte device buffer that is described to NPP as a single
// row (width = 352*288, height = 1). cuda-memcheck reports an out-of-bounds
// read for this call (see the log that follows the snippet).
Npp8u* p1 = NULL;            // NPP-allocated buffer (only used by the commented-out call)
Npp8u* p2 = NULL;            // NPP-allocated buffer (only used by the commented-out call)
unsigned char* p3 = NULL;    // cudaMalloc'ed source buffer actually passed to the filter
unsigned char* p4 = NULL;    // cudaMalloc'ed destination buffer actually passed to the filter
int step1 = 0;               // line step written by nppiMalloc_8u_C1 for p1
int step2 = 0;               // line step written by nppiMalloc_8u_C1 for p2
NppiSize roi;
// The ROI treats the whole 352x288 image as one row of 352*288 pixels.
roi.width = 352*288;
roi.height = 1;
int ret = 0;                 // status returned by the NPP filter call

// 352x288 allocations through NPP; the step chosen by NPP is returned in step1/step2.
p1 = nppiMalloc_8u_C1(352, 288, &step1);
p2 = nppiMalloc_8u_C1(352, 288, &step2);
// Plain linear allocations of exactly 352*288 bytes (return codes are not checked).
cudaMalloc((void**)&p3, 352*288);
cudaMalloc((void**)&p4, 352*288);

// NOTE(review): %x is used for pointer arguments here; %p is the portable
// specifier for pointers. Left unchanged so the output matches the posted log.
printf("p1[%x],p2[%x],p3[%x],p4[%x]\n", p1, p2, p3, p4);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);

// Run the filter for count = 1 and count = 2, stopping early on a non-zero status.
int count = 1;
while(count < 3) {
  // Alternative call on the NPP-allocated buffers, disabled by the author:
  // ret = nppiFilterGauss_8u_C1R(p1, step1, p2, step2, roi, NPP_MASK_SIZE_3_X_3);
  // NOTE(review): the ROI covers the entire p3/p4 allocation and the step equals
  // the full buffer size, so a 3x3 mask has no valid neighbouring rows/columns
  // to read; this is the call that cuda-memcheck flags as reading out of bounds.
  ret = nppiFilterGauss_8u_C1R(p3, 352*288, p4, 352*288, roi, NPP_MASK_SIZE_3_X_3);
  printf("count[%d],ret[%d]\n", count, ret);
  if(ret) {
    break;
  }
  count++;
}

// Release every buffer with the deallocator that matches its allocator.
nppiFree(p1);
nppiFree(p2);
cudaFree(p3);
cudaFree(p4);

Npp8u* p1 = NULL;
Npp8u* p2 = NULL;
unsigned char* p3 = NULL;
unsigned char* p4 = NULL;
int step1 = 0;
int step2 = 0;
NppiSize roi;
roi.width = 352*288;
roi.height = 1;
int ret = 0;
p1 = nppiMalloc_8u_C1(352, 288, &step1);
p2 = nppiMalloc_8u_C1(352, 288, &step2);
cudaMalloc((void**)&p3, 352*288);
cudaMalloc((void**)&p4, 352*288);
printf("p1[%x],p2[%x],p3[%x],p4[%x]\n", p1, p2, p3, p4);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);
int count = 1;
while(count < 3) {
  // ret = nppiFilterGauss_8u_C1R(p1, step1, p2, step2, roi, NPP_MASK_SIZE_3_X_3);
  ret = nppiFilterGauss_8u_C1R(p3, 352*288, p4, 352*288, roi, NPP_MASK_SIZE_3_X_3);
  printf("count[%d],ret[%d]\n", count, ret);
  if(ret) {
    break;
  }
  count++;
}
nppiFree(p1);
nppiFree(p2);
cudaFree(p3);
cudaFree(p4);

以下是错误：

GPU Device 0: "GK20A" with compute capability 3.2
p1[ab84a000],p2[ab86e000],p3[ab892000],p4[ab8aac00]
step1[512]
step2[512]
count[1],ret[0]
count[2],ret[0]
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 1
=========     at 0x00000448 in void ForEachTupleByteQuad<unsigned char, int=1, TupleByteQuadFunctor<unsigned char, int=1, FilterGauss3x3QuadNew<unsigned char, int=1>>>(Tuple8<unsigned char, int=1>*, int, NppiSize, unsigned char)
=========     by thread (31,0,0) in block (395,0,0)
=========     Address 0xab8c3800 is out of bounds
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= ERROR SUMMARY: 5 errors*

GPU 设备 0："GK20A"，计算能力 3.2
p1[ab84a000],p2[ab86e000],p3[ab892000],p4[ab8aac00]
step1[512]
step2[512]
count[1],ret[0]
count[2],ret[0]
========= CUDA-MEMCHECK
========= 大小为 1 的无效 __global__ 读取
=========     位于 0x00000448，函数 void ForEachTupleByteQuad<unsigned char, int=1, TupleByteQuadFunctor<unsigned char, int=1, FilterGauss3x3QuadNew<unsigned char, int=1>>>(Tuple8<unsigned char, int=1>*, int, NppiSize, unsigned char)
=========     由块 (395,0,0) 中的线程 (31,0,0) 触发
=========     地址 0xab8c3800 越界
=========
========= 调用 CUDA API cudaFree 时，程序因 "unspecified launch failure"（未指定的启动失败）触发 cudaErrorLaunchFailure（错误 4）。
=========
========= 调用 CUDA API cudaFree 时，程序因 "unspecified launch failure"（未指定的启动失败）触发 cudaErrorLaunchFailure（错误 4）。
=========
========= 调用 CUDA API cudaFree 时，程序因 "unspecified launch failure"（未指定的启动失败）触发 cudaErrorLaunchFailure（错误 4）。
=========
========= 调用 CUDA API cudaFree 时，程序因 "unspecified launch failure"（未指定的启动失败）触发 cudaErrorLaunchFailure（错误 4）。
=========
========= 错误汇总：5 个错误*

有人能解释一下正确的方法吗？

应用掩码大小为 3x3 的高斯滤波器时，需要读取当前像素上/下、左/右相邻的像素。这意味着在计算像素 (0,0) 的值时，实际上会读取像素 (-1,-1)。为了避免这种越界读取，您需要调整 ROI（感兴趣区域），或者使用能够自动正确处理边界的 NPP 函数。

以您的代码为基础，修改后大致如下：

// Corrected usage of the NPP 3x3 Gauss filter on a 352x288, 8-bit,
// single-channel image. Two variants are shown:
//   1. nppiFilterGauss_8u_C1R on an ROI shrunk by one pixel on every side, with
//      the source/destination pointers advanced to pixel (1,1), so the 3x3 mask
//      never reads outside the allocated image.
//   2. nppiFilterGaussBorder_8u_C1R on the full image, letting NPP handle the
//      border (replicated edge pixels).
Npp8u* p1 = NULL;   // source image
Npp8u* p2 = NULL;   // destination image

int step1 = 0;      // line step of p1 in bytes, written by nppiMalloc_8u_C1
int step2 = 0;      // line step of p2 in bytes, written by nppiMalloc_8u_C1
NppiSize roi;
roi.width = 352 - 2;  // roi is two pixels smaller: one pixel removed left, one on right side
roi.height = 288 - 2; // same for height
int ret = 0;          // NPP status code of the most recent filter call

p1 = nppiMalloc_8u_C1(352, 288, &step1); // use nppiMalloc and not cudaMalloc for best performance
p2 = nppiMalloc_8u_C1(352, 288, &step2); // (NPP uses internal heuristics which step size suits best...)

// %p with a void* cast is the well-defined way to print pointers; %x truncates
// on 64-bit platforms and is undefined behaviour for pointer arguments.
printf("p1[%p],p2[%p]\n", (void*)p1, (void*)p2);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);

int count = 1;
if (p1 == NULL || p2 == NULL) {
    // Never hand a failed allocation to NPP.
    printf("nppiMalloc_8u_C1 failed\n");
} else {
    while (count < 3) {
        // move pointers from pixel (0,0) to pixel (1,1) = add one line step plus one, roi is two pixels smaller:
        ret = nppiFilterGauss_8u_C1R(p1 + step1 + 1, step1, p2 + step2 + 1, step2, roi, NPP_MASK_SIZE_3_X_3);
        printf("count[%d],ret[%d]\n", count, ret);
        if (ret) {
            break;
        }
        count++;
    }

    // Or use NPP function including border handling:
    NppiPoint srcPoint;
    srcPoint.x = 0;
    srcPoint.y = 0;

    // The border variant works on the full image; roi doubles as source size and ROI.
    roi.width = 352;
    roi.height = 288;
    ret = nppiFilterGaussBorder_8u_C1R(p1, step1, roi, srcPoint, p2, step2, roi, NPP_MASK_SIZE_3_X_3, NPP_BORDER_REPLICATE);
    if (ret) {
        // Surface a non-zero status instead of silently discarding it.
        printf("nppiFilterGaussBorder_8u_C1R ret[%d]\n", ret);
    }
}

// Only release buffers that were actually allocated.
if (p1 != NULL) {
    nppiFree(p1);
}
if (p2 != NULL) {
    nppiFree(p2);
}

Npp8u* p1 = NULL;
Npp8u* p2 = NULL;
int step1 = 0;
int step2 = 0;
NppiSize roi;
roi.width = 352 - 2; // roi 小两个像素：左侧去掉一个像素，右侧去掉一个像素
roi.height = 288 - 2; // 高度同理
int ret = 0;
p1 = nppiMalloc_8u_C1(352, 288, &step1); // 为获得最佳性能，请使用 nppiMalloc 而不是 cudaMalloc
p2 = nppiMalloc_8u_C1(352, 288, &step2); // （NPP 使用内部启发式算法来选择最合适的步长……）
printf("p1[%x],p2[%x]\n", p1, p2);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);
int count = 1;
while (count < 3) {
    // 将指针从像素 (0,0) 移动到像素 (1,1) = 加上一行的步长再加一，roi 小两个像素：
    ret = nppiFilterGauss_8u_C1R(p1 + step1 + 1, step1, p2 + step2 + 1, step2, roi, NPP_MASK_SIZE_3_X_3);
    printf("count[%d],ret[%d]\n", count, ret);
    if (ret) {
        break;
    }
    count++;
}
// 或者使用带边界处理的 NPP 函数：
NppiPoint srcPoint;
srcPoint.x = 0;
srcPoint.y = 0;
roi.width = 352;
roi.height = 288;
ret = nppiFilterGaussBorder_8u_C1R(p1, step1, roi, srcPoint, p2, step2, roi, NPP_MASK_SIZE_3_X_3, NPP_BORDER_REPLICATE);
nppiFree(p1);
nppiFree(p2);

这段代码可以通过 cuda-memcheck 的检查，没有报告任何错误。

您的代码在 Maxwell GPU 上使用 CUDA 7.5 工具包运行时，我这里没有出现任何错误。
我使用的是 CUDA 6.5。这个回答是正确的，非常感谢！！！我还有另一个问题，您能帮我看看吗？问题在这里：