Cuda 内核不启动?

Cuda 内核不启动?,cuda,Cuda,我写了一个在Matlab中使用的前景提取内核,它没有打印任何东西,所以我将它移植到纯Cuda C中,并取出了大部分逻辑。这东西什么都没做,甚至在返回之前都没有打印cuPrintf语句,知道为什么吗 #include <cuda.h> #include <stdio.h> /* printf, scanf, NULL */ #include <stdlib.h> /* calloc, exit, free */ #include "cuPrin

我写了一个在Matlab中使用的前景提取内核,它没有打印任何东西,所以我将它移植到纯Cuda C中,并取出了大部分逻辑。这东西什么都没做,甚至在返回之前都没有打印cuPrintf语句,知道为什么吗

#include <cuda.h>
#include <stdio.h>      /* printf, scanf, NULL */
#include <stdlib.h>     /* calloc, exit, free */
#include "cuPrintf.cu"
#include "utils.h" 
#include <time.h>       /* clock_t, clock, CLOCKS_PER_SEC */



__global__ void foreground_extract(      unsigned char* inputImageRed,
                                         unsigned char* inputImageGreen,
                                         unsigned char* inputImageBlue,

                                         unsigned char* outputImageRed,
                                         unsigned char* outputImageGreen,
                                         unsigned char* outputImageBlue,                                          

                                         const int xDim, 
                                         const int yDim)
{


    cuPrintf("print something \n");
    //x = col, y = row
    //xDim = col_dim, yDim = row_dim
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y *blockDim.x *gridDim.x;

    int nnodes = xDim*yDim;
    if (offset >= nnodes) return;


    //test equality

    outputImageRed[offset] = inputImageRed[offset];
    outputImageGreen[offset] = inputImageGreen[offset];
    outputImageBlue[offset] = inputImageBlue[offset];

    cuPrintf("print something here too \n");
    cuPrintf("%d \n", outputImageRed[offset]);

}

int main()
{

        int xDim = 3;
        int yDim = 3;

                                         unsigned char* h_inputImageRed;
                                         unsigned char* h_inputImageGreen;
                                         unsigned char* h_inputImageBlue;

                                         unsigned char* h_outputImageRed;
                                         unsigned char* h_outputImageGreen;
                                         unsigned char* h_outputImageBlue;


                    h_inputImageRed = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_inputImageGreen = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_inputImageBlue = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));

                    h_outputImageRed = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_outputImageGreen = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_outputImageBlue = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));


      //initiate input only 
      unsigned char init =0;
      for (int i=0; i<(xDim*yDim);i++){

                                          h_inputImageRed[i] = init;
                                          h_inputImageGreen[i] = init;
                                          h_inputImageBlue[i] = init;

                                          init++;

                                          printf("%d\n", h_inputImageRed[i]);

      }

                                         //device arrays
                                         unsigned char* d_inputImageRed;
                                         unsigned char* d_inputImageGreen;
                                         unsigned char* d_inputImageBlue;

                                         unsigned char* d_outputImageRed;
                                         unsigned char* d_outputImageGreen;
                                         unsigned char* d_outputImageBlue;


     //cudaMallocs

     checkCudaErrors(cudaMalloc((void**)&d_inputImageRed, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_inputImageGreen, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_inputImageBlue, (sizeof(unsigned char)*xDim*yDim)));

     checkCudaErrors(cudaMalloc((void**)&d_outputImageRed, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_outputImageGreen, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_outputImageBlue, (sizeof(unsigned char)*xDim*yDim)));

     //cudaMemcpys, Host to Device

     checkCudaErrors(cudaMemcpy(d_inputImageRed, h_inputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_inputImageGreen, h_inputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_inputImageBlue, h_inputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));

     checkCudaErrors(cudaMemcpy(d_outputImageRed, h_outputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_outputImageGreen, h_outputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_outputImageBlue, h_outputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));

     cudaPrintfInit();

     int gridSizeX = ceil(float(xDim/8));
     int gridSizeY = ceil(float(yDim/8));
     int gridSizeZ = 1;

     int blockSizeX=8;
     int blockSizeY=8;
     int blockSizeZ=1;

     const dim3 gridSize(gridSizeX,gridSizeY,gridSizeZ);
     const dim3 blockSize(blockSizeX,blockSizeY,blockSizeZ);

     foreground_extract <<< gridSize, blockSize >>>(d_inputImageRed,
                                                    d_inputImageGreen,
                                                    d_inputImageBlue,

                                                    d_outputImageRed,
                                                    d_outputImageGreen,
                                                    d_outputImageBlue,

                                                    xDim,yDim);


      cudaPrintfDisplay(stdout,true);
      cudaPrintfEnd();

      checkCudaErrors(cudaMemcpy(h_outputImageRed, d_outputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
      checkCudaErrors(cudaMemcpy(h_outputImageGreen, d_outputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
      checkCudaErrors(cudaMemcpy(h_outputImageBlue, d_outputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));

      //free gpu data
     checkCudaErrors( cudaFree(d_outputImageRed) );
     checkCudaErrors( cudaFree(d_outputImageGreen) );
     checkCudaErrors( cudaFree(d_outputImageBlue) );
     checkCudaErrors( cudaFree(d_inputImageRed) );
     checkCudaErrors( cudaFree(d_inputImageGreen) );
     checkCudaErrors( cudaFree(d_inputImageBlue) );

     //free host data
     free(h_outputImageRed);
     free(h_outputImageGreen);
     free(h_outputImageBlue);
     free(h_inputImageRed);
     free(h_inputImageGreen);
     free(h_inputImageBlue);



      while(true){}
      return 0;
}
#包括
#include/*printf、scanf、NULL*/
#包括/*calloc、退出、免费*/
#包括“cuPrintf.cu”
#包括“utils.h”
#包括/*时钟、时钟、每秒时钟*/
__全局\uuuuu无效前景\u提取(无符号字符*inputImageRed,
未签名字符*inputImageGreen,
未签名字符*inputImageBlue,
未签名字符*输出图像,
未签名字符*outputImageGreen,
未签名字符*outputImageBlue,
常量int xDim,
常数(整数)
{
cuPrintf(“打印某物”);
//x=列,y=行
//xDim=列尺寸,yDim=行尺寸
int x=threadIdx.x+blockIdx.x*blockDim.x;
int y=线程IDX.y+块IDX.y*块DIM.y;
int offset=x+y*blockDim.x*gridDim.x;
int nnodes=xDim*yDim;
如果(偏移量>=nnodes)返回;
//检验等式
outputImageRed[offset]=inputImageRed[offset];
outputImageGreen[offset]=inputImageGreen[offset];
outputImageBlue[偏移量]=inputImageBlue[偏移量];
cuPrintf(“在这里也打印一些东西\n”);
cuPrintf(“%d\n”,outputImageRed[offset]);
}
int main()
{
int-xDim=3;
int-yDim=3;
无符号字符*h_输入图像;
无符号字符*h_inputImageGreen;
无符号字符*h_inputImageBlue;
无符号字符*h_输出图像;
无符号字符*h_输出图像绿色;
无符号字符*h_outputImageBlue;
h_inputImageRed=(无符号字符*)calloc((xDim*yDim),sizeof(无符号字符));
h_inputImageGreen=(无符号字符*)calloc((xDim*yDim),sizeof(无符号字符));
h_inputImageBlue=(无符号字符*)calloc((xDim*yDim),sizeof(无符号字符));
h_outputImageRed=(无符号字符*)calloc((xDim*yDim),sizeof(无符号字符));
h_outputImageGreen=(无符号字符*)calloc((xDim*yDim),sizeof(无符号字符));
h_outputImageBlue=(无符号字符*)calloc((xDim*yDim),sizeof(无符号字符));
//仅启动输入
无符号字符init=0;
对于(int i=0;i(d_)输入图像,
d_inputImageGreen,
d_inputImageBlue,
d_输出图像,
d_outputImageGreen,
d_输出图像蓝色,
xDim,yDim);
cudaPrintfDisplay(标准输出,真);
cudaPrintfEnd();
检查CUDAERRORS(cudaMemcpy(h_outputImaged,d_outputImaged,(sizeof(unsigned char)*xDim*yDim),cudaMemcpyDeviceToHost));
检查CUDAERRORS(cudaMemcpy(h_outputImageGreen,d_outputImageGreen,(sizeof(未签名字符)*xDim*yDim),cudaMemcpyDeviceToHost));
检查CUDAERRORS(cudaMemcpy(h_outputImageBlue,d_outputImageBlue,(sizeof(未签名字符)*xDim*yDim),cudaMemcpyDeviceToHost));
//免费gpu数据
检查CUDAERRORS(cudaFree(d_输出图像));
检查CUDAERRORS(cudaFree(d_outputImageGreen));
检查CUDAERRORS(cudaFree(d_outputImageBlue));
检查CUDAERRORS(cudaFree(d_InputImaged));
检查CUDAERRORS(cudaFree(d_inputImageGreen));
检查CUDAERRORS(cudaFree(d_inputImageBlue));
//免费主机数据
免费(h_输出图像);
免费(h_outputImageGreen);
免费(h_outputImageBlue);
免费(h_输入图像);
免费(h_inputImageGreen);
免费(h_inputImageBlue);
while(true){}
返回0;
}

内核没有启动,这就是为什么内核中的printf没有输出。 如果您在内核启动时做得正确,您将发现这一点

内核启动返回的错误是配置参数无效

您正在
gridSize.x
gridSize.y
中传递无效值

如果您想查看它们是什么,请在调用内核之前将它们打印出来。(一般调试提示)

让我们看一看这一行,因为它没有做你认为的事情:

 int gridSizeX = ceil(float(xDim/8));
                              ^  ^
                              both values inside the parenthesis are *integers*

您尚未强制转换这些值(
xDim
8
)因此,主机编译器使用整数除法解析括号内的数量。3/8的整数除法为零。之后的值不会改变。仍然为零。

内核未启动,这就是为什么内核中的printf没有输出。 如果您在内核启动时做得正确,您将发现这一点

内核启动返回的错误是配置参数无效

您正在
gridSize.x
gridSize.y
中传递无效值

如果您想查看它们是什么,请在调用内核之前将它们打印出来。(一般调试提示)

让我们看一看这一行,因为它没有做你认为的事情:

 int gridSizeX = ceil(float(xDim/8));
                              ^  ^
                              both values inside the parenthesis are *integers*
您尚未将这些值(
xDim
8
)强制转换为
浮点值。因此,主机编译器使用整数除法解析括号内的数量。3/8的整数除法为零。之后该值不会改变。仍然为零。

谢谢。我不知道ab