C++ NVIDIA Visual profiler不生成时间线

C++ NVIDIA Visual profiler不生成时间线,c++,cuda,nvidia,C++,Cuda,Nvidia,我的问题与[在此之前在SO提出的][1]几乎相同。但是没有人回答这个问题,所以我要问另外一个问题 我正在Windows-7操作系统上使用CUDA 7.0工具包。我使用的是VS-2013 我试图生成矢量加法示例程序的时间轴,结果成功了。但当我按照完全相同的步骤生成自己代码的时间线时,它会不断显示一条消息“正在运行应用程序以生成时间线”。我知道内核会被调用,并且一切正常 cudaDeviceReset()完成所有与CUDA相关的操作后,调用也会出现 程序:我改变了原来的问题,提供了一个可以产生相同问

我的问题与[在此之前在SO提出的][1]几乎相同。但是没有人回答这个问题,所以我要问另外一个问题

我正在Windows-7操作系统上使用CUDA 7.0工具包。我使用的是VS-2013

我试图生成矢量加法示例程序的时间轴,结果成功了。但当我按照完全相同的步骤生成自己代码的时间线时,它会不断显示一条消息“正在运行应用程序以生成时间线”。我知道内核会被调用,并且一切正常

在完成所有与 CUDA 相关的操作之后,代码中也调用了 cudaDeviceReset()。

程序:我修改了原来的问题,提供了一个可以重现相同问题的最小工作示例。无论我把 cudaDeviceReset() 放在何处,nvvp 都无法为以下代码生成时间线:

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
//OpenCV
#包括
#包括
#包括
#包括
使用名称空间cv;
__全局无效颜色转换内核(int numChannels、int iw、int ih、unsigned char*ptr_源、unsigned char*ptr_dst)
{
//计算像素的位置
intx=(blockIdx.x*blockDim.x)+threadIdx.x;
inty=(blockIdx.y*blockDim.y)+threadIdx.y;
//仅当我们在正确的边界内时才进行操作
如果(x>=0&&x=0&&y

非常重要的线索:如果我注释掉 while (1) 这一行,使代码只运行一次,那么 nvvp 就会生成时间线。但在我最初的项目中,我无法通过这样做来获得时间线分析,因为它包含多线程和其他内容,在第一次运行期间没有要处理的图像。因此,我必须使用包含无限 while 循环的代码来生成时间线。

我的代码中的问题在于无限 while 循环,它导致 cudaDeviceReset() 从未被调用。处理这种情况有两种可能的解决方案:

  • 如果您只是想查看一次时间线分析,只需注释掉 while 循环,这样程序就能执行到末尾的 cudaDeviceReset(),nvvp 也就能生成时间线。

  • 也可能存在必须在程序中保留循环的情况。例如,在我包含多线程的原始项目中,while 循环最初的 180 次运行期间没有要处理的图像。要处理这种情况,请将 while 循环替换为只运行有限次数的 for 循环。例如,以下代码帮助我获得了 4 次运行的时间线分析。这里只贴出修改后的 main():

    intmain()
    {
    cudaStream\u t stream\u one;
    cudaStream_t stream_二;
    cudaStream_t stream_三;
    //而(1)
    对于(int i=0;i<4;i++)
    {
    cudaStreamCreate(&stream_one);
    cudaStreamCreate(&stream_-two);
    cudaStreamCreate(&stream_三);
    Mat image=imread(“DijSDK_test_image.jpg”,1);
    //Mat图像(108011920,CV_8UC3,标量(0,0255));
    size\u t numBytes=image.rows*image.cols*3;
    int numChannels=3;
    int iw=image.rows;
    int ih=image.cols;
    size\u t totalMemSize=numBytes*sizeof(无符号字符);
    大小\u t三分之一内存大小=总内存大小/3;
    未签名字符*dev_src_1、*dev_src_2、*dev_src_3、*dev_dst_1、*dev_dst_2、*dev_dst_3、*h_src、*h_dst;
    //在设备上为源和目标分配memomry,并获取它们的指针
    cudamaloc((void**)和dev_src_1(totalMemSize)/3);
    cudamaloc((void**)和dev_src_2(totalMemSize)/3);
    cudamaloc((void**)和dev_src_3(totalMemSize)/3);
    Cudamaloc((void**)和dev_dst_1(totalMemSize)/3);
    cudamaloc((void**)和dev_dst_2(totalMemSize)/3);
    cudamaloc((void**)和dev_dst_3(totalMemSize)/3);
    //获取处理后的图像
    Mat org_dijSDK_img(image.rows,image.cols,CV_8UC3,标量(0,025));
    h_dst=org_dijSDK_img.data;
    //将映像的新数据复制到主机指针
    h_src=image.data;
    //将源映像复制到设备(即GPU)
    cudaMemcpyAsync(dev_src_1,h_src,(totalMemSize)
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    //OpenCV
    #include <opencv2/highgui.hpp>
    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>
    
    #include <stdio.h>
    
    using namespace cv;
    
    /*
     * Copies an interleaved image from ptr_source to ptr_dst, one pixel per thread.
     *
     * Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel (x, y).
     * Threads that fall outside the iw x ih image exit without touching memory,
     * so the grid may be larger than the image.
     *
     * numChannels : number of interleaved bytes per pixel
     * iw, ih      : image width and height in pixels
     * ptr_source  : device buffer holding at least iw * ih * numChannels bytes
     * ptr_dst     : device buffer of the same size that receives the copy
     *
     * All numChannels bytes of a pixel are copied. The previous version always
     * copied exactly three bytes, which wrote past the end of the buffers when
     * numChannels < 3 and skipped channels when numChannels > 3.
     */
    __global__ void colorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
    {
        // Calculate our pixel's location
        const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const int y = (blockIdx.y * blockDim.y) + threadIdx.y;

        // Operate only if we are in the correct boundaries
        if (x < 0 || x >= iw || y < 0 || y >= ih)
        {
            return;
        }

        // Offset of the pixel's first byte, computed once in size_t so that large
        // images cannot overflow 32-bit int arithmetic.
        const size_t pixelBase = (static_cast<size_t>(iw) * y + x) * numChannels;

        for (int c = 0; c < numChannels; ++c)
        {
            ptr_dst[pixelBase + c] = ptr_source[pixelBase + c];
        }
    }
    
    // Reports a failed CUDA runtime call on stderr.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaCallSucceeded(cudaError_t status, const char *what)
    {
        if (status == cudaSuccess)
        {
            return true;
        }
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    /*
     * Repeatedly uploads a solid 400x400 BGR test image, runs
     * colorTransformation_kernel on it, downloads the result and displays it.
     *
     * The loop runs until ESC is pressed in the display window (or a CUDA call
     * fails). Leaving the loop matters: cudaDeviceReset() at the end is only
     * reached when the loop terminates, and the original `while (1)` without an
     * exit meant it was never called.
     *
     * Returns 0 on a clean exit, 1 if any CUDA call failed.
     */
    int main()
    {
        const int numChannels = 3;

        // The source image and the display image never change size, so they are
        // created once instead of on every iteration.
        Mat image(400, 400, CV_8UC3, Scalar(0, 0, 255));
        Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3);
        unsigned char *h_src = image.data;
        unsigned char *h_dst = org_dijSDK_img.data;
        const size_t numBytes = (size_t)image.rows * image.cols * numChannels * sizeof(unsigned char);

        // Allocate memory on the device for SOURCE and DESTINATION once.
        // Allocating inside the loop without cudaFree leaked both buffers on
        // every frame.
        unsigned char *dev_src = NULL;
        unsigned char *dev_dst = NULL;
        bool ok = cudaCallSucceeded(cudaMalloc((void**)&dev_src, numBytes), "cudaMalloc(dev_src)")
               && cudaCallSucceeded(cudaMalloc((void**)&dev_dst, numBytes), "cudaMalloc(dev_dst)");

        // One thread per pixel; the channels are handled inside the kernel, so the
        // grid does not need to be scaled by the channel count. Ceil-division
        // covers image sizes that are not a multiple of the block size; the
        // kernel's bounds check discards the surplus threads.
        const dim3 numOfThreadsPerBlocks(20, 20);
        const dim3 numOfBlocks((image.cols + numOfThreadsPerBlocks.x - 1) / numOfThreadsPerBlocks.x,
                               (image.rows + numOfThreadsPerBlocks.y - 1) / numOfThreadsPerBlocks.y);

        while (ok)
        {
            // Copy the source image to the device i.e. GPU
            ok = cudaCallSucceeded(cudaMemcpy(dev_src, h_src, numBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");
            if (!ok)
            {
                break;
            }

            // KERNEL
            colorTransformation_kernel<<<numOfBlocks, numOfThreadsPerBlocks>>>(numChannels, image.cols, image.rows, dev_src, dev_dst);

            // cudaGetLastError reports launch-configuration errors; the
            // synchronize call reports faults raised while the kernel ran.
            ok = cudaCallSucceeded(cudaGetLastError(), "kernel launch")
              && cudaCallSucceeded(cudaDeviceSynchronize(), "kernel execution")
              && cudaCallSucceeded(cudaMemcpy(h_dst, dev_dst, numBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
            if (!ok)
            {
                break;
            }

            // DISPLAY PROCESSED IMAGE
            imshow("Processed dijSDK image", org_dijSDK_img);

            // ESC (key code 27) ends the loop so the cleanup below can run.
            if ((waitKey(33) & 0xFF) == 27)
            {
                break;
            }
        }

        // cudaFree accepts a null pointer, so this is safe even if allocation failed.
        cudaFree(dev_src);
        cudaFree(dev_dst);

        cudaDeviceReset();
        return ok ? 0 : 1;
    }
    
    // Reports a failed CUDA runtime call on stderr.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool streamedCudaCallOk(cudaError_t status, const char *what)
    {
        if (status == cudaSuccess)
        {
            return true;
        }
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    /*
     * Processes "DijSDK_test_image.jpg" four times, splitting each frame into
     * three equal byte ranges that are uploaded, processed and downloaded on
     * three separate CUDA streams, then displays the result.
     *
     * The loop is bounded (instead of `while (1)`) so that execution reaches
     * cudaDeviceReset() at the end.
     *
     * Returns 0 on success, 1 if the image could not be read or a CUDA call failed.
     */
    int main()
    {
        cudaStream_t stream_one;
        cudaStream_t stream_two;
        cudaStream_t stream_three;

        // Create the streams once. Creating them inside the loop without a
        // matching cudaStreamDestroy leaked three streams per iteration.
        bool ok = streamedCudaCallOk(cudaStreamCreate(&stream_one), "cudaStreamCreate(stream_one)")
               && streamedCudaCallOk(cudaStreamCreate(&stream_two), "cudaStreamCreate(stream_two)")
               && streamedCudaCallOk(cudaStreamCreate(&stream_three), "cudaStreamCreate(stream_three)");
        if (!ok)
        {
            // Some stream handles may be uninitialised here, so rely on the
            // device reset to clean up rather than destroying them one by one.
            cudaDeviceReset();
            return 1;
        }

        //while (1)
        for (int i = 0; ok && i < 4; i++)
        {
            Mat image = imread("DijSDK_test_image.jpg", 1);
            //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
            if (image.empty())
            {
                // Without this check a missing file gives a zero-size image and
                // a null host pointer for the copies below.
                fprintf(stderr, "Could not read DijSDK_test_image.jpg\n");
                ok = false;
                break;
            }

            const size_t numBytes = (size_t)image.rows * image.cols * 3;
            int numChannels = 3;

            // NOTE(review): iw is taken from rows and ih from cols, which looks
            // swapped for a "width"/"height" pair. callMultiStreamingCudaKernel is
            // not visible here, so the original assignment is kept - confirm.
            int iw = image.rows;
            int ih = image.cols;
            const size_t totalMemSize = numBytes * sizeof(unsigned char);
            const size_t oneThirdMemSize = totalMemSize / 3;

            unsigned char *dev_src_1 = NULL, *dev_src_2 = NULL, *dev_src_3 = NULL;
            unsigned char *dev_dst_1 = NULL, *dev_dst_2 = NULL, *dev_dst_3 = NULL;

            // Allocate memory on the device for SOURCE and DESTINATION and get their pointers
            ok = streamedCudaCallOk(cudaMalloc((void**)&dev_src_1, oneThirdMemSize), "cudaMalloc(dev_src_1)")
              && streamedCudaCallOk(cudaMalloc((void**)&dev_src_2, oneThirdMemSize), "cudaMalloc(dev_src_2)")
              && streamedCudaCallOk(cudaMalloc((void**)&dev_src_3, oneThirdMemSize), "cudaMalloc(dev_src_3)")
              && streamedCudaCallOk(cudaMalloc((void**)&dev_dst_1, oneThirdMemSize), "cudaMalloc(dev_dst_1)")
              && streamedCudaCallOk(cudaMalloc((void**)&dev_dst_2, oneThirdMemSize), "cudaMalloc(dev_dst_2)")
              && streamedCudaCallOk(cudaMalloc((void**)&dev_dst_3, oneThirdMemSize), "cudaMalloc(dev_dst_3)");

            // Host image that receives the processed result
            Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
            unsigned char *h_dst = org_dijSDK_img.data;
            unsigned char *h_src = image.data;

            // Copy the source image to the device i.e. GPU, one third per stream.
            // NOTE(review): the host buffers belong to cv::Mat and are not allocated
            // with cudaMallocHost, so these copies may not actually overlap - confirm
            // in the profiler timeline.
            ok = ok
              && streamedCudaCallOk(cudaMemcpyAsync(dev_src_1, h_src, oneThirdMemSize, cudaMemcpyHostToDevice, stream_one), "upload on stream_one")
              && streamedCudaCallOk(cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, oneThirdMemSize, cudaMemcpyHostToDevice, stream_two), "upload on stream_two")
              && streamedCudaCallOk(cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), oneThirdMemSize, cudaMemcpyHostToDevice, stream_three), "upload on stream_three");

            if (ok)
            {
                //KERNEL--stream-1
                callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);
                //KERNEL--stream-2
                callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);
                //KERNEL--stream-3
                callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);

                ok = streamedCudaCallOk(cudaGetLastError(), "kernel launch");
            }

            //RESULT copy: GPU to CPU
            ok = ok
              && streamedCudaCallOk(cudaMemcpyAsync(h_dst, dev_dst_1, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_one), "download on stream_one")
              && streamedCudaCallOk(cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_two), "download on stream_two")
              && streamedCudaCallOk(cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_three), "download on stream_three");

            // wait for results
            ok = ok
              && streamedCudaCallOk(cudaStreamSynchronize(stream_one), "cudaStreamSynchronize(stream_one)")
              && streamedCudaCallOk(cudaStreamSynchronize(stream_two), "cudaStreamSynchronize(stream_two)")
              && streamedCudaCallOk(cudaStreamSynchronize(stream_three), "cudaStreamSynchronize(stream_three)");

            if (ok)
            {
                // org_dijSDK_img already owns h_dst, so no reassignment of .data is needed.
                //DISPLAY PROCESSED IMAGE
                imshow("Processed dijSDK image", org_dijSDK_img);
                waitKey(33);
            }

            // Release this frame's device buffers on every path; they were
            // previously leaked on each iteration. cudaFree accepts null pointers.
            cudaFree(dev_src_1);
            cudaFree(dev_src_2);
            cudaFree(dev_src_3);
            cudaFree(dev_dst_1);
            cudaFree(dev_dst_2);
            cudaFree(dev_dst_3);
        }

        cudaStreamDestroy(stream_one);
        cudaStreamDestroy(stream_two);
        cudaStreamDestroy(stream_three);

        cudaDeviceReset();
        return ok ? 0 : 1;
    }