C++ 基于计算机视觉算法的CUDA性能_C++_C_Performance_Opencv_Cuda

C++ 基于计算机视觉算法的CUDA性能

c++ c performance opencv cuda

C++ 基于计算机视觉算法的CUDA性能,c++,c,performance,opencv,cuda,C++,C,Performance,Opencv,Cuda,我在CUDA C编程世界中迈出了第一步作为第一个测试，我编写了一个简单的算法来对图像进行灰度转换和阈值化（我是计算机视觉和OpenCV的粉丝！）。我决定将我的CUDA性能结果与CPU上的类似算法以及相应的OpenCV（CPU）函数进行比较。以下是全高清视频的结果： Frame Count: 4754 Frame Resolution: 1920x1080 Total time CPU: 67418.6 ms Frame Avg CPU: 14.1814 ms Frame Count:

我在CUDA C编程世界中迈出了第一步

作为第一个测试，我编写了一个简单的算法来对图像进行灰度转换和阈值化（我是计算机视觉和OpenCV的粉丝！）。我决定将我的CUDA性能结果与CPU上的类似算法以及相应的OpenCV（CPU）函数进行比较。以下是全高清视频的结果：

Frame Count: 4754
Frame Resolution: 1920x1080
Total time CPU: 67418.6 ms
Frame Avg CPU:  14.1814 ms

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23805.3 ms
Frame Avg OpenCV:  5.00742 ms

Frame Count: 4754
Frame Resolution: 1920x1080
==6149== NVPROF is profiling process 6149, command: ./OpenCV_test
Total time CUDA: 28018.2 ms
Frame Avg CUDA:  5.89361 ms

==6149== Profiling application: ./OpenCV_test
==6149== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
55.45%  4.05731s      4754  853.45us  849.54us  1.1141ms doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int)
34.03%  2.49028s      4754  523.83us  513.67us  1.3338ms  [CUDA memcpy HtoD]
10.52%  769.46ms      4754  161.85us  161.15us  301.06us  [CUDA memcpy DtoH]

==6149== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 80.11%  8.19501s      9508  861.91us  490.81us  2.7719ms  cudaMemcpy
 12.82%  1.31106s      9508  137.89us  66.639us  218.56ms  cudaMalloc
  5.74%  587.05ms      9508  61.742us  39.566us  2.0234ms  cudaFree
  1.21%  124.16ms      4754  26.116us  16.990us  365.86us  cudaLaunch
  0.06%  5.7645ms     23770     242ns      97ns  106.27us  cudaSetupArgument
  0.05%  5.4291ms      4754  1.1410us     602ns  10.150us  cudaConfigureCall
  0.01%  594.89us        83  7.1670us     249ns  282.44us  cuDeviceGetAttribute
  0.00%  45.536us         1  45.536us  45.536us  45.536us  cuDeviceTotalMem
  0.00%  35.649us         1  35.649us  35.649us  35.649us  cuDeviceGetName
  0.00%  1.8960us         2     948ns     345ns  1.5510us  cuDeviceGetCount
  0.00%     892ns         2     446ns     255ns     637ns  cuDeviceGet

正如您所看到的，OpenCV比我的cpu实现和Cuda算法要好得多！诀窍在哪里？我怀疑OpenCV使用了一些特殊的cpu硬件指令集。我对CUDA的期望更高：人们谈论原始图像处理中的20-30倍加速！我错过了什么

以下是有关我的系统配置的一些详细信息：

Cpu Intel Core i7 5820k@4ghz
GeForce GTX 970
Linux Mint 17.2 Mate 64位
驱动程序nVidia 352.55
Cuda工具包7.5.18

以下是有关我的OpenCV 3.0版本的一些信息：

Cuda已启用
OpenCL禁用
TBB已禁用（尝试强制单线程cpu执行）
启用英特尔IPP

以下是为测试执行的代码：

#include <iostream>
#include <numeric>
#include <string>
#include <stdlib.h>
#include <chrono>

#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;
using namespace std::chrono;

const char* file = "PATH TO A VIDEO FILE";

__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal)
{
    uint i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < inputSize)
    {
        output[i] = 0.5f + ((bgrInput[3 * i] + bgrInput[3 * i + 1] + bgrInput[3 * i + 2]) / 3.0f); // gray conversion
        output[i] = output[i] > soglia ? maxVal : 0; // thresholding
    }

}


void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal)
{
    if (mat.type() == CV_8UC3)
    {
        uint size = mat.rows * mat.cols;
        uint blockSize = 128; // no significant result varying this variable
        uint gridSize = ceil(size/(float)blockSize);
        uchar* d_bgrInput, *d_output;
        cudaMalloc((void**)&d_bgrInput, mat.channels() * size);
        cudaMalloc((void**)&d_output, size);
        cudaMemcpy(d_bgrInput, mat.data, mat.channels() * size, cudaMemcpyHostToDevice);

        doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, size, soglia, maxVal);

        result = Mat(mat.rows, mat.cols, CV_8UC1);
        cudaMemcpy(result.data, d_output, size, cudaMemcpyDeviceToHost);
        cudaFree(d_bgrInput);
        cudaFree(d_output);
    }
    else
        cerr << "Only CV_8UC3 matrix supported" << endl;

}

void cpuCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal)
{
    if (mat.type() == CV_8UC3)
    {
        uint size = mat.rows * mat.cols;
        result = Mat(mat.rows, mat.cols, CV_8UC1);
        uchar* input = mat.data;
        uchar* output = result.data;
        for (uint i = 0; i < size; ++i)
        {
            output[i] = 0.5f + ((input[3 * i] + input[3 * i + 1] + input[3 * i + 2]) / 3.0f); // gray conversion
            output[i] = output[i] > soglia ? maxVal : 0; // thresholding
        }
    }
    else
        cerr << "Only CV_8UC3 matrix supported" << endl;
}

void cudaTest(const string src)
{
    VideoCapture cap(src);
    Mat frame, result;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    while (not frame.empty()) {
        cudaCvtThreshold(frame, result, 127, 255);
        cap >> frame;
    }

    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time CUDA: " << elapsed << " ms" << endl;
    cout << "Frame Avg CUDA:  " << elapsed / frameCount << " ms" << endl << endl;
}

void naiveCpu(const string src)
{
    VideoCapture cap(src);
    Mat frame, result;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    while (not frame.empty()) {
        cpuCvtThreshold(frame, result, 127, 255);
        cap >> frame;
    }
    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time CPU: " << elapsed << " ms" << endl;
    cout << "Frame Avg CPU:  " << elapsed / frameCount << " ms" << endl << endl;
}


void opencv(const string src)
{
    VideoCapture cap(src);
    Mat frame, result;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    while (not frame.empty()) {
        cv::cvtColor(frame, result, COLOR_BGR2GRAY);
        threshold(result, result, 127, 255, THRESH_BINARY);
        cap >> frame;
    }
    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time OpenCV: " << elapsed << " ms" << endl;
    cout << "Frame Avg OpenCV:  " << elapsed / frameCount << " ms" << endl << endl;
}

int main(void)
{
    naiveCpu(file);
    opencv(file);
    cudaTest(file);
    return 0;
}

单malloc和free的性能更好，但改进很小

编辑2：

根据Jez的建议，我修改了Cuda内核，以便在每个GPU线程内处理多个像素（以下执行中为8个）：

下面是修改后的代码：

__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal, uint pixelPerThread)
{
    uint i = pixelPerThread * (blockIdx.x * blockDim.x + threadIdx.x);

    if (i < inputSize)
    {
        for (uint j = 0; j < pixelPerThread; j++) {
            uchar grayPix = 0.5f + ( (bgrInput[3 * (i + j)] + bgrInput[3 * (i + j) + 1] + bgrInput[3 * (i + j) + 2]) / 3.0f ); // gray conversion
            output[i + j] = grayPix > soglia ? maxVal : 0; // thresholding
        }
    }
}

void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal, uchar* d_bgrInput, uchar* d_output)
{
    uint size = mat.rows * mat.cols;
    uint pixelPerThread = 8;
    uint blockSize = 128; // no significant result varying this variable
    uint gridSize = ceil(size/(float)(blockSize * pixelPerThread));
    doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, size, soglia, maxVal, pixelPerThread);
}

请注意，内核执行的平均时间现在是664,39 us，而不是792,26 us 不错！：-）但是OpenCV（使用Intel IPP）仍然更快

编辑3：我在没有IPP和各种SSE指令的情况下重新编译了OpenCV。OpenCV的性能似乎是一样的

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23541.7 ms
Frame Avg OpenCV:  4.95198 ms

这里发生了两件事

日常开支您将花费大约一半的GPU时间向GPU分配内存和从GPU复制内存。CPU-GPU连接是一个相对较慢的链路，与数据在GPU上开始和结束并且内存分配一次的情况相比，性能会直接减半。在这里，您可以做一些事情来提供帮助，例如将分配移到循环之外，并将一帧的数据传输与下一帧的计算重叠，但是copy->execute->copy的模式很少产生很好的运行时，除非执行非常复杂

内核您的内核应该是内存受限的。您（理想情况下）每线程移动4个字节，大约有200万个线程（像素），运行时间为853us，大约为10GB/s。GTX 970的峰值为224GB/s。你离这里很远

这里的问题是，您正在执行8位事务。这种情况下的解决方案是使用共享内存。如果在内核开始时以高性能方式将数据加载到共享内存中（例如，将指针强制转换到int4s，确保对齐），则可以从该内存中读取数据，然后以每个线程32+位的速度写回。这意味着您必须在一个线程中处理多个像素，但这不是问题

另一种解决方案是找到一个库来执行此操作，例如，它涵盖了许多与图像相关的任务，可能比手写代码更快

有了一个好的内存访问模式，我希望这个内核的速度会提高10倍以上。根据阿姆达尔定律，一旦你这样做了，你将被开销所支配，因此除非你能摆脱它们，否则运行时间只会快2倍。

你可以做的第一个优化是删除

cudaCvtThreshold

中的冗余内存分配和删除。只需在

cudaTest

函数中执行单个设备内存分配，并将其用于后续的

cudaCvtThreshold

调用。此外，opencv在其原语中使用CPU向量指令，如SSE、SSE2、AVX等，这是其速度的原因之一。在内核中，您可以使用寄存器存储灰度转换的结果，然后设置阈值，这样现在全局内存只需写入一次，减少了1次全局读取和1次全局写入的开销。（虽然gpu可以缓存内存访问，但仍然值得一试）在这种情况下，编译器将避免从全局执行中间写/读操作。尽管如此，这仍然是一个很好的建议，因为编译器不能总是这样做，特别是如果没有提供别名信息，那么简单的回答是CPU和CUDA代码都很差。据我所知，您认为幼稚的CPU实现会很慢。对于CUDA，您不能仅仅在内核中移动代码，就指望它会神奇地工作得更快。并行编程需要完全不同的思维、数据布局和模式。甚至没有提到你的malloc和内核在循环中的启动。@sgarizvi：我决定在函数cudacvtsthreshold中执行malloc和free，以便有一个函数可以直接用于Mat对象。但是，我将发布与外部malloc和free相同的测试。我还将在Cuda内核中进行一次内存访问，但我认为这不会导致显著的性能损失。

__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal, uint pixelPerThread)
{
    uint i = pixelPerThread * (blockIdx.x * blockDim.x + threadIdx.x);

    if (i < inputSize)
    {
        for (uint j = 0; j < pixelPerThread; j++) {
            uchar grayPix = 0.5f + ( (bgrInput[3 * (i + j)] + bgrInput[3 * (i + j) + 1] + bgrInput[3 * (i + j) + 2]) / 3.0f ); // gray conversion
            output[i + j] = grayPix > soglia ? maxVal : 0; // thresholding
        }
    }
}

void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal, uchar* d_bgrInput, uchar* d_output)
{
    uint size = mat.rows * mat.cols;
    uint pixelPerThread = 8;
    uint blockSize = 128; // no significant result varying this variable
    uint gridSize = ceil(size/(float)(blockSize * pixelPerThread));
    doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, size, soglia, maxVal, pixelPerThread);
}

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23628.8 ms
Frame Avg OpenCV:  4.97031 ms

Frame Count: 4754
Frame Resolution: 1920x1080
==13441== NVPROF is profiling process 13441, command: ./OpenCV_test
Total time CUDA (out malloc-free): 25655.5 ms
Frame Avg CUDA (out malloc-free):  5.39662 ms

==13441== Profiling application: ./OpenCV_test
==13441== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 49.30%  3.15853s      4754  664.39us  658.24us  779.04us  doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int)
 38.69%  2.47838s      4754  521.32us  513.35us  870.69us  [CUDA memcpy HtoD]
 12.01%  769.53ms      4754  161.87us  161.31us  200.58us  [CUDA memcpy DtoH]

==13441== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 95.78%  7.26387s      9508  763.97us  491.11us  1.6589ms  cudaMemcpy
  2.51%  190.70ms         2  95.350ms  82.529us  190.62ms  cudaMalloc
  1.53%  116.31ms      4754  24.465us  16.844us  286.56us  cudaLaunch
  0.09%  6.7052ms     28524     235ns      98ns  233.19us  cudaSetupArgument
  0.08%  5.9538ms      4754  1.2520us     642ns  12.039us  cudaConfigureCall
  0.00%  263.87us        83  3.1790us     225ns  111.03us  cuDeviceGetAttribute
  0.00%  174.45us         2  87.227us  52.521us  121.93us  cudaFree
  0.00%  34.612us         1  34.612us  34.612us  34.612us  cuDeviceTotalMem
  0.00%  29.376us         1  29.376us  29.376us  29.376us  cuDeviceGetName
  0.00%  1.6950us         2     847ns     343ns  1.3520us  cuDeviceGetCount
  0.00%     745ns         2     372ns     217ns     528ns  cuDeviceGet

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23541.7 ms
Frame Avg OpenCV:  4.95198 ms