C++ 基于计算机视觉算法的CUDA性能
标签:c++, c, performance, opencv, cuda
我在CUDA C编程世界中迈出了第一步。作为第一个测试,我编写了一个简单的算法来对图像进行灰度转换和阈值化(我是计算机视觉和OpenCV的粉丝!)。我决定将我的CUDA性能结果与CPU上的类似算法以及相应的OpenCV(CPU)函数进行比较。以下是全高清视频的结果:
Frame Count: 4754
Frame Resolution: 1920x1080
Total time CPU: 67418.6 ms
Frame Avg CPU: 14.1814 ms
Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23805.3 ms
Frame Avg OpenCV: 5.00742 ms
Frame Count: 4754
Frame Resolution: 1920x1080
==6149== NVPROF is profiling process 6149, command: ./OpenCV_test
Total time CUDA: 28018.2 ms
Frame Avg CUDA: 5.89361 ms
==6149== Profiling application: ./OpenCV_test
==6149== Profiling result:
Time(%) Time Calls Avg Min Max Name
55.45% 4.05731s 4754 853.45us 849.54us 1.1141ms doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int)
34.03% 2.49028s 4754 523.83us 513.67us 1.3338ms [CUDA memcpy HtoD]
10.52% 769.46ms 4754 161.85us 161.15us 301.06us [CUDA memcpy DtoH]
==6149== API calls:
Time(%) Time Calls Avg Min Max Name
80.11% 8.19501s 9508 861.91us 490.81us 2.7719ms cudaMemcpy
12.82% 1.31106s 9508 137.89us 66.639us 218.56ms cudaMalloc
5.74% 587.05ms 9508 61.742us 39.566us 2.0234ms cudaFree
1.21% 124.16ms 4754 26.116us 16.990us 365.86us cudaLaunch
0.06% 5.7645ms 23770 242ns 97ns 106.27us cudaSetupArgument
0.05% 5.4291ms 4754 1.1410us 602ns 10.150us cudaConfigureCall
0.01% 594.89us 83 7.1670us 249ns 282.44us cuDeviceGetAttribute
0.00% 45.536us 1 45.536us 45.536us 45.536us cuDeviceTotalMem
0.00% 35.649us 1 35.649us 35.649us 35.649us cuDeviceGetName
0.00% 1.8960us 2 948ns 345ns 1.5510us cuDeviceGetCount
0.00% 892ns 2 446ns 255ns 637ns cuDeviceGet
正如您所看到的,OpenCV比我的cpu实现和Cuda算法要好得多!诀窍在哪里?我怀疑OpenCV使用了一些特殊的cpu硬件指令集。
我对CUDA的期望更高:人们谈论原始图像处理中的20-30倍加速!我错过了什么?
以下是有关我的系统配置的一些详细信息:
- Cpu Intel Core i7 5820k@4ghz
- GeForce GTX 970
- Linux Mint 17.2 Mate 64位
- 驱动程序nVidia 352.55
- Cuda工具包7.5.18
- Cuda已启用
- OpenCL禁用
- TBB已禁用(尝试强制单线程cpu执行)
- 启用英特尔IPP
#include <iostream>
#include <numeric>
#include <string>
#include <stdlib.h>
#include <chrono>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
using namespace std::chrono;
const char* file = "PATH TO A VIDEO FILE";
// Gray-converts and thresholds one pixel per thread.
//
// Launch layout: 1D grid of 1D blocks with at least `inputSize` threads in
// total; surplus threads exit immediately. No shared memory is used.
//
//   bgrInput  - 3 interleaved 8-bit channels per pixel (3 * inputSize bytes are read)
//   output    - one byte per pixel (inputSize bytes are written)
//   inputSize - number of pixels
//   soglia    - threshold; a pixel whose rounded channel average is strictly
//               greater than this value becomes maxVal, any other becomes 0
//   maxVal    - value written for pixels above the threshold (narrowed to 8 bits)
__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal)
{
    // Index in size_t so that neither the thread index nor 3 * i wraps in 32 bits.
    const size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= inputSize)
        return;

    const uchar* pixel = bgrInput + 3 * i;

    // Keep the gray value in a local variable: the output element is written
    // exactly once instead of being stored, read back and stored again.
    const uchar gray = static_cast<uchar>(0.5f + (pixel[0] + pixel[1] + pixel[2]) / 3.0f); // rounded average
    output[i] = static_cast<uchar>(gray > soglia ? maxVal : 0u); // thresholding
}
// Gray-converts and thresholds a CV_8UC3 image on the GPU.
//
// Allocates two device buffers, uploads the pixels, launches doThreshold with
// one thread per pixel, downloads the single-channel result and frees the
// buffers again.
//
//   mat    - input image; anything other than CV_8UC3 is rejected with a message on cerr
//   result - receives a rows x cols CV_8UC1 image; left untouched if any CUDA call fails
//   soglia - threshold forwarded to the kernel
//   maxVal - value forwarded to the kernel for pixels above the threshold
//
// Every CUDA failure is reported on cerr; device memory is released on all paths.
void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal)
{
    if (mat.type() != CV_8UC3)
    {
        cerr << "Only CV_8UC3 matrix supported" << endl;
        return;
    }

    // The upload below copies one dense byte range, so a Mat with row padding
    // (e.g. a ROI view) is first cloned into a continuous buffer.
    const Mat src = mat.isContinuous() ? mat : mat.clone();

    const size_t pixelCount = static_cast<size_t>(src.rows) * src.cols;
    if (pixelCount == 0)
    {
        // Nothing to process; a launch with a zero-sized grid would be rejected.
        result = Mat(src.rows, src.cols, CV_8UC1);
        return;
    }
    const size_t inputBytes = pixelCount * src.channels();

    const uint blockSize = 128; // no significant result varying this variable
    // Integer ceil-division: exact for every size, unlike rounding through float.
    const uint gridSize = static_cast<uint>((pixelCount + blockSize - 1) / blockSize);

    // Reports a failed CUDA call on cerr and tells the caller whether to go on.
    auto succeeded = [](cudaError_t status, const char* what) -> bool
    {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
        return false;
    };

    uchar* d_bgrInput = nullptr;
    uchar* d_output = nullptr;

    bool ok = succeeded(cudaMalloc((void**)&d_bgrInput, inputBytes), "cudaMalloc(input)")
           && succeeded(cudaMalloc((void**)&d_output, pixelCount), "cudaMalloc(output)")
           && succeeded(cudaMemcpy(d_bgrInput, src.data, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if (ok)
    {
        doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, static_cast<uint>(pixelCount), soglia, maxVal);
        // A kernel launch returns nothing; configuration errors must be queried explicitly.
        ok = succeeded(cudaGetLastError(), "doThreshold launch");
    }

    if (ok)
    {
        Mat gray(src.rows, src.cols, CV_8UC1);
        // The blocking copy also reports errors raised while the kernel was running.
        if (succeeded(cudaMemcpy(gray.data, d_output, pixelCount, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)"))
            result = gray;
    }

    // cudaFree accepts a null pointer, so this is safe after a partial failure.
    cudaFree(d_bgrInput);
    cudaFree(d_output);
}
// Single-threaded CPU reference: gray-converts and thresholds a CV_8UC3 image.
//
//   mat    - input image; anything other than CV_8UC3 is rejected with a message on cerr
//   result - receives a rows x cols CV_8UC1 image
//   soglia - threshold; a pixel whose rounded channel average is strictly
//            greater than this value becomes maxVal, any other becomes 0
//   maxVal - value written for pixels above the threshold (narrowed to 8 bits)
void cpuCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal)
{
    if (mat.type() != CV_8UC3)
    {
        cerr << "Only CV_8UC3 matrix supported" << endl;
        return;
    }

    // Hold a reference to the input pixels before replacing `result`, so the
    // call stays valid even when `mat` and `result` are the same Mat.
    const Mat src = mat;
    result = Mat(src.rows, src.cols, CV_8UC1);

    // Walk the image row by row through Mat::ptr so that row padding
    // (non-continuous Mats such as ROI views) is handled correctly.
    for (int row = 0; row < src.rows; ++row)
    {
        const uchar* in = src.ptr<uchar>(row);
        uchar* out = result.ptr<uchar>(row);
        for (int col = 0; col < src.cols; ++col, in += 3)
        {
            const uchar gray = static_cast<uchar>(0.5f + (in[0] + in[1] + in[2]) / 3.0f); // rounded average
            out[col] = static_cast<uchar>(gray > soglia ? maxVal : 0u); // thresholding
        }
    }
}
// Runs cudaCvtThreshold (threshold 127, max value 255) over every frame of the
// video `src` and prints the total and per-frame wall-clock time.
//
// The timed window starts before the first frame is read and contains every
// `cap >> frame`, so the reported figures include video decoding as well as
// the GPU work.
void cudaTest(const string src)
{
    VideoCapture cap(src);
    if (!cap.isOpened())
    {
        cerr << "Cannot open video: " << src << endl;
        return;
    }

    Mat frame, result;

    // The reported count may be non-positive; clamp it before converting to an
    // unsigned value and use it for display only.
    const double reportedCount = cap.get(CAP_PROP_FRAME_COUNT);
    const uint frameCount = reportedCount > 0 ? static_cast<uint>(reportedCount) : 0;
    cout << "Frame Count: " << frameCount << endl;

    // steady_clock is monotonic, so the measurement cannot be disturbed by
    // wall-clock adjustments.
    const auto startTs = chrono::steady_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    uint processedFrames = 0;
    while (!frame.empty())
    {
        cudaCvtThreshold(frame, result, 127, 255);
        ++processedFrames;
        cap >> frame;
    }
    const auto stopTs = chrono::steady_clock::now();

    const double elapsed = chrono::duration_cast<chrono::microseconds>(stopTs - startTs).count() / 1e3;
    // Average over the frames actually processed, guarding against an empty video.
    const double average = processedFrames != 0 ? elapsed / processedFrames : 0.0;
    cout << "Total time CUDA: " << elapsed << " ms" << endl;
    cout << "Frame Avg CUDA: " << average << " ms" << endl << endl;
}
// Runs cpuCvtThreshold (threshold 127, max value 255) over every frame of the
// video `src` and prints the total and per-frame wall-clock time.
//
// The timed window starts before the first frame is read and contains every
// `cap >> frame`, so the reported figures include video decoding as well as
// the pixel processing.
void naiveCpu(const string src)
{
    VideoCapture cap(src);
    if (!cap.isOpened())
    {
        cerr << "Cannot open video: " << src << endl;
        return;
    }

    Mat frame, result;

    // The reported count may be non-positive; clamp it before converting to an
    // unsigned value and use it for display only.
    const double reportedCount = cap.get(CAP_PROP_FRAME_COUNT);
    const uint frameCount = reportedCount > 0 ? static_cast<uint>(reportedCount) : 0;
    cout << "Frame Count: " << frameCount << endl;

    // steady_clock is monotonic, so the measurement cannot be disturbed by
    // wall-clock adjustments.
    const auto startTs = chrono::steady_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    uint processedFrames = 0;
    while (!frame.empty())
    {
        cpuCvtThreshold(frame, result, 127, 255);
        ++processedFrames;
        cap >> frame;
    }
    const auto stopTs = chrono::steady_clock::now();

    const double elapsed = chrono::duration_cast<chrono::microseconds>(stopTs - startTs).count() / 1e3;
    // Average over the frames actually processed, guarding against an empty video.
    const double average = processedFrames != 0 ? elapsed / processedFrames : 0.0;
    cout << "Total time CPU: " << elapsed << " ms" << endl;
    cout << "Frame Avg CPU: " << average << " ms" << endl << endl;
}
// Runs OpenCV's cvtColor(COLOR_BGR2GRAY) followed by threshold(127, 255,
// THRESH_BINARY) over every frame of the video `src` and prints the total and
// per-frame wall-clock time.
//
// The timed window starts before the first frame is read and contains every
// `cap >> frame`, so the reported figures include video decoding as well as
// the pixel processing.
void opencv(const string src)
{
    VideoCapture cap(src);
    if (!cap.isOpened())
    {
        cerr << "Cannot open video: " << src << endl;
        return;
    }

    Mat frame, result;

    // The reported count may be non-positive; clamp it before converting to an
    // unsigned value and use it for display only.
    const double reportedCount = cap.get(CAP_PROP_FRAME_COUNT);
    const uint frameCount = reportedCount > 0 ? static_cast<uint>(reportedCount) : 0;
    cout << "Frame Count: " << frameCount << endl;

    // steady_clock is monotonic, so the measurement cannot be disturbed by
    // wall-clock adjustments.
    const auto startTs = chrono::steady_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    uint processedFrames = 0;
    while (!frame.empty())
    {
        cv::cvtColor(frame, result, COLOR_BGR2GRAY);
        threshold(result, result, 127, 255, THRESH_BINARY);
        ++processedFrames;
        cap >> frame;
    }
    const auto stopTs = chrono::steady_clock::now();

    const double elapsed = chrono::duration_cast<chrono::microseconds>(stopTs - startTs).count() / 1e3;
    // Average over the frames actually processed, guarding against an empty video.
    const double average = processedFrames != 0 ? elapsed / processedFrames : 0.0;
    cout << "Total time OpenCV: " << elapsed << " ms" << endl;
    cout << "Frame Avg OpenCV: " << average << " ms" << endl << endl;
}
// Entry point: runs the three benchmarks one after another on the same video
// file, in the order naive CPU loop, OpenCV CPU functions, CUDA kernel.
int main(void)
{
    typedef void (*Benchmark)(const string);
    const Benchmark stages[] = { naiveCpu, opencv, cudaTest };

    for (Benchmark run : stages)
        run(file);

    return EXIT_SUCCESS;
}
编辑1:
将cudaMalloc和cudaFree移到循环之外、只执行一次后,性能有所提升,但改进很小。
编辑2:
根据Jez的建议,我修改了Cuda内核,以便在每个GPU线程内处理多个像素(以下执行中为8个):
下面是修改后的代码:
// Gray-converts and thresholds up to `pixelPerThread` pixels per thread.
//
// Launch layout: 1D grid of 1D blocks; gridDim.x * blockDim.x * pixelPerThread
// must be at least `inputSize` for the whole image to be processed. No shared
// memory is used.
//
//   bgrInput       - 3 interleaved 8-bit channels per pixel (3 * inputSize bytes are read)
//   output         - one byte per pixel (inputSize bytes are written)
//   inputSize      - number of pixels
//   soglia         - threshold; a pixel whose rounded channel average is strictly
//                    greater than this value becomes maxVal, any other becomes 0
//   maxVal         - value written for pixels above the threshold (narrowed to 8 bits)
//   pixelPerThread - maximum number of pixels handled by each thread
__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal, uint pixelPerThread)
{
    const size_t threadCount = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t threadId = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // In iteration j the thread handles pixel threadId + j * threadCount, so
    // neighbouring threads always touch neighbouring pixels.
    for (uint j = 0; j < pixelPerThread; ++j)
    {
        const size_t p = threadId + j * threadCount;

        // Every pixel index is bounds-checked, not only the first one of the
        // thread, so an image size that is not a multiple of pixelPerThread
        // never causes an out-of-range access. p only grows with j.
        if (p >= inputSize)
            break;

        const uchar* pixel = bgrInput + 3 * p;
        const uchar grayPix = static_cast<uchar>(0.5f + (pixel[0] + pixel[1] + pixel[2]) / 3.0f); // rounded average
        output[p] = static_cast<uchar>(grayPix > soglia ? maxVal : 0u); // thresholding
    }
}
// Gray-converts and thresholds a CV_8UC3 image on the GPU using device buffers
// owned by the caller, so that no allocation happens per frame.
//
// Uploads the pixels, launches doThreshold with 8 pixels per thread and
// downloads the single-channel result.
//
//   mat        - input image; anything other than CV_8UC3 is rejected with a message on cerr
//   result     - receives a rows x cols CV_8UC1 image; left untouched if any CUDA call fails
//   soglia     - threshold forwarded to the kernel
//   maxVal     - value forwarded to the kernel for pixels above the threshold
//   d_bgrInput - device buffer for the input pixels
//   d_output   - device buffer for the output pixels
//
// NOTE(review): assumes d_bgrInput holds at least 3 * rows * cols bytes and
// d_output at least rows * cols bytes; the sizes cannot be checked here -
// confirm against the caller's allocation.
void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal, uchar* d_bgrInput, uchar* d_output)
{
    if (mat.type() != CV_8UC3)
    {
        cerr << "Only CV_8UC3 matrix supported" << endl;
        return;
    }
    if (d_bgrInput == nullptr || d_output == nullptr)
    {
        cerr << "cudaCvtThreshold: device buffers must be allocated by the caller" << endl;
        return;
    }

    // The upload below copies one dense byte range, so a Mat with row padding
    // (e.g. a ROI view) is first cloned into a continuous buffer.
    const Mat src = mat.isContinuous() ? mat : mat.clone();

    const size_t pixelCount = static_cast<size_t>(src.rows) * src.cols;
    if (pixelCount == 0)
    {
        // Nothing to process; a launch with a zero-sized grid would be rejected.
        result = Mat(src.rows, src.cols, CV_8UC1);
        return;
    }
    const size_t inputBytes = pixelCount * src.channels();

    const uint pixelPerThread = 8;
    const uint blockSize = 128; // no significant result varying this variable
    // Integer ceil-division: exact for every size, unlike rounding through float.
    const size_t pixelsPerBlock = static_cast<size_t>(blockSize) * pixelPerThread;
    const uint gridSize = static_cast<uint>((pixelCount + pixelsPerBlock - 1) / pixelsPerBlock);

    // Reports a failed CUDA call on cerr and tells the caller whether to go on.
    auto succeeded = [](cudaError_t status, const char* what) -> bool
    {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
        return false;
    };

    if (!succeeded(cudaMemcpy(d_bgrInput, src.data, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
        return;

    doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, static_cast<uint>(pixelCount), soglia, maxVal, pixelPerThread);
    // A kernel launch returns nothing; configuration errors must be queried explicitly.
    if (!succeeded(cudaGetLastError(), "doThreshold launch"))
        return;

    Mat gray(src.rows, src.cols, CV_8UC1);
    // The blocking copy also reports errors raised while the kernel was running.
    if (succeeded(cudaMemcpy(gray.data, d_output, pixelCount, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)"))
        result = gray;
}
请注意,内核执行的平均时间现在是664.39 us,而不是792.26 us。
不错!:-)
但是OpenCV(使用Intel IPP)仍然更快
编辑3:
我在没有IPP和各种SSE指令的情况下重新编译了OpenCV。OpenCV的性能似乎是一样的
Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23541.7 ms
Frame Avg OpenCV: 4.95198 ms
这里有两个问题。
开销:您大约一半的GPU时间花在了为GPU分配内存以及与GPU之间来回复制数据上。CPU-GPU连接是一条相对较慢的链路,与数据始终留在GPU上且内存只分配一次的情况相比,仅此一项就会使性能减半。您可以做一些事情来改善,例如将分配移到循环之外,并将一帧的数据传输与下一帧的计算重叠;但除非计算部分非常复杂,否则“复制->执行->复制”的模式很少能得到很好的运行时间。
内核:您的内核应该是内存受限的。(理想情况下)每个线程移动4个字节,大约有200万个线程(像素),运行时间为853us,带宽约为10GB/s,而GTX 970的峰值为224GB/s,差距很大。问题在于您执行的是8位内存事务。这种情况下的解决方案是使用共享内存:如果在内核开始时以高效的方式将数据加载到共享内存中(例如,将指针强制转换为int4并确保对齐),就可以从共享内存中读取数据,然后以每个线程32位以上的宽度写回。这意味着您必须在一个线程中处理多个像素,但这不是问题。
另一种解决方案是找一个现成的库来完成这项工作,这类库涵盖了许多与图像相关的任务,可能比手写代码更快。
有了良好的内存访问模式,我预计这个内核的速度会提高10倍以上。但根据阿姆达尔定律,一旦做到这一点,运行时间就会由开销主导,因此除非您能消除这些开销,否则总运行时间只会快2倍。
您可以做的第一个优化是删除cudaCvtThreshold中多余的内存分配和释放:只需在cudaTest函数中执行一次设备内存分配,并在后续的cudaCvtThreshold调用中重复使用。此外,OpenCV在其基本运算中使用了SSE、SSE2、AVX等CPU向量指令,这是它速度快的原因之一。在内核中,您可以使用寄存器存储灰度转换的结果,然后再设置阈值,这样全局内存只需写入一次,减少了1次全局读取和1次全局写入的开销(虽然GPU可以缓存内存访问,但仍然值得一试)。
在这种情况下,编译器会自行避免对全局内存的中间写/读操作。尽管如此,这仍然是一个很好的建议,因为编译器并不总能做到这一点,特别是在没有提供别名信息的情况下。
简单的回答是,CPU和CUDA代码都写得很差。据我所知,您认为朴素的CPU实现会很慢。对于CUDA,您不能仅仅把代码搬进内核,就指望它会神奇地运行得更快。并行编程需要完全不同的思维方式、数据布局和模式。更不用说您在循环中进行malloc和内核启动了。
@sgarizvi:我决定在函数cudaCvtThreshold中执行malloc和free,以便有一个可以直接用于Mat对象的函数。不过,我会发布把malloc和free移到外部后的同一测试结果。我还会让Cuda内核只进行一次内存访问,但我认为这不会导致显著的性能损失。
// Gray-converts and thresholds up to `pixelPerThread` pixels per thread.
//
// Launch layout: 1D grid of 1D blocks; gridDim.x * blockDim.x * pixelPerThread
// must be at least `inputSize` for the whole image to be processed. No shared
// memory is used.
//
//   bgrInput       - 3 interleaved 8-bit channels per pixel (3 * inputSize bytes are read)
//   output         - one byte per pixel (inputSize bytes are written)
//   inputSize      - number of pixels
//   soglia         - threshold; a pixel whose rounded channel average is strictly
//                    greater than this value becomes maxVal, any other becomes 0
//   maxVal         - value written for pixels above the threshold (narrowed to 8 bits)
//   pixelPerThread - maximum number of pixels handled by each thread
__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal, uint pixelPerThread)
{
    const size_t threadCount = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t threadId = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // In iteration j the thread handles pixel threadId + j * threadCount, so
    // neighbouring threads always touch neighbouring pixels.
    for (uint j = 0; j < pixelPerThread; ++j)
    {
        const size_t p = threadId + j * threadCount;

        // Every pixel index is bounds-checked, not only the first one of the
        // thread, so an image size that is not a multiple of pixelPerThread
        // never causes an out-of-range access. p only grows with j.
        if (p >= inputSize)
            break;

        const uchar* pixel = bgrInput + 3 * p;
        const uchar grayPix = static_cast<uchar>(0.5f + (pixel[0] + pixel[1] + pixel[2]) / 3.0f); // rounded average
        output[p] = static_cast<uchar>(grayPix > soglia ? maxVal : 0u); // thresholding
    }
}
// Gray-converts and thresholds a CV_8UC3 image on the GPU using device buffers
// owned by the caller, so that no allocation happens per frame.
//
// Uploads the pixels, launches doThreshold with 8 pixels per thread and
// downloads the single-channel result.
//
//   mat        - input image; anything other than CV_8UC3 is rejected with a message on cerr
//   result     - receives a rows x cols CV_8UC1 image; left untouched if any CUDA call fails
//   soglia     - threshold forwarded to the kernel
//   maxVal     - value forwarded to the kernel for pixels above the threshold
//   d_bgrInput - device buffer for the input pixels
//   d_output   - device buffer for the output pixels
//
// NOTE(review): assumes d_bgrInput holds at least 3 * rows * cols bytes and
// d_output at least rows * cols bytes; the sizes cannot be checked here -
// confirm against the caller's allocation.
void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal, uchar* d_bgrInput, uchar* d_output)
{
    if (mat.type() != CV_8UC3)
    {
        cerr << "Only CV_8UC3 matrix supported" << endl;
        return;
    }
    if (d_bgrInput == nullptr || d_output == nullptr)
    {
        cerr << "cudaCvtThreshold: device buffers must be allocated by the caller" << endl;
        return;
    }

    // The upload below copies one dense byte range, so a Mat with row padding
    // (e.g. a ROI view) is first cloned into a continuous buffer.
    const Mat src = mat.isContinuous() ? mat : mat.clone();

    const size_t pixelCount = static_cast<size_t>(src.rows) * src.cols;
    if (pixelCount == 0)
    {
        // Nothing to process; a launch with a zero-sized grid would be rejected.
        result = Mat(src.rows, src.cols, CV_8UC1);
        return;
    }
    const size_t inputBytes = pixelCount * src.channels();

    const uint pixelPerThread = 8;
    const uint blockSize = 128; // no significant result varying this variable
    // Integer ceil-division: exact for every size, unlike rounding through float.
    const size_t pixelsPerBlock = static_cast<size_t>(blockSize) * pixelPerThread;
    const uint gridSize = static_cast<uint>((pixelCount + pixelsPerBlock - 1) / pixelsPerBlock);

    // Reports a failed CUDA call on cerr and tells the caller whether to go on.
    auto succeeded = [](cudaError_t status, const char* what) -> bool
    {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
        return false;
    };

    if (!succeeded(cudaMemcpy(d_bgrInput, src.data, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
        return;

    doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, static_cast<uint>(pixelCount), soglia, maxVal, pixelPerThread);
    // A kernel launch returns nothing; configuration errors must be queried explicitly.
    if (!succeeded(cudaGetLastError(), "doThreshold launch"))
        return;

    Mat gray(src.rows, src.cols, CV_8UC1);
    // The blocking copy also reports errors raised while the kernel was running.
    if (succeeded(cudaMemcpy(gray.data, d_output, pixelCount, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)"))
        result = gray;
}
Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23628.8 ms
Frame Avg OpenCV: 4.97031 ms
Frame Count: 4754
Frame Resolution: 1920x1080
==13441== NVPROF is profiling process 13441, command: ./OpenCV_test
Total time CUDA (out malloc-free): 25655.5 ms
Frame Avg CUDA (out malloc-free): 5.39662 ms
==13441== Profiling application: ./OpenCV_test
==13441== Profiling result:
Time(%) Time Calls Avg Min Max Name
49.30% 3.15853s 4754 664.39us 658.24us 779.04us doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int)
38.69% 2.47838s 4754 521.32us 513.35us 870.69us [CUDA memcpy HtoD]
12.01% 769.53ms 4754 161.87us 161.31us 200.58us [CUDA memcpy DtoH]
==13441== API calls:
Time(%) Time Calls Avg Min Max Name
95.78% 7.26387s 9508 763.97us 491.11us 1.6589ms cudaMemcpy
2.51% 190.70ms 2 95.350ms 82.529us 190.62ms cudaMalloc
1.53% 116.31ms 4754 24.465us 16.844us 286.56us cudaLaunch
0.09% 6.7052ms 28524 235ns 98ns 233.19us cudaSetupArgument
0.08% 5.9538ms 4754 1.2520us 642ns 12.039us cudaConfigureCall
0.00% 263.87us 83 3.1790us 225ns 111.03us cuDeviceGetAttribute
0.00% 174.45us 2 87.227us 52.521us 121.93us cudaFree
0.00% 34.612us 1 34.612us 34.612us 34.612us cuDeviceTotalMem
0.00% 29.376us 1 29.376us 29.376us 29.376us cuDeviceGetName
0.00% 1.6950us 2 847ns 343ns 1.3520us cuDeviceGetCount
0.00% 745ns 2 372ns 217ns 528ns cuDeviceGet
Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23541.7 ms
Frame Avg OpenCV: 4.95198 ms