C++ opencv在多线程处理中的速度要慢得多_C++_Multithreading_Opencv

C++ opencv在多线程处理中的速度要慢得多

c++ multithreading opencv

C++ opencv在多线程处理中的速度要慢得多,c++,multithreading,opencv,C++,Multithreading,Opencv,我正在编写一个使用多线程的控制台应用程序。每个线程使用opencv函数处理一组图像如果使用opencv函数的函数在单个线程中执行，我将获得一个参考计算时间。如果我从多个线程执行这个函数，那么每个线程中的函数都会慢得多，几乎是原来的两倍？opencv是否并行化、序列化或阻止自身执行我使用opencv库测试了这个应用程序，该库使用了TBB编译，没有TBB编译，结果几乎相同。我不知道它是否会有任何影响，但我也看到一些函数，如cv:：threshold或cv:：findcontours，在执行be

我正在编写一个使用多线程的控制台应用程序。每个线程使用opencv函数处理一组图像

如果使用opencv函数的函数在单个线程中执行，我将获得一个参考计算时间。如果我从多个线程执行这个函数，那么每个线程中的函数都会慢得多，几乎是原来的两倍

OpenCV 是否会在内部并行化、串行化，或者阻塞自身的执行？

我分别用启用 TBB 编译和未启用 TBB 编译的 OpenCV 库测试了这个应用程序，结果几乎相同。我不知道这是否有影响，但我还注意到某些函数（如 cv::threshold 或 cv::findContours）在执行时会创建 12 个额外的子进程。如果把 OpenCV 调用注释掉，那么所有线程的耗时都相同，并且与单线程执行时的耗时一致，因此在这种情况下多线程工作得很好。问题是：是否存在某个 OpenCV 编译选项或函数调用，可以让多线程执行与单线程执行获得相同的耗时？

编辑这是在4核CPU中增加线程核数的结果，使用1、2、3和4核执行相同的功能。每个核心在for循环中处理768个分辨率为1600x1200的图像。在循环内部，调用导致延迟增加的函数。我应该期望，独立于内核数量，单线程35000ms或10%以上的时间大致相同，但是，可以看出，当线程数量增加时，时间会增加，我不知道为什么

耗时：（抱歉，系统不允许我在帖子中上传图片）

time in File No. 3 --> 35463
 Mean time using 1 cores is: 47ms

time in File No. 3 --> 42747
 time in File No. 3 --> 42709
 Mean time using 2 cores is: 28ms

time in File No. 3 --> 54587
 time in File No. 3 --> 54595
 time in File No. 3 --> 54437
 Mean time using 3 cores is: 24ms

time in File No. 3 --> 68751
 time in File No. 3 --> 68865
 time in File No. 3 --> 68878
 time in File No. 3 --> 68622
 Mean time using 4 cores is: 22ms

如果函数中未使用opencv代码，则所有情况下1、2、3或4线程的时间与预期时间相似，但当使用opencv函数时，例如仅通过简单调用：

img.convertTo(img, CV_32F);

其中 img 是一个 cv::Mat，当线程数增加时，耗时就会增加。我还做了一个测试：在 CPU 的 BIOS 中禁用超线程（Hyper-Threading）选项。在这种情况下，所有耗时都减少了，1 个线程的耗时为 25000 ms，但耗时随线程数增加的问题仍然存在：2 个线程为 33 秒，3 个线程为 43 秒，4 个线程为 57 秒……我不知道这能否说明什么。

编辑2 mcve：

#include "stdafx.h"
#include <future>
#include <chrono>
#include "Filter.h"
#include <iostream>
#include <future>


#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

// Millisecond timestamp measured from the first call to Ticks().
long long Ticks();
// Runs the OpenCV-based filter chain once on img; always returns 0.
int WithOpencv(cv::Mat img);
// Runs the hand-written (non-OpenCV) filter chain once on img; always returns 0.
int With_OUT_Opencv(cv::Mat img);
// Loads `file`, runs the selected filter 15 times, prints the elapsed time and returns 15.
int TestThreads (char *buffer,std::string file);
// Integer mean of the 3x3 neighbourhood centred on row f, column c of a flat,
// row-major buffer whose row stride is hard-coded to 1600 elements.
// The macro arguments are not parenthesised, so pass plain identifiers only.
#define Blur3x3(matrix,f,c) ((matrix[(f-1)*1600+(c-1)] + matrix[(f-1)*1600+c] + matrix[(f-1)*1600+(c+1)] + matrix[f*1600+(c-1)] + matrix[f*1600+c] + matrix[f*1600+(c+1)] + matrix[(f+1)*1600+(c-1)] + matrix[(f+1)*1600+c] + matrix[(f+1)*1600+(c+1)])/9)


/**
 * Benchmark driver.
 *
 * For every worker count from 1 to 8 it launches that many asynchronous
 * TestThreads() tasks, waits for all of them, and prints the wall-clock time
 * divided by the total number of frames the tasks report having processed.
 *
 * @return 0 always.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    std::string file="Test.bmp";

    // `file` is captured by reference; every future is joined via get() before
    // this scope ends, so the reference never dangles.
    auto function = [&](char *buffer){return TestThreads(buffer,file);};
    char *buffers[12] = {};
    std::future<int> frames[12];
    int nframes = 0;

    cv::setNumThreads(8);

    // One scratch buffer per potential worker; it is only handed to TestThreads().
    for (int i=0;i<8;i++) buffers[i] = new char[1000*1024*1024];

    for (int ncores=1;ncores<9;ncores++)
    {
        long long t = Ticks();
        for (int i=0;i<ncores;i++) frames[i] = std::async(std::launch::async,function,buffers[i]);
        for (int i=0;i<ncores;i++) nframes += frames[i].get();
        t = Ticks() - t;

        std::cout << "Mean time using " << ncores << " cores is: " << t/nframes << "ms" << std::endl << std::endl;
        nframes = 0;
        Sleep(2000);    // pause between runs
    }

    // The buffers were allocated with new[], so they must be released with delete[].
    for (int i=0;i<8;i++) delete[] buffers[i];

    return 0;
}



/**
 * Worker body of the benchmark: loads an image and runs the filter 15 times.
 *
 * @param buffer  Scratch buffer supplied by the caller; not used by this function.
 * @param file    Path of the image to load with cv::imread.
 * @return 15, the number of filter iterations performed.
 */
int TestThreads (char *buffer,std::string file)
{
    (void)buffer;   // kept only for interface compatibility

    // imread allocates the matrix itself, so no pre-sized Mat is needed.
    cv::Mat img = cv::imread(file);

    long long ta = Ticks();
    for (int i=0;i<15;i++) {

        //Uncomment this and comment next line to test without opencv calls. With_OUT_Opencv implements simple filters with direct operations over mat data
        //With_OUT_Opencv(img);

        WithOpencv(img);
    }
    ta = Ticks() - ta;

    std::cout << "Time in file No. 3--> " << ta << std::endl;

    return 15;
}



/**
 * OpenCV filter chain used as the benchmark workload.
 *
 * Computes sqrt(blur(x^2) - blur(x)^2) over a 3x3 window, normalises the
 * result to [0, 1], rescales it to 8 bits, thresholds it and dilates the
 * binary mask with a 5x5 elliptical kernel. All results are discarded.
 *
 * @param img  Input image; the Mat header is taken by value.
 * @return 0 always.
 */
int WithOpencv(cv::Mat img){

    const int morphRadius = 2;
    const double binThreshold = 20;
    const int kernelSide = 2*morphRadius + 1;
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(kernelSide, kernelSide));

    // Work in floating point so that squaring the pixel values does not saturate.
    img.convertTo(img, CV_32F);

    cv::Mat mean;
    cv::Mat meanOfSquares;
    cv::blur(img, mean, cv::Size(3, 3));
    cv::blur(img.mul(img), meanOfSquares, cv::Size(3, 3));

    cv::Mat deviation;
    cv::sqrt(meanOfSquares - mean.mul(mean), deviation);
    cv::normalize(deviation, deviation, 0.0, 1.0, cv::NORM_MINMAX);
    deviation.convertTo(deviation, CV_8UC1, 255.0);

    cv::Mat mask;
    cv::threshold(deviation, mask, binThreshold, 255, cv::THRESH_BINARY);

    if (morphRadius != 0) {
        cv::dilate(mask, mask, kernel);
    }

    return 0;
}





/**
 * Hand-written stand-in for the OpenCV filter chain, used to compare thread
 * scaling without any OpenCV processing calls.
 *
 * Treats img.data as a flat 1600x1200 byte buffer, applies a crude 3x3 blur /
 * variance / threshold sequence and discards the result.
 * NOTE(review): this assumes the Mat data is continuous - confirm for inputs
 * that do not come straight from cv::imread.
 *
 * @param img  Input image; must hold at least 1600*1200 bytes of pixel data.
 * @return 0 always (also when the image is empty or too small to process).
 */
int With_OUT_Opencv(cv::Mat img){

    // The loops below index img.data with hard-coded 1600x1200 offsets, so
    // bail out instead of reading out of bounds (or through a null pointer).
    if (img.empty() || img.total()*img.elemSize() < static_cast<size_t>(1600)*1200) return 0;

    // Value-initialised: the border cells are never written by the filters
    // but are read further down, so they must not hold indeterminate values.
    unsigned char *baux1 = new unsigned char[1600*1200]();
    unsigned short *baux2 = new unsigned short[1600*1200]();
    unsigned char max=0;
    int f,c,i;
    unsigned char threshold = 177;

    for (f=1;f<1199;f++)                                // Bad Blur filters
    {
        for (c=1; c<1599; c++)
        {
            baux1[f*1600+c] = Blur3x3(img.data,f,c);
            // The square is stored back into 8 bits and therefore wraps;
            // left unchanged from the original crude filter.
            baux1[f*1600+c] = baux1[f*1600+c] * baux1[f*1600+c];
            baux2[f*1600+c] = img.data[f*1600+c] * img.data[f*1600+c];
        }
    }
    for (f=1;f<1199;f++)
    {
        for (c=1; c<1599; c++)
        {
            baux1[f*1600+c] = sqrt(Blur3x3(baux2,f,c) - baux1[f*1600+c]);
            if (baux1[f*1600+c] > max) max = baux1[f*1600+c];
        }
    }
    threshold = threshold * ((float)max/255.0);         // Bad Norm/Bin
    for (i=0;i<1600*1200;i++)
    {
        if (baux1[i]>threshold) baux1[i] = 1;
        else baux1[i] = 0;
    }

    delete []baux1;
    delete []baux2;

    return 0;
}




/**
 * Millisecond stopwatch shared by the whole program.
 *
 * The first call fixes the time origin and returns 0; every later call
 * returns the whole milliseconds elapsed since that origin. Callers take the
 * difference of two readings to time a section.
 *
 * The origin is a function-local static, whose initialisation is thread-safe
 * from C++11 onwards, so the function may be called from several threads.
 *
 * @return Milliseconds elapsed since the first call.
 */
long long Ticks()
{
   // Sample the clock before touching the static so the first call returns exactly 0.
   const std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
   static const std::chrono::steady_clock::time_point origin = now;

   return static_cast<long long>(
       std::chrono::duration_cast<std::chrono::milliseconds>(now - origin).count());
}

我不明白你的问题是什么

您最初的问题表明，串行运行x次迭代比并行运行要快得多。注意：当使用相同的目标函数时。您想知道为什么在多线程场景中运行相同的目标函数要慢得多

但是，我现在看到您的示例正在将OpenCV的性能与其他一些自定义代码进行比较。这就是你的问题吗

与我最初认为的问题相关，答案是：不，串行运行目标函数并不比并行运行快多少。请参阅下面的结果和代码

结果（运行环境：Apple MBA 2012 i5，OpenCV 3）：

测试代码

你在衡量三件事：

1）所有线程完成整个任务所需的时间除以整个任务的大小。2）每个线程完成其那部分任务所需的时间。3）完成整个任务所需的时间。您观察到，当线程数增加时，第一个度量从 47 ms 下降到 22 ms。那太好了！同时，您也注意到，单个线程所需的时间从 35463 增加到约 68751（不管是什么单位）。最后，您注意到总体执行时间增加了。

关于第二个度量：当增加线程数时，单个线程需要更长的时间来执行相应的操作。两种可能的解释：

1）您的线程正在争夺内存总线带宽。2）您的线程所触发的计算本身就是多线程的，因此它们实际上在相互竞争 CPU 时间。

现在是关于为什么总工作时间增加的问题。原因很简单：您不仅增加了线程的数量，而且还以相同的速度增加了工作负载。如果您的线程之间根本没有竞争，并且不会涉及开销，那么N个线程将需要相同的时间来完成N倍的工作。它没有，因此您注意到速度减慢。

1）您的函数做什么？2）不要忘记磁盘 IO 也可能成为瓶颈。3）尝试将 setNumThreads 的值设置为 1 或 0：setNumThreads(int nthreads)。——1）它基本上执行两次模糊、一些乘法操作、一次阈值化、膨胀和查找轮廓。2）函数中不进行 IO 操作，所有读取操作都在之前完成。最后，当使用 cv::setNumThreads 将线程数设置为 0 或 1 时，子进程将创建

threshold或findcontours调用中的d已消失，但当我增加应用程序中的线程时，时间会增加，而opencv中的线程则不一样。。。谢谢你的回复！！每次迭代增加多少时间=函数调用以及每秒完成多少次迭代？创建线程本机c++11线程或任何其他操作系统本机线程需要花费时间，调度程序需要切换到它。成本应该在几微秒到几毫秒之间，这取决于同时执行的线程数量。根据经验，如果没有阻塞文件IO/网络，则创建的线程数与CPU上的内核总数相同。为了进一步帮助您，您需要将一些代码作为stackoverflow.com/help/mcve的一部分共享。例如，如果您的CPU有4个内核，并且您运行40个线程，那么每个内核将有10个预定的上下文切换。如果所有这40个线程都只是在执行暴力CPU处理，那么运行40个线程将比运行4个线程慢；您也应该包括这一点。其他自定义代码的目的是表明，当您调用其他函数而不是OpenCV时，行为与预期一样，无论使用的线程数多少，时间或多或少是相等的，因此您必须回答这个问题。在您的代码中，每个线程进行的迭代取决于线程的数量，因为它们分布在int-nIterationsPerThread=nIterations/nThreads，但是每个线程应该执行相同数量的迭代。请尝试每个线程的固定迭代次数，这是我要求的行为。线程越多，固定迭代次数的时间就越长，当在单线程执行中添加更多线程时，或多或少地增加10-20%的增量，这与线程数量无关。或者我猜错了？？？如果你使用带有_OUT _Opencv的函数作为targetfunction，你应该看到发生这种情况的时间或多或少是相同的，增加了一些线程，但是当你使用Opencv指令时，代码没有被滥用，当添加更多线程时，总时间呈线性增长。我不确定我是否理解您的评论，但您似乎缺少一些关于线程的基本知识。上面的例子表明，如果所需的工作是迭代64次，那么我可以决定以串行方式迭代64次，需要7秒，或者以并行方式迭代64次-一次迭代两次需要4秒，或者并行方式迭代四次。。。等如果你的问题是，为什么并行迭代128次比串行迭代64次慢，那么答案很清楚：128次比64次多；两个核心！=2倍于单个内核的速度。关于你的问题，为什么总运行时间是x秒，而不管我迭代非opencv自定义函数的频率是多少：这不是opencv问题，我建议为此创建一个单独的问题。我不会对此作进一步的回答，但浏览一下您的代码，编译器可能会用NOP替换您的所有实现，因为它发现运行您的实现对系统没有影响。Thaks Marcur感谢您澄清这个问题。关于这两种可能的解释，第二种可能是opencv中的一个原因，因为像cv:：threshold这样的一些函数是用parallel_for实现的，它们自己创建线程。如果设置cv:：setnumThreads0，则它们不会创建自己的线程，可以跟踪查看任务的子进程，但在这种情况下，第二个线程的时间也会增加。所以第一个原因可能是正确的。因为这只发生在使用opencv调用执行函数时，我认为这是一个opencv问题。如果我使用另一个函数，我可以看到第二次增加的时间在理论上应该增加得更少，但是使用opencv，每个线程的时间增量是线性的，因此并行化不起作用。我觉得opencv应该能够处理这个问题，这就是为什么我在问一个可能的解决方案，关于使用自己编程的虚拟过滤器获得的结果。关于你的最后一行，我希望N个线程做N倍的功，一个线程需要相同的时间加上一个小的线性量，1个线程做1倍的功。就像发生在另一个世界一样function@JoséLuisGiral：关于你的期望，在大多数现实情况下，你都会感到失望，至少在使用如此简单的方法时是如此。只有当根本没有顺序代码，并且线程之间根本不进行交互或竞争时，它们才是真的。关于您的观察，您可能不得不接受这样一个事实，即OpenCV比您的代码要高效得多，因此内存总线带宽成为一个主要问题。

eight threads took 4104.38 ms
single thread took 7272.68 ms
four threads took 3687 ms
two threads took 4500.15 ms

#include <iostream>
#include <vector>
#include <chrono>
#include <thread>
#include <opencv2/opencv.hpp>

using namespace std;
using namespace std::chrono;
using namespace cv;

// Scope timer: records the construction time and, on destruction, prints
// "<title> took <elapsed> ms" to standard output.
class benchmark {
    std::chrono::time_point<std::chrono::steady_clock> begin_ = std::chrono::steady_clock::now();
    std::string label_;
public:
    // title: text printed in front of the measured duration.
    benchmark(const std::string& title) : label_(title) {}

    ~benchmark() {
        // Elapsed time as fractional milliseconds.
        const std::chrono::duration<double, std::milli> elapsed =
            std::chrono::steady_clock::now() - begin_;
        std::cout << label_ << " took " << elapsed.count() << " ms" << std::endl;
    }
};

// Invokes the callable f exactly n times, one call after another.
template <typename F>
void repeat(unsigned n, F f) {
    for (unsigned remaining = n; remaining != 0; --remaining) {
        f();
    }
}



// Benchmark workload: the same OpenCV filter chain as in the question.
// Computes sqrt(blur(x^2) - blur(x)^2) over a 3x3 window, normalises it to
// [0, 1], converts it to 8 bits, thresholds it and dilates the binary mask
// with a 5x5 elliptical kernel. The result is discarded; always returns 0.
int targetFunction(Mat img){
    const int dilateRadius = 2;
    const double cutoff = 20;
    const cv::Size window(3, 3);
    const cv::Size kernelSize(2*dilateRadius + 1, 2*dilateRadius + 1);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, kernelSize);

    // Float precision keeps the squared pixel values from saturating.
    img.convertTo(img, CV_32F);

    cv::Mat blurred;
    cv::blur(img, blurred, window);

    cv::Mat blurredSquares;
    cv::blur(img.mul(img), blurredSquares, window);

    cv::Mat spread;
    cv::sqrt(blurredSquares - blurred.mul(blurred), spread);
    cv::normalize(spread, spread, 0.0, 1.0, cv::NORM_MINMAX);
    spread.convertTo(spread, CV_8UC1, 255.0);

    cv::Mat binary;
    cv::threshold(spread, binary, cutoff, 255, cv::THRESH_BINARY);

    if (dilateRadius != 0) {
        cv::dilate(binary, binary, kernel);
    }

    // To inspect the mask while debugging: imshow("WithOpencv", binary);

    return 0;
}

void runTargetFunction(int nIterations, int nThreads, const Mat& img) {
    int nIterationsPerThread = nIterations / nThreads;
    vector<thread> threads;
    auto targetFunctionFn = [&img]() {
        targetFunction(img);
    };
    
    setNumThreads(nThreads);
    
    repeat(nThreads, [&] {
        threads.push_back(thread([=]() {
            repeat(nIterationsPerThread, targetFunctionFn);
        }));
    });
    
    for(auto& thread : threads)
        thread.join();
}

// Benchmark entry point: loads the test image once, then times 64 iterations
// of targetFunction split over 8, 1, 4 and 2 threads (in that order). Each
// run is timed by a scoped `benchmark` object that prints when it is destroyed.
//
// Returns 0 on success, 1 if the image could not be loaded.
int main(int argc, const char * argv[]) {
    const std::string file = "../../opencv-test/Test.bmp";
    const auto img = imread(file);

    // imread reports failure by returning an empty Mat; stop here instead of
    // letting the OpenCV calls fail inside the worker threads.
    if (img.empty()) {
        std::cerr << "Could not read image: " << file << std::endl;
        return 1;
    }

    const int nIterations = 64;

    struct Run {
        const char* title;
        int threads;
    };
    const Run runs[] = {
        {"eight threads", 8},
        {"single thread", 1},
        {"four threads", 4},
        {"two threads", 2},
    };

    for (const Run& run : runs) {
        benchmark b(run.title);
        runTargetFunction(nIterations, run.threads, img);
    }

    return 0;
}