C++ 为什么我的程序在1个线程上运行比在8个线程上运行快。C++；_C++_Multithreading_C++11_Concurrency_Parallel Processing

C++：为什么我的程序在 1 个线程上运行比在 8 个线程上运行更快？

c++ multithreading c++11 concurrency parallel-processing

C++ 为什么我的程序在1个线程上运行比在8个线程上运行快。C++；,c++,multithreading,c++11,concurrency,parallel-processing,C++,Multithreading,C++11,Concurrency,Parallel Processing,请看下面的代码： #include <iostream> #include <thread> #include <numeric> #include <algorithm> #include <vector> #include <chrono> template<typename Iterator, typename T> struct accumulate_block { void operator(

请看下面的代码：

#include <iostream>
#include <thread>
#include <numeric>
#include <algorithm>
#include <vector>
#include <chrono>

/// Callable that sums a single sub-range.
///
/// accumulate_all hands one instance of this functor to each std::thread it starts.
///
/// @tparam Iterator  iterator type delimiting the sub-range
/// @tparam T         type of the running total
template<typename Iterator, typename T>
struct accumulate_block
{
    /// Adds every element of [first, last) to the value currently held in
    /// `result` (via std::accumulate) and writes the total back to `result`.
    void operator()(Iterator first, Iterator last, T& result)
    {
        const T seed = result;
        result = std::accumulate(first, last, seed);
    }
};

/// Sums the range [begin, end) in parallel.
///
/// The range is cut into contiguous chunks, one per worker thread; each worker
/// runs accumulate_block on its chunk and stores the partial sum in its own
/// slot of `results`. After all workers have been joined the partial sums are
/// folded together on the calling thread.
///
/// @tparam Iterator  iterator type of the input range
/// @tparam T         type used for the partial sums and for `init`
/// @param begin  start of the range
/// @param end    one past the last element of the range
/// @param init   starting value, added to the total as std::accumulate does;
///               it is read but never modified
/// @return       `init` plus the sum of all elements, converted to int
template<typename Iterator, typename T>
int accumulate_all(Iterator begin, Iterator end, T& init)
{
    const long long length = std::distance(begin, end);
    if (length <= 0)
        return static_cast<int>(init);   // nothing to sum: do not start any thread

    // hardware_concurrency() is only a hint and may return 0, so use at least
    // one worker; never start more workers than there are elements.
    const long long hinted = std::thread::hardware_concurrency();
    const long long numOfThreads = std::min(std::max(hinted, 1LL), length);
    const long long step = length / numOfThreads;

    using index_t = typename std::vector<T>::size_type;
    const index_t workerCount = static_cast<index_t>(numOfThreads);

    // One slot per worker; partial sums are kept as T rather than int.
    std::vector<T> results(workerCount, T());
    std::vector<std::thread> threads;
    threads.reserve(workerCount);

    try
    {
        for (index_t i = 0; i < workerCount; ++i)
        {
            // The last worker takes everything that is left, including the
            // remainder of the integer division above.
            Iterator block_end = end;
            if (i + 1 < workerCount)
            {
                block_end = begin;
                std::advance(block_end, step);
            }
            threads.emplace_back(accumulate_block<Iterator, T>(), begin, block_end, std::ref(results[i]));
            begin = block_end;
        }
    }
    catch (...)
    {
        // Destroying a joinable std::thread calls std::terminate, so join the
        // workers that were already started before propagating the error.
        for (std::thread& worker : threads)
            worker.join();
        throw;
    }

    for (std::thread& worker : threads)
        worker.join();

    return static_cast<int>(std::accumulate(results.begin(), results.end(), init));
}

/// Benchmark driver: sums a vector of 20,000,000 ones with accumulate_all and
/// prints the sum followed by the elapsed time in nanoseconds.
int main()
{
    int x = 0;
    std::vector<int> V(20000000, 1);

    // Only the summation is timed; the stream output happens after the second
    // clock reading so that I/O cost is not part of the measurement.
    const auto t1 = std::chrono::steady_clock::now();
    // Single-threaded alternative. Its result must be stored and printed as
    // well, otherwise the optimizer may remove the call entirely:
    // const auto sum = std::accumulate(std::begin(V), std::end(V), x);
    const auto sum = accumulate_all(std::begin(V), std::end(V), x);
    const auto t2 = std::chrono::steady_clock::now();

    std::cout << sum << '\n';
    std::cout << "process took: "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count()
              << " nanoseconds\n";
    return 0;
}

#包括
#包括
#包括
#包括
#包括
#包括
模板
结构块
{
void运算符（）（迭代器开始、迭代器结束、T和结果）
{
结果=标准：：累积（开始、结束、结果）；
}    
};
模板
int累加_all（迭代器开始、迭代器结束、T&init）
{
自动numOfThreads=std:：thread:：hardware_concurrency（）；
std：：向量线程（numOfThreads）；
自动步进=标准：：距离（开始、结束）/numof线程；
std：：向量结果（numOfThreads，0）；
for(int i=0; i<numOfThreads-1; ++i)
在单线程版本中，编译器会删除对 std::accumulate 的调用，因为它没有副作用，而且其结果没有被使用。
修正：
auto sum = std::accumulate(std::begin(V), std::end(V), x); // 单线程选项
//最后。
std::cout << sum << '\n';
编译器会删除对 std::accumulate 的调用，因为它没有副作用，并且结果也没有被使用。
修正：
auto sum = std::accumulate(std::begin(V), std::end(V), x); // 单线程选项
//最后。
std::cout << sum << '\n';
主线程不也算作一个线程吗？
你确定计数不是被直接优化掉了吗？
@EduardRostomyan 可以通过查看生成的汇编代码来确认。不过这里做一点简单的计算就足够了：4GHz 的 CPU 每 0.25ns 运行一个周期（而大多数指令需要不止一个周期），它不可能在 124ns 内遍历 20000000 个元素。显然，单线程版本中的计数被优化掉了。请尝试添加 std::cout 输出结果。
@EduardRostomyan 是的……不客气。
主线程不也算作一个线程吗？
你确定计数不是被直接优化掉了吗？
@EduardRostomyan 可以通过查看生成的汇编代码来确认。不过做一点简单的计算就足够了：4GHz 的 CPU 每 0.25ns 运行一个周期（而大多数指令需要不止一个周期），它不可能在 124ns 内遍历 20000000 个元素。显然，单线程版本中的计数被优化掉了。请尝试添加 std::cout 输出结果。
@EduardRostomyan 是的，不客气。
// Keep the result in a variable and print it later, so the call is not
// discarded as unused work.
auto sum = std::accumulate(std::begin(V), std::end(V), x); // single-threaded option

// At the very end.
std::cout << sum << '\n';