C++ 我可以报告openmp任务的进度吗?
想象一个典型的OMP任务:C++ 我可以报告openmp任务的进度吗?,c++,multithreading,parallel-processing,openmp,progress,C++,Multithreading,Parallel Processing,Openmp,Progress,想象一个典型的OMP任务: 对[0.0,1.0]范围内的大向量进行求和 using namespace std; int main() { vector<double> v; // generate some data generate_n(back_inserter(v), 1ul << 18, bind(uniform_real_distribution<double>(0,1.0), default_ra
- 对[0.0,1.0]范围内的大向量进行求和
using namespace std;
/// Sums a large vector of random doubles with an OpenMP parallel reduction
/// and prints the result.
int main() {
    // Fill the vector with 2^18 samples drawn from
    // uniform_real_distribution<double>(0, 1.0); the engine is seeded from
    // random_device.
    std::vector<double> values;
    std::uniform_real_distribution<double> dist(0, 1.0);
    std::default_random_engine engine { std::random_device {}() };
    std::generate_n(std::back_inserter(values), 1ul << 18,
                    [&dist, &engine] { return dist(engine); });

    // Each thread accumulates into a private copy of `total`; OpenMP combines
    // the copies when the loop ends.
    long double total = 0;
    #pragma omp parallel for reduction(+:total)
    for (std::size_t idx = 0; idx < values.size(); ++idx)
        total += values[idx];

    std::cout << "Done: sum = " << total << "\n";
}
#include <omp.h>
#include <vector>
#include <random>
#include <algorithm>
#include <iterator>
#include <functional>
#include <iostream>
#include <iomanip>
using namespace std;
/// Sums a large vector of random doubles with an OpenMP reduction while
/// periodically printing how many fixed-size steps have been completed.
///
/// Every thread counts its own iterations in `local_count`; each time it has
/// finished `step_size` of them it bumps the shared `steps_completed` counter.
/// Every 100th bump (values 1, 101, 201, ...) prints a progress line.
int main() {
    std::vector<double> v;
    // generate some data
    std::generate_n(std::back_inserter(v), 1ul << 18,
                    std::bind(std::uniform_real_distribution<double>(0, 1.0),
                              std::default_random_engine { std::random_device {}() }));

    auto step_size = 100ul;
    // Upper bound on the number of steps: threads only report whole steps of
    // their own share, so the counter may finish slightly below this value.
    auto total_steps = v.size() / step_size + 1;
    std::size_t steps_completed = 0;
    long double sum = 0;

    #pragma omp parallel
    {
        std::size_t local_count = 0;

        #pragma omp for reduction(+:sum)
        for (std::size_t i = 0; i < v.size(); i++)
        {
            sum += v[i];

            if (local_count++ % step_size == step_size - 1)
            {
                // Increment and read the shared counter in ONE atomic
                // operation. Reading it again after a plain atomic increment
                // would race with other threads and could print a milestone
                // twice or skip it altogether.
                std::size_t completed_now;
                #pragma omp atomic capture
                completed_now = ++steps_completed;

                if (completed_now % 100 == 1)
                {
                    // Serialise terminal output so lines are not interleaved.
                    #pragma omp critical
                    std::cout << "Progress: " << completed_now << " of " << total_steps
                              << " (" << std::fixed << std::setprecision(1)
                              << (100.0 * completed_now / total_steps) << "%)\n";
                }
            }
        }
    }

    std::cout << "Done: sum = " << sum << "\n";
}
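using namespace std;
int main() {
vector<double> v;
// generate some data
generate_n(back_inserter(v), 1ul << 18, …(代码在此处被截断)
只需让团队中的每个线程跟踪各自的本地进度,并以原子方式更新全局计数器。您仍然可以让另一个线程去观察这个计数器,或者,如下面的示例所示,直接在 OMP 临界区(critical section)内执行终端输出。
这里的关键是调整步长,避免更新过于频繁,因为临界区的加锁(原子加载/存储的开销相对较小)会降低性能。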
using namespace std;
/// Sums a large vector of random doubles with an OpenMP parallel reduction
/// and prints the result.
int main() {
    // Fill the vector with 2^18 samples drawn from
    // uniform_real_distribution<double>(0, 1.0); the engine is seeded from
    // random_device.
    std::vector<double> values;
    std::uniform_real_distribution<double> dist(0, 1.0);
    std::default_random_engine engine { std::random_device {}() };
    std::generate_n(std::back_inserter(values), 1ul << 18,
                    [&dist, &engine] { return dist(engine); });

    // Each thread accumulates into a private copy of `total`; OpenMP combines
    // the copies when the loop ends.
    long double total = 0;
    #pragma omp parallel for reduction(+:total)
    for (std::size_t idx = 0; idx < values.size(); ++idx)
        total += values[idx];

    std::cout << "Done: sum = " << total << "\n";
}
#include <omp.h>
#include <vector>
#include <random>
#include <algorithm>
#include <iterator>
#include <functional>
#include <iostream>
#include <iomanip>
using namespace std;
/// Sums a large vector of random doubles with an OpenMP reduction while
/// periodically printing how many fixed-size steps have been completed.
///
/// Every thread counts its own iterations in `local_count`; each time it has
/// finished `step_size` of them it bumps the shared `steps_completed` counter.
/// Every 100th bump (values 1, 101, 201, ...) prints a progress line.
int main() {
    std::vector<double> v;
    // generate some data
    std::generate_n(std::back_inserter(v), 1ul << 18,
                    std::bind(std::uniform_real_distribution<double>(0, 1.0),
                              std::default_random_engine { std::random_device {}() }));

    auto step_size = 100ul;
    // Upper bound on the number of steps: threads only report whole steps of
    // their own share, so the counter may finish slightly below this value.
    auto total_steps = v.size() / step_size + 1;
    std::size_t steps_completed = 0;
    long double sum = 0;

    #pragma omp parallel
    {
        std::size_t local_count = 0;

        #pragma omp for reduction(+:sum)
        for (std::size_t i = 0; i < v.size(); i++)
        {
            sum += v[i];

            if (local_count++ % step_size == step_size - 1)
            {
                // Increment and read the shared counter in ONE atomic
                // operation. Reading it again after a plain atomic increment
                // would race with other threads and could print a milestone
                // twice or skip it altogether.
                std::size_t completed_now;
                #pragma omp atomic capture
                completed_now = ++steps_completed;

                if (completed_now % 100 == 1)
                {
                    // Serialise terminal output so lines are not interleaved.
                    #pragma omp critical
                    std::cout << "Progress: " << completed_now << " of " << total_steps
                              << " (" << std::fixed << std::setprecision(1)
                              << (100.0 * completed_now / total_steps) << "%)\n";
                }
            }
        }
    }

    std::cout << "Done: sum = " << sum << "\n";
}
我下面的代码与前一个回答类似,但有几点不同:它不依赖“恰好相等”(包括取模后相等)的判断,因此即使某些报告点被跳过也能正确处理。此外,全局计数器汇总了所有线程实际执行的循环次数,但数值可能不完全精确——对这个具体问题来说是可以接受的。我只用主线程来报告进度。
// Sketch of a progress counter for an OpenMP loop (placeholders in <...> must
// be filled in; requires <omp.h> for omp_get_thread_num()).
//
// Each thread batches its finished iterations in `local_count` and flushes
// them to the shared `count` once a batch is full. Only thread 0 reports.
const size_t size = ...
const size_t step_size = size / 100;              // report roughly every 1%
const size_t nThreads = ...
const size_t local_count_max = step_size / nThreads;  // per-thread batch size
size_t count = 0;                                  // shared: iterations flushed so far
#pragma omp parallel num_threads(nThreads)
{
    size_t reported_count = 0;   // value of `count` at the last report (used by thread 0)
    size_t local_count = 0;      // iterations finished since this thread's last flush
    #pragma omp for
    for (size_t i = 0; i < size; ++i)
    {
        <... do some useful work ...>
        // -------------------------- update local and global progress counters
        // Count the current iteration first, then flush what was actually
        // done. This neither drops the iteration that triggers the flush nor
        // stalls when local_count_max is 0 (step_size < nThreads).
        ++local_count;
        if (local_count >= local_count_max)
        {
            #pragma omp atomic
            count += local_count;
            local_count = 0;
        }
        // ------------------------------ report progress (in master thread only)
        // `#pragma omp master` may not be closely nested inside a worksharing
        // (`omp for`) region, so test the thread number instead.
        if (omp_get_thread_num() == 0)
        {
            // Other threads update `count` concurrently: read it atomically.
            size_t count_now;
            #pragma omp atomic read
            count_now = count;
            if (count_now - reported_count >= step_size)
            {
                <... report the progress ...>
                reported_count = count_now;
            }
        }
    }
}
const size\u t size=。。。
常数大小t步长大小=大小/100;
常量大小\u t n读取=。。。
const size\u t local\u count\u max=步长/n次读取;
大小\u t计数=0;
#pragma omp并行num_线程(n线程)
{
报告的大小\u计数=0;
大小\u t本地\u计数=0;
#pragma omp for
对于(大小i=0;i=本地计数最大值)
{
#布拉格omp原子
计数+=本地计数\u最大值;
本地_计数=0;
}
其他的
{
++本地(u)计数;;
}
//------------------------------------报告进度(仅在主线程中)
#pragma-omp-master
如果(计数-报告的计数>=步长大小)
{
报告的计数=计数;
}
}
}
在没有原生原子操作支持的处理器上(甚至在有原生支持的处理器上),像这里其他回答建议的那样使用 #pragma omp atomic,可能会降低程序的速度。
进度指示器的概念是让用户知道什么时候事情会完成。如果你在目标上加上/减去总运行时间的一小部分,用户就不会太烦恼。也就是说,用户希望事情更快完成,而不是以更准确地知道事情何时完成为代价
出于这个原因,我通常只跟踪单个线程的进度,并用它来估计总进度。当每个线程的工作量相近时,这种做法效果很好。由于您使用的是 #pragma omp parallel for,您处理的很可能是一系列彼此独立、性质相似的元素,所以我的假设很可能适用于您的用例。
我把这段逻辑封装在一个名为 ProgressBar 的类中,通常将它与辅助类 Timer 一起放在同一个头文件里。该类使用 ANSI 控制序列来保持输出美观。
输出如下所示:
[====== ] (12% - 22.0s - 4 threads)
只需在编译时指定 -DNOPROGRESS 标志,就可以很容易地让编译器消除进度条的全部开销。
代码和示例用法如下所示:
#include <chrono>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>
#ifdef _OPENMP
///Multi-threading - yay!
#include <omp.h>
#else
///Macros used to disguise the fact that we do not have multithreading enabled.
#define omp_get_thread_num() 0
#define omp_get_num_threads() 1
#endif
///@brief Stopwatch that accumulates elapsed time over start/stop cycles.
///
///Handy for measuring how long a function runs or how long I/O has taken.
///All times are reported in seconds as `double`.
class Timer{
  private:
    using clock  = std::chrono::high_resolution_clock;
    using second = std::chrono::duration<double, std::ratio<1> >;

    std::chrono::time_point<clock> start_time;  ///< Moment of the most recent start()
    double accumulated_time;                    ///< Sum of all completed start/stop intervals
    bool running;                               ///< True between start() and stop()

  public:
    ///Creates a stopped timer with no accumulated time.
    Timer() : accumulated_time(0), running(false) {}

    ///Starts the timer.
    ///@throws std::runtime_error if the timer is already running.
    void start(){
      if(running){
        throw std::runtime_error("Timer was already started!");
      }
      running = true;
      start_time = clock::now();
    }

    ///Stops the timer and adds the current interval to the accumulated time.
    ///@return The accumulated time in seconds.
    ///@throws std::runtime_error if the timer is not running.
    double stop(){
      if(!running){
        throw std::runtime_error("Timer was already stopped!");
      }
      const double interval = lap();
      accumulated_time += interval;
      running = false;
      return accumulated_time;
    }

    ///@return The accumulated time in seconds.
    ///@throws std::runtime_error if the timer is still running.
    double accumulated(){
      if(running){
        throw std::runtime_error("Timer is still running!");
      }
      return accumulated_time;
    }

    ///@return Seconds elapsed since the most recent start().
    ///@throws std::runtime_error if the timer is not running.
    double lap(){
      if(!running){
        throw std::runtime_error("Timer was not started!");
      }
      const auto elapsed = clock::now() - start_time;
      return std::chrono::duration_cast<second>(elapsed).count();
    }

    ///Stops the timer and clears its accumulated time. Never throws.
    void reset(){
      running = false;
      accumulated_time = 0;
    }
};
///@brief Manages a console-based progress bar to keep the user entertained.
///
///Defining the global `NOPROGRESS` will disable all progress operations,
///potentially speeding up a program. The look of the progress bar is shown in
///ProgressBar.hpp.
///
///Only the thread for which omp_get_thread_num() returns 0 touches the bar;
///calls from any other thread return immediately. Overall progress is
///estimated by scaling that thread's work by omp_get_num_threads().
class ProgressBar{
  private:
    uint32_t total_work  = 0;  ///< Total work to be accomplished
    uint32_t next_update = 0;  ///< Next point to update the visible progress bar
    uint32_t call_diff   = 0;  ///< Interval between updates in work units
    uint32_t work_done   = 0;  ///< Work units completed so far by thread 0
    uint16_t old_percent = 0;  ///< Old percentage value (aka: should we update the progress bar) TODO: Maybe that we do not need this
    Timer    timer;            ///< Used for generating ETA

    ///Clear current line on console so a new progress bar can be written
    void clearConsoleLine() const {
      std::cerr<<"\r\033[2K"<<std::flush;
    }

  public:
    ///@brief Start/reset the progress bar.
    ///@param total_work  The amount of work to be completed, usually specified in cells.
    void start(uint32_t total_work){
      timer = Timer();
      timer.start();
      this->total_work = total_work;
      next_update      = 0;
      call_diff        = total_work/200;
      old_percent      = 0;
      work_done        = 0;
      clearConsoleLine();
    }

    ///@brief Update the visible progress bar, but only if enough work has been done.
    ///
    ///Define the global `NOPROGRESS` flag to prevent this from having an
    ///effect. Doing so may speed up the program's execution.
    ///@param work_done0  Work units completed so far by the calling thread.
    void update(uint32_t work_done0){
      //Provide simple way of optimizing out progress updates
      #ifdef NOPROGRESS
        return;
      #endif

      //Quick return if this isn't the main thread
      if(omp_get_thread_num()!=0)
        return;

      //Update the amount of work done
      work_done = work_done0;

      //Quick return if insufficient progress has occurred
      if(work_done<next_update)
        return;

      //Update the next time at which we'll do the expensive update stuff
      next_update += call_diff;

      //Nothing sensible to draw for an empty (or not yet started) job; this
      //also avoids dividing by zero below
      if(total_work==0)
        return;

      //Compute the percentage in 64 bits so the product cannot wrap, and clamp
      //to 100 BEFORE narrowing. Narrowing first (as a uint8_t cast would) turns
      //e.g. 300% into 44%.
      uint64_t scaled = static_cast<uint64_t>(work_done)
                      * static_cast<uint64_t>(omp_get_num_threads())
                      * 100 / total_work;
      if(scaled>100)
        scaled = 100;

      //Use a uint16_t because using a uint8_t will cause the result to print as a
      //character instead of a number
      const uint16_t percent = static_cast<uint16_t>(scaled);

      //In the case that there has been no update (which should never be the case,
      //actually), skip the expensive screen print
      if(percent==old_percent)
        return;

      //Update old_percent accordingly
      old_percent=percent;

      //Print an update string which looks like this:
      //  [================================================  ] (96% - 1.0s - 4 threads)
      std::cerr<<"\r\033[2K["
               <<std::string(percent/2, '=')<<std::string(50-percent/2, ' ')
               <<"] ("
               <<percent<<"% - "
               <<std::fixed<<std::setprecision(1)<<timer.lap()/percent*(100-percent)
               <<"s - "
               <<omp_get_num_threads()<< " threads)"<<std::flush;
    }

    ///Increment by one the work done and update the progress bar
    ProgressBar& operator++(){
      //Quick return if this isn't the main thread
      if(omp_get_thread_num()!=0)
        return *this;

      work_done++;
      update(work_done);
      return *this;
    }

    ///Stop the progress bar. Throws an exception if it wasn't started.
    ///@return The number of seconds the progress bar was running.
    double stop(){
      clearConsoleLine();

      timer.stop();
      return timer.accumulated();
    }

    ///@return Return the time the progress bar ran for.
    ///Throws (via Timer) if the progress bar is still running.
    double time_it_took(){
      return timer.accumulated();
    }

    ///@return Work units recorded so far by thread 0.
    uint32_t cellsProcessed() const {
      return work_done;
    }
};
///Example usage: drives a ProgressBar from a 100-iteration OpenMP loop in
///which every iteration sleeps for one second.
int main(){
  ProgressBar pg;
  pg.start(100);

  //You should use 'default(none)' by default: be specific about what you're
  //sharing
  #pragma omp parallel for default(none) schedule(static) shared(pg)
  for(int i=0;i<100;i++){
    //Only thread 0 actually redraws the bar; other threads return immediately
    pg.update(i);
    std::this_thread::sleep_for(std::chrono::seconds(1));
  }

  //Stop the bar so the last progress line is cleared from the terminal and
  //the timer is halted, then report how long the loop took
  std::cout<<"Finished in "<<pg.stop()<<"s\n";
}
#包括
#包括
#包括
#包括
#包括
#ifdef\u OPENMP
///多线程-耶!
#包括
#否则
///宏用来掩盖我们没有启用多线程的事实。
#定义omp\u get\u thread\u num()0
#定义omp_get_num_线程()1
#恩迪夫
///@用于在代码中计算时间间隔的简要说明。
///
///例如,给定函数运行所需的时间,或I/O所需的时间。
班级计时器{
私人:
typedef std::chrono::高分辨率时钟;
typedef std::chrono::持续时间秒;
std::chrono::time\u point start\u time;//<上次启动计时器的时间
双累计_时间;//<自创建以来的累计运行时间
bool running;//<计时器运行时为True
公众:
计时器(){
累计_时间=0;
运行=错误;
}
///启动计时器。如果计时器已在运行,则引发异常。
void start(){
如果(正在运行)
抛出std::runtime_错误(“计时器已启动!”);
运行=真;
开始时间=时钟::现在();
}
///停止计时器。如果计时器已停止,则引发异常。
///调用此选项会增加计时器的累计时间。
///@返回以秒为单位的累计时间。
双止点(){
如果(!正在运行)
抛出std::runtime_错误(“计时器已停止!”);
累计_时间+=圈();
运行=错误;
返回累计时间;
}
///返回计时器的累计时间。如果计时器为空,则引发异常
///跑步。
双倍累积(){
如果(正在运行)
抛出std::runtime_错误(“计时器仍在运行!”);
返回累计时间;
}
///返回计时器启动与当前时间之间的时间
///如果计时器未运行,则引发异常。
双圈(){
如果(!正在运行)
抛出std::runtime_错误(“计时器未启动!”);
返回std::chrono::duration_cast(clock::now()-start_time).count();
}
///停止计时器并重置其累计时间。不会引发异常
///永远。
无效重置(){
累计_时间=0;
运行=错误;
}
};
///@brief管理一个基于控制台的进度条,以保持用户的娱乐。
///
///定义全球“无进展”将
///禁用所有进度操作,可能会加快程序的速度
///进度条的路径显示在ProgressBar.hpp中。
类进度条{
私人:
uint32总工作量;//<待完成的总工作量
uint32\u t next\u update;//<更新可见进度条的下一点
uint32\u t调用\u diff;//<工作单元中更新之间的间隔
uint32未完成的工作;
uint16旧百分比;旧百分比值(又名: