C++ OpenMP 并行化以提高构造性能
第一种方法(并行化内环)。注:原帖中第一种方法的代码在页面提取时被截断——原代码以 for(j=0; j<...) 开头,"<" 之后的内容丢失;完整的问题代码见下方 collapse(2) 版本。
用以下示例观察 ordered 子句的行为:
// Method A: parallelize only the INNER loop. Each outer iteration opens a
// fresh parallel region; the 4 inner iterations land on different threads,
// and `ordered` only serializes the cout within that one inner loop —
// so the threads' work itself still overlaps.
for(int x = 0; x < 4; ++x)
#pragma omp parallel for ordered
for(int y = 0; y < 4; ++y)
// `ordered` region: emits output in sequential y-order within this x.
#pragma omp ordered
cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
每个线程只需等待一些cout
即可并行完成所有工作。
但是:
// Method B: parallelize only the OUTER loop. Thread k owns outer iteration
// x == k, but `ordered` forces output in sequential (x,y) order — thread 1
// cannot print until thread 0 has finished ALL of its inner iterations,
// so execution is effectively serialized.
#pragma omp parallel for ordered
for(int x = 0; x < 4; ++x)
for(int y = 0; y < 4; ++y)
#pragma omp ordered
cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
因此线程1
必须等待线程0
完成所有工作,然后才能第一次执行,并且几乎无法并行执行任何操作
尝试将 schedule(static, 1) 添加
到 collapse 版本,它的性能应该至少与第一个版本一样好。您从方法 2 得到正确的输出吗?变量 y
和 prob
在那里可能也应当是私有的。对不起,它们在那里确实是私有的——刚刚编辑过了。内循环和外循环的迭代次数(trip count)各是多少?如果省略 ordered
指令及其包含的代码行,计时会是多少?如果交换内循环和外循环又会怎么样?我看到您正在访问 psi[i][j]
,却先迭代 j
,然后才是 i
。
// The question's loop: fuse both loops into one iteration space of
// LATTICE_VW * LATTICE_VH and make the per-iteration scratch variables
// thread-private.
// NOTE(review): `ordered` forces the `out <<` statement to run in strict
// sequential iteration order. With the default static schedule each thread
// gets one large contiguous chunk of the collapsed range, so thread k+1
// spends most of its time waiting on thread k — add schedule(static, 1)
// (round-robin chunks of 1) to overlap the non-ordered work.
// NOTE(review): psi is indexed [i][j] but j is the OUTER loop variable,
// which presumably strides across rows on every inner step — confirm the
// memory layout before/if swapping the loops for better locality.
#pragma omp parallel for collapse(2) ordered private(x, y, prob)
for(j=0; j<LATTICE_VW; ++j) {
for(i=0; i<LATTICE_VH; ++i) {
x = j*DX + LATTICE_W;
y = i*DY + LATTICE_S;
prob = psi[i][j].norm();
// Serialization point: output must appear in (j, i) iteration order.
#pragma omp ordered
out << x << " " << y << " " << prob << endl;
}
}
// Demo run of Method A (inner-loop parallel): the output below shows each
// x-batch of four y-values produced by four different threads.
for(int x = 0; x < 4; ++x)
#pragma omp parallel for ordered
for(int y = 0; y < 4; ++y)
#pragma omp ordered
cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
0,0 (by thread 0)
0,1 (by thread 1)
0,2 (by thread 2)
0,3 (by thread 3)
1,0 (by thread 0)
1,1 (by thread 1)
1,2 (by thread 2)
1,3 (by thread 3)
// Demo run of Method B (outer-loop parallel + ordered): each thread owns one
// x and must wait for all lower-x threads before printing anything.
#pragma omp parallel for ordered
for(int x = 0; x < 4; ++x)
for(int y = 0; y < 4; ++y)
#pragma omp ordered
cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
// Demo run with collapse(2): the 16 collapsed iterations are split by the
// default static schedule into contiguous chunks of 4 per thread, and
// `ordered` makes each chunk print strictly after the previous one — the
// output below shows the resulting thread-by-thread serialization.
#pragma omp parallel for collapse(2) ordered
for(int x = 0; x < 4; ++x)
for(int y = 0; y < 4; ++y)
#pragma omp ordered
cout << x << ',' << y << " (by thread " << omp_get_thread_num() << ')' << endl;
0,0 (by thread 0)
0,1 (by thread 0)
0,2 (by thread 0)
0,3 (by thread 0)
1,0 (by thread 1)
1,1 (by thread 1)
1,2 (by thread 1)
1,3 (by thread 1)
2,0 (by thread 2)
2,1 (by thread 2)
2,2 (by thread 2)
2,3 (by thread 2)
3,0 (by thread 3)
3,1 (by thread 3)
3,2 (by thread 3)
3,3 (by thread 3)