OpenMP和reduce()
标签:c, openmp, numerical-methods
我有3个简单的函数:一个是对照(control)函数,另外2个函数用 OpenMP 以略微不同的方式完成同样的计算。但是函数 thread1 给出的结果与 thread2 和 control 不同,我不知道为什么。代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
float function(float x){
return pow(x,pow(x,sin(x)));
}
/*
 * Serial reference implementation: composite trapezoidal rule for function()
 * over [begin, end] using `count` equal-width slices, summed left to right
 * in single precision.
 *
 * begin, end  integration bounds
 * count       number of slices; for count <= 0 the loop body never runs and
 *             0 is returned
 *
 * The slice index is an int rather than a float: a float counter can stop
 * advancing once it reaches 2^24 (adding 1 no longer changes the value), so
 * a float-indexed loop may never terminate for very large counts.
 */
float integrate(float begin, float end, int count){
    float score = 0;
    const float width = (end-begin)/(1.0*count);

    for (int i = 0; i < count; i++) {
        /* area of slice i: mean of the two edge values times the width */
        score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }
    return score;
}
/*
 * Trapezoidal-rule integral of function() over [begin, end] with `count`
 * slices, parallelised with an OpenMP `parallel for` and a `+` reduction
 * on the running sum.
 *
 * Each slice area is first rounded to float and then added to the (per
 * thread) partial sum; the order in which partial sums are combined is up
 * to the OpenMP runtime.
 */
float thread1(float begin, float end, int count){
    const float width = (end-begin)/(1.0*count);
    float score = 0;
    int i;

    #pragma omp parallel for reduction(+:score) shared(count)
    for (i = 0; i < count; i++) {
        /* loop-local, hence private to each iteration/thread */
        const float slice = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
        score += slice;
    }
    return score;
}
/*
 * Trapezoidal-rule integral of function() over [begin, end] with `count`
 * slices. The slice areas are computed in parallel into a temporary array
 * and then summed serially from left to right, so the summation order is
 * the same as in a purely serial loop.
 *
 * begin, end  integration bounds
 * count       number of slices; for count <= 0 the result is 0
 *
 * The temporary buffer is owned by this function and released before
 * returning. If the buffer cannot be allocated, the integral is computed
 * serially without it (same terms, same order).
 */
float thread2(float begin, float end, int count){
    float score = 0;
    int i;

    /* A non-positive count would turn into a huge size_t in the malloc call. */
    if (count <= 0)
        return score;

    const float width = (end-begin)/(1.0*count);
    float *tab = malloc((size_t)count * sizeof *tab);

    if (tab == NULL) {
        /* Out of memory: fall back to a buffer-free serial evaluation. */
        for (i = 0; i < count; i++) {
            const float slice = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
            score += slice;
        }
        return score;
    }

    #pragma omp parallel for
    for (i = 0; i < count; i++) {
        tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }

    /* Serial accumulation keeps the left-to-right order deterministic. */
    for (i = 0; i < count; i++) {
        score += tab[i];
    }

    free(tab);
    return score;
}
/*
 * Executes the x86 RDTSC instruction and returns its 64-bit result.
 * The instruction delivers the value split across two 32-bit registers:
 * EAX holds the low half and EDX the high half.
 */
unsigned long long int rdtsc(void){
    unsigned lo, hi;

    __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
    return (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
}
/*
 * Benchmark driver.
 *
 * Usage: prog <begin> <end> <count>
 *
 * Runs integrate(), thread1() and thread2() on the same interval and prints
 * each result followed by the rdtsc() tick difference measured around the
 * call. The arguments are parsed and validated once up front; missing or
 * malformed arguments produce a usage message and a failure exit status
 * instead of dereferencing a NULL argv entry.
 */
int main(int argc, char** argv){
    unsigned long long counter = 0;
    unsigned long long elapsed;
    float result;
    char *tail;

    if (argc < 4) {
        fprintf(stderr, "usage: %s <begin> <end> <count>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "prog");
        return EXIT_FAILURE;
    }

    const float begin = (float)strtod(argv[1], &tail);
    if (tail == argv[1] || *tail != '\0') {
        fprintf(stderr, "invalid <begin>: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    const float end = (float)strtod(argv[2], &tail);
    if (tail == argv[2] || *tail != '\0') {
        fprintf(stderr, "invalid <end>: %s\n", argv[2]);
        return EXIT_FAILURE;
    }

    const long parsed_count = strtol(argv[3], &tail, 10);
    const int count = (int)parsed_count;
    /* Reject garbage, non-positive counts and values that do not fit an int. */
    if (tail == argv[3] || *tail != '\0' || parsed_count <= 0 || (long)count != parsed_count) {
        fprintf(stderr, "invalid <count>: %s\n", argv[3]);
        return EXIT_FAILURE;
    }

    /* The tick counter is stopped before printing so that only the
     * computation itself is measured, not the output formatting. */
    counter = rdtsc();
    result = integrate(begin, end, count);
    elapsed = rdtsc() - counter;
    printf("control: %f \n ", result);
    printf("control count: %llu \n", elapsed);

    counter = rdtsc();
    result = thread1(begin, end, count);
    elapsed = rdtsc() - counter;
    printf("thread1: %f \n ", result);
    printf("thread1 count: %llu \n", elapsed);

    counter = rdtsc();
    result = thread2(begin, end, count);
    elapsed = rdtsc() - counter;
    printf("thread2: %f \n ", result);
    printf("thread2 count: %llu \n", elapsed);

    return 0;
}
更新:
好的,我试着让它运行得更快,避免把每个区间端点处的函数值计算两次:
/*
 * Trapezoidal-rule integral of function() over [begin, end] with `count`
 * slices, evaluating each interior grid point only once per chunk.
 *
 * The slices are split into one contiguous chunk of k = count / num_threads
 * slices per thread; inside a chunk the right edge value of one slice is
 * reused as the left edge value of the next. The count % num_threads
 * leftover slices are handled by a second work-shared loop.
 *
 * begin, end  integration bounds
 * count       number of slices; for count <= 0 the result is 0
 *
 * All per-thread state (chunk size, loop indices, edge values) is declared
 * inside the parallel region so that it is private to each thread; sharing
 * the inner loop index between threads would be a data race.
 *
 * Note: function(), as defined in this file, takes and returns float, so
 * the abscissae and function values are narrowed to single precision even
 * though the accumulation here is done in double.
 */
double thread3(double begin, double end, int count){
    double score = 0;

    if (count <= 0)
        return score;

    const double width = (end-begin)/(1.0*count);

    #pragma omp parallel
    {
        const int thread_num = omp_get_num_threads();
        const int k = count / thread_num;   /* slices per chunk */
        int i;

        #pragma omp for reduction(+:score)
        for (i = 0; i < thread_num; i++) {
            const int first = i * k;
            const int last = first + k;
            double yp, yk;
            int j;

            /* Fewer slices than threads: the chunks are empty and every
             * slice is picked up by the remainder loop below. */
            if (k <= 0)
                continue;

            yk = function(begin + first*width);
            for (j = first; j < last; j++) {
                yp = yk;                               /* reuse right edge as next left edge */
                yk = function(begin + (j+1)*width);
                score += (yp + yk) * width / 2.0;
            }
        }

        /* Leftover slices that did not fit into the equal-sized chunks. */
        #pragma omp for reduction(+:score)
        for (i = k*thread_num; i < count; i++)
            score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
    }
    return score;
}
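double thread3(double begin, double end, int count){
double score = 0 , width = (end-begin)/(1.0*count), yp, yk;
int i,j, k;
#pragma omp parallel private (yp,yk)
{
int thread_num = omp_get_num_threads();
k = count / thread_num;
#pragma omp for private(i) reduction(+:score)
for(i=0; i<thread_num; i++){
/* ……(这段重复的代码在抓取时被截断,完整版本见上文的 thread3) */

回答:
你积分的是一个峰值非常尖锐的函数——x^(x^sin(x))——它在积分区间内的取值跨越了 7 个数量级。这大约正是 32 位浮点数精度的极限,因此结果会依赖于各项求和的顺序。这不是 OpenMP 的问题,而是数值敏感性的问题。
例如,考虑下面这段完全串行的代码,它计算的是同一个积分: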
#include <stdio.h>
#include <math.h>
float function(float x){
return pow(x,pow(x,sin(x)));
}
/*
 * Area of trapezoid number `slice` on a grid that starts at `begin` and has
 * spacing `width`. The expression is kept exactly as in the summation loops
 * it serves (float edge values, final division by the double constant 2.0).
 */
static double slice_area(float begin, float width, int slice){
    return (function(begin+(slice*width)) + function(begin+(slice+1)*width)) * width/2.0;
}

/*
 * Demonstrates that single-precision summation of the same trapezoid areas
 * depends on the order of the additions. The integral of function() over
 * [3, 13] with 100000 slices is accumulated in three different orders and
 * each total is printed.
 */
int main(int argc, char **argv) {
    const float begin=3., end=13.;
    const int count = 100000;
    const float width=(end-begin)/(1.*count);
    float integral1=0., integral2=0., integral3=0.;
    int k;

    /* ascending slice index */
    for (k = 0; k < count; ++k) {
        integral1 += slice_area(begin, width, k);
    }

    /* descending slice index */
    for (k = count-1; k >= 0; --k) {
        integral2 += slice_area(begin, width, k);
    }

    /* starting at the centre: first the upper half in ascending order,
     * then the lower half in descending order */
    for (k = count/2; k < count; ++k) {
        integral3 += slice_area(begin, width, k);
    }
    for (k = count/2-1; k >= 0; --k) {
        integral3 += slice_area(begin, width, k);
    }

    printf("Left to right: %lf\n", integral1);
    printf("Right to left: %lf\n", integral2);
    printf("Centre outwards: %lf\n", integral3);
    return 0;
}
运行这段代码,三种求和顺序给出了三个不同的结果(分别为 5407308.5、5407430.0 和 5407335.5)——这种差异和你看到的一样。用两个线程求和必然会改变求和的顺序,因此你的答案也会随之改变。
这里有几个选择。如果这只是一个测试问题,而这个函数并不代表你实际要积分的对象,那么你可能不必在意这点差异。否则,换用不同的数值方法可能会有所帮助。
但这里也有一个简单的解决方案:这些数值的跨度超出了 float(单精度)的精度所能覆盖的范围,使得答案对求和顺序非常敏感;但它完全落在 double(双精度)的精度范围之内,因此改用双精度后问题就轻得多。请注意,改用 double 并不是解决所有问题的灵丹妙药;在某些情况下,它只是推迟了问题,或者让你掩盖了数值方法中的缺陷。但在这里,它确实很好地解决了根本问题。把上面所有的 float 改为 double 后,输出为:
$ ./reduce
Left to right: 5407589.272885
Right to left: 5407589.272885
Centre outwards: 5407589.272885
另一方面,如果需要在区间 (18,23) 上对这个函数积分,即使是双精度也救不了你。
(评论)我添加了 numerical-methods 标签,因为这才是这里的根本问题。
(评论)顺便说一句,你可以使用 omp_get_wtime() 或 omp_get_wtick() 来代替 rdtsc()。
下面依次是 float 版本和 double 版本的输出:
$ ./reduce
Left to right: 5407308.500000
Right to left: 5407430.000000
Centre outwards: 5407335.500000
$ ./reduce
Left to right: 5407589.272885
Right to left: 5407589.272885
Centre outwards: 5407589.272885