C: When parallelizing code with OpenMP, which variables should be private and/or firstprivate, and when is that appropriate?

My task is to parallelize this function so that it runs faster than the sequential version, but the #pragma omp parallel statements I have tried do not seem to have any substantial effect.

The sequential version of this code is essentially identical apart from the #pragma statements. I realize the code is badly written; that is part of the assignment, and the goal is to achieve an 8x speedup. The Linux machine the code runs on is an 8-core system with hyperthreading.

The runtime is measured via the output of the following lines of code:

    clock_gettime(CLOCK_MONOTONIC, &start);
    work_it_par(original, new);
    clock_gettime(CLOCK_MONOTONIC, &finish);
Similar code calls the sequential version of the same function, and the speedup is then computed as sequential time / parallel time. However, my results seem highly inconsistent, and I cannot get the speedup much above 1.5x.
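
For reference, the elapsed time and the speedup can be computed from the two clock_gettime samples like this (the seq_*/par_* names in the usage comments are placeholders, not variables from the original code):

    #include <time.h>

    /* Wall-clock seconds between two CLOCK_MONOTONIC samples. */
    static double elapsed_seconds(struct timespec start, struct timespec finish) {
        return (double)(finish.tv_sec - start.tv_sec)
             + (double)(finish.tv_nsec - start.tv_nsec) / 1e9;
    }

    /* With one (start, finish) pair per run:                                  */
    /*   double seq = elapsed_seconds(seq_start, seq_finish);  // hypothetical */
    /*   double par = elapsed_seconds(par_start, par_finish);  // hypothetical */
    /*   printf("speedup: %.2fx\n", seq / par);                                */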

void work_it_par(long *old, long *new) {
    int i, j, k;
    int u, v, w;
    long compute_it;
    long aggregate=1.0;
    long weNeedTheFunc = we_need_the_func();
    long gimmieTheFunc = gimmie_the_func();
    int marker = DIM-1;
    #pragma omp parallel for private(i, j, k, compute_it)
    for (i=1; i<marker; i++) {
        for (j=1; j<marker; j++) {
            for (k=1; k<marker; k++) {
                compute_it = old[i*DIM*DIM+j*DIM+k] * weNeedTheFunc;
                aggregate+= compute_it / gimmieTheFunc;
            }
        }
    }
    printf("AGGR:%ld\n",aggregate);
//#pragma omp parallel for private(i, j, u, v)
    for (i=1; i<marker; i++) {
#pragma omp parallel for private(k)
    for (j=1; j<marker; j++) {
        for (k=1; k<marker; k++){
            new[i*DIM*DIM+j*DIM+k]=0;
            for (u=-1; u<=1; u++) {
                for (v=-1; v<=1; v++) {
                    for (w=-1; w<=1; w++) {
                        new[i*DIM*DIM+j*DIM+k]+=old[(i+u)*DIM*DIM+(j+v)*DIM+(k+w)];
                    }
                }
            }
        new[i*DIM*DIM+j*DIM+k]/=27;
      }
    }
  }
#pragma omp parallel for private(i, j)
    for (i=1; i<marker; i++) {
//#pragma omp parallel for private(k)
        for (j=1; j<marker; j++) {
            for (k=1; k<marker; k++) {
                u=(new[i*DIM*DIM+j*DIM+k]/100);
                if (u<=0) u=0;
                if (u>=9) u=9;
                histogrammy[u]++;
             }
         }
    }
}

First of all, your code is wrong in many places. At a first glance I counted 7 race conditions.

I suggest applying the following general rules:

  • Declare variables as locally as possible. This is easier to get right than trying to figure out which variables need to be private. Declaring variables as const also helps make clear that they are safe to share.

  • If you sum onto a variable inside a parallel loop, use a reduction clause.

  • Applying these principles to the first loop looks like this:

    #pragma omp parallel for reduction(+:aggregate)
    for (int i=1; i<marker; i++) {
        for (int j=1; j<marker; j++) {
            for (int k=1; k<marker; k++) {
                long compute_it = old[i*DIM*DIM+j*DIM+k] * weNeedTheFunc;
                aggregate+= compute_it / gimmieTheFunc;
            }
        }
    }
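
As for the firstprivate part of the question: private gives every thread its own uninitialized copy of a variable, while firstprivate gives every thread a copy initialized with the value the variable held just before the parallel region. None of the scalars in this code actually need firstprivate (they are either loop counters or are written before being read), but a minimal, made-up example of the difference looks like this (firstprivate_demo and base are invented for illustration):

    #include <stdio.h>
    #include <omp.h>

    void firstprivate_demo(void) {
        long base = 100;                        /* value set before the region */
        #pragma omp parallel firstprivate(base)
        {
            /* each thread owns a copy of base initialized to 100; with        */
            /* private(base) the copy would start uninitialized, and with a    */
            /* shared base the += below would be a data race                   */
            base += omp_get_thread_num();
            printf("thread %d: base = %ld\n", omp_get_thread_num(), base);
        }
    }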
    

Once you have made sure the code is correct and want to look at its performance, consider the following: use a performance analysis tool that is aware of OpenMP/threads. If you want to discuss actual performance on Stack Overflow, you must:

  • Include a minimal reproducible example, including how you build it (see also the sanity check after this list)
  • Describe your specific performance measurement methodology
  • Include your specific performance measurement results
  • Describe your system (CPU, compiler version)
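
One related sanity check on the build itself: if OpenMP is not enabled at compile time (for example, a missing -fopenmp with GCC), the pragmas are silently ignored and the code stays sequential. A minimal way to verify that OpenMP is actually active:

    #include <stdio.h>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    int main(void) {
    #ifdef _OPENMP
        printf("OpenMP enabled, up to %d threads\n", omp_get_max_threads());
    #else
        printf("OpenMP is NOT enabled in this build\n");
    #endif
        return 0;
    }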

  • In general, you have to think carefully about whether a variable needs to be private whenever you assign to it; otherwise, races between threads lead to random erroneous behavior. This happens in several places in your code:
    compute_it = … (already a private variable)
    aggregate += … (a special case that needs a reduction)
    u = … (should be private)
    histogrammy[u] += … (again a reduction problem)

    Assigning to an array element can also be a problem, but that depends on the index. If the index depends on the thread and differs between threads, it is usually correct, apart from possible false sharing. This is the case for most of the array assignments here. For example, in
    new[i*DIM*DIM+j*DIM+k]=…
    all threads have a different i (thanks to the parallel for), different parts of the array are touched, and there is no specific parallelism problem.
    The situation is different for the assignment to
    histogrammy[u]
    because u depends on the data and can be identical in different threads. It can be managed with a reduction in newer OpenMP versions; otherwise you must accumulate locally and, at the end of each thread, update the global array in a properly protected region.
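
    As an illustration of the "reduction in newer OpenMP versions" route, a minimal sketch for the third loop, assuming a compiler with OpenMP 4.5 or later (needed for the array-section reduction) and histogrammy being a global array of 10 elements as in the question:

    #pragma omp parallel for reduction(+:histogrammy[0:10])
    for (int i = 1; i < marker; i++) {
        for (int j = 1; j < marker; j++) {
            for (int k = 1; k < marker; k++) {
                int u = new[i*DIM*DIM + j*DIM + k] / 100;
                if (u <= 0) u = 0;
                if (u >= 9) u = 9;
                histogrammy[u]++;   /* each thread updates its own copy; copies are summed at the end */
            }
        }
    }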

    Here is a modified version of your code (untested, since you did not provide a working example). I also added some comments and modifications unrelated to the parallelization. Check the comments marked with triple slashes ///.

    void work_it_par(long *old, long *new) {
        int i, j, k;
        int u, v, w;
        long compute_it;
        long aggregate=1.0;    //// really?
                               //// I am really surprised that you use a long. 
                               ////   A double seems more appropriate
        long weNeedTheFunc = we_need_the_func();
        long gimmieTheFunc = gimmie_the_func();
        int marker = DIM-1; 
    ///    #pragma omp parallel for private(i, j, k, compute_it)
    #   pragma omp parallel for private(i, j, k, compute_it) reduction(+:aggregate) 
                        /// introduced a reduction on aggregate
        for (i=1; i<marker; i++) {
            for (j=1; j<marker; j++) {
                for (k=1; k<marker; k++) {
                    compute_it = old[i*DIM*DIM+j*DIM+k] * weNeedTheFunc;  
                    /// aggregate+= compute_it / gimmieTheFunc; /// race on shared var aggregate 
                                                                /// solved by the reduction
                    aggregate += compute_it ; /// Unrelated to parallel processing, 
                                              /// but do not do a division in your loop
                                              /// divisions are *expensive* and
                                              /// denominator is always the same
                }
            }
        }
        aggregate /= gimmieTheFunc ; /// now we do the division, but just once
        printf("AGGR:%ld\n",aggregate);
    //#pragma omp parallel for private(i, j, u, v)
        for (i=1; i<marker; i++) {
    #pragma omp parallel for private(k, u, v, w) /// u, v and w are written by every thread, so they must be private as well
        for (j=1; j<marker; j++) {
            for (k=1; k<marker; k++){
                new[i*DIM*DIM+j*DIM+k]=0;
                for (u=-1; u<=1; u++) {  
                    for (v=-1; v<=1; v++) {
                        for (w=-1; w<=1; w++) {
                            new[i*DIM*DIM+j*DIM+k]+=old[(i+u)*DIM*DIM+(j+v)*DIM+(k+w)];
                        }
                    }
                }
            new[i*DIM*DIM+j*DIM+k]/=27;
          }
        }
      }
    ///#pragma omp parallel for private(i, j)
    #pragma omp parallel private(i, j, k, u)  /// parallel region; k must also be private here
      {
        int private_histogrammy[10]; /// used to accumulate in the threads
        for (int ii=0; ii<10; ii++) private_histogrammy[ii]=0;
    #   pragma omp for             /// a parallel for loop in the parallel region
        for (i=1; i<marker; i++) {
            for (j=1; j<marker; j++) {
                for (k=1; k<marker; k++) {
    ///                u=(new[i*DIM*DIM+j*DIM+k]/100);
                    u=(new[i*DIM*DIM+j*DIM+k]); /// to reduce number of divisions
    ///                if (u<=0) u=0;
    ///                if (u>=9) u=9;
    ///                histogrammy[u]++;
                    if (u<=0) private_histogrammy[0]++;
                    else if (u>=900) private_histogrammy[9]++;
                    else private_histogrammy[u/100]++;
                 }
             }
        }
        /// all is done update the global histogrammy
    #   pragma omp critical
        /// update the shared array
        /// probably faster with a critical section that updates globally
        /// the (small) array than using atomic on array elements
        /// but alternatives may be tested
        for (int uu=0; uu<10; uu++) histogrammy[uu] += private_histogrammy[uu];
      }  /// end of parallel region
    }
    
With the code below I achieved roughly a 12x wall-clock speedup. A finer optimization can be obtained by converting all three big loop nests into a single triply nested loop and placing the OpenMP pragmas inside it to control the flow (a rough sketch of the idea follows below):

    void work_it_par(long *old, long *new) {
        int i, j, k;
        int i1, j1, k1;
        int u, v, w;
        long compute_it;
        long aggregate = 1.0;
        int N = DIM-1;
        int gimme = gimmie_the_func();
        int need = we_need_the_func();
        #pragma omp parallel for private(i, j, k, compute_it) reduction(+:aggregate) //reduce this part
        for (i=1; i<
    To sum onto aggregate you need an OpenMP reduction. u must be private in the third block. The handling of histogrammy in the third block is also a problem; it has to be done with a reduction or with local variables. Those are some obvious problems in your code.
    Thank you very much, this is very helpful. I think what I was missing was the loop tiling; it made such a big difference beyond the #pragmas.
    Just don't forget to mark this as the answer and upvote :).
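
The code of that last answer was cut off above, so what follows is only a rough sketch of the loop-fusion idea it describes: one pass over the volume that computes the sum, the 27-point average and the histogram together inside a single parallel region. It reuses the names from the question (DIM, histogrammy, we_need_the_func, gimmie_the_func); the per-thread local_hist array and the exact way the loops are fused here are assumptions, not the poster's actual code:

    void work_it_par(long *old, long *new) {
        long aggregate = 1;                      /* the original code initializes this to 1.0 */
        long weNeedTheFunc = we_need_the_func();
        long gimmieTheFunc = gimmie_the_func();
        int marker = DIM - 1;

        #pragma omp parallel
        {
            long local_hist[10] = {0};           /* per-thread histogram buckets */

            #pragma omp for reduction(+:aggregate)
            for (int i = 1; i < marker; i++) {
                for (int j = 1; j < marker; j++) {
                    for (int k = 1; k < marker; k++) {
                        int idx = i*DIM*DIM + j*DIM + k;

                        /* first loop: sum, with the division hoisted out of the loop */
                        aggregate += old[idx] * weNeedTheFunc;

                        /* second loop: 27-point box average into new[] */
                        long sum = 0;
                        for (int u = -1; u <= 1; u++)
                            for (int v = -1; v <= 1; v++)
                                for (int w = -1; w <= 1; w++)
                                    sum += old[(i+u)*DIM*DIM + (j+v)*DIM + (k+w)];
                        new[idx] = sum / 27;

                        /* third loop: histogram of the value just computed */
                        int b = (int)(new[idx] / 100);
                        if (b <= 0) b = 0;
                        if (b >= 9) b = 9;
                        local_hist[b]++;
                    }
                }
            }

            #pragma omp critical                 /* merge the per-thread histograms */
            for (int b = 0; b < 10; b++) histogrammy[b] += local_hist[b];
        }

        aggregate /= gimmieTheFunc;
        printf("AGGR:%ld\n", aggregate);
    }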