C++ 如何解决多线程代码中for循环中的依赖关系？_C++_Multithreading_Parallel Processing_Openmp

C++ 如何解决多线程代码中for循环中的依赖关系？

c++ multithreading parallel-processing

C++ 如何解决多线程代码中for循环中的依赖关系？,c++,multithreading,parallel-processing,openmp,C++,Multithreading,Parallel Processing,Openmp,我无法使用OpenMP解决for循环中的依赖项，因此程序将执行得更快。这就是我如何做到的，它的工作原理，但我需要一个更快的解决方案。有人知道怎么做才能更快地工作吗 #pragma omp parallel for num_threads(tc) ordered schedule(dynamic, 1) private(i) shared(openSet, maxVal, current, fScores) for(i = 0;i < openSet.size

我没能用 OpenMP 消除 for 循环中的依赖关系，从而让程序运行得更快。下面是我目前的做法，它可以正常工作，但我需要一个更快的方案。有人知道怎样才能让它运行得更快吗？

        // Scans openSet for entries whose fScores value is below the running
        // maxVal, storing that value in maxVal and the entry in current.
        // The loop is split over tc threads; schedule(dynamic, 1) hands out one
        // iteration at a time.
        #pragma omp parallel for num_threads(tc) ordered schedule(dynamic, 1) private(i) shared(openSet, maxVal, current, fScores)
        for(i = 0;i < openSet.size();i++){
            // NOTE(review): this comparison reads the shared maxVal outside the
            // ordered region, so it can observe a stale value — confirm.
            if(fScores[openSet[i].x * dim + openSet[i].y] < maxVal){
                // There are no braces, so the ordered region covers only the
                // single assignment to maxVal that follows.
                #pragma omp ordered
                maxVal = fScores[openSet[i].x * dim + openSet[i].y];
                // NOTE(review): this write to the shared current is outside the
                // ordered region and is not otherwise synchronized.
                current = openSet[i];
            }
        }

首先，你需要确定，这是你的热点。然后给我们一个合适的测试套件，以确保您实际获得性能。使用“google_benchmark”等工具。确保您是在发布模式下编译的，否则您的度量将被完全破坏

话虽如此，我认为你要找的是 max 归约（reduction）：

 #pragma omp parallel for reduction(max : maxVal )
    for(i = 0;i < openSet.size();i++){
        if(fScores[openSet[i].x * dim + openSet[i].y] > maxVal){

            maxVal = fScores[openSet[i].x * dim + openSet[i].y];

        }
    }

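#pragma omp parallel for reduction(max : maxVal)
for (i = 0; i < openSet.size(); i++) {
    if (fScores[openSet[i].x * dim + openSet[i].y] > maxVal) {
        maxVal = fScores[openSet[i].x * dim + openSet[i].y];
    }
}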

“current”似乎是多余的。我认为这里的比较方向写反了。

您能否以线性方式访问“fScores”中的数据。使用“openSet”上的间接寻址将有大量缓存未命中。如果您能够以某种方式摆脱这种间接方式，那么在单线程和多线程的情况下，您将获得很高的加速比

在第二个循环中，`push_back` 会严重拖慢性能。我也遇到过类似的问题。对我来说，下面的做法非常有效：

创建具有最大可能长度的向量
用空值初始化它
在 OpenMP 并行循环中，按条件为对应位置的元素赋值
使用向量时，检查空值

首先，您需要确保这是您的热点。然后给我们一个合适的测试套件，以确保您实际获得性能。使用“google_benchmark”等工具。确保您是在发布模式下编译的，否则您的度量将被完全破坏

话虽如此，我认为你要找的是 max 归约（reduction）：

 #pragma omp parallel for reduction(max : maxVal )
    for(i = 0;i < openSet.size();i++){
        if(fScores[openSet[i].x * dim + openSet[i].y] > maxVal){

            maxVal = fScores[openSet[i].x * dim + openSet[i].y];

        }
    }

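#pragma omp parallel for reduction(max : maxVal)
for (i = 0; i < openSet.size(); i++) {
    if (fScores[openSet[i].x * dim + openSet[i].y] > maxVal) {
        maxVal = fScores[openSet[i].x * dim + openSet[i].y];
    }
}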

“current”似乎是多余的。我认为这里的比较方向写反了。

在第二个循环中，`push_back` 会严重拖慢性能。我也遇到过类似的问题。对我来说，下面的做法非常有效：

创建具有最大可能长度的向量
用空值初始化它
在 OpenMP 并行循环中，按条件为对应位置的元素赋值
使用向量时，检查空值

将按照循环迭代的顺序执行

reduction

reduction(max:maxVal)

reduction 子句可用于并行执行某些形式的递推（recurrence）计算（…）。

maxVal

current

maxVal

current

 // Quoted loop: updates maxVal and current from openSet entries whose fScores
 // value compares below maxVal, using tc threads and one iteration per chunk.
 #pragma omp parallel for num_threads(tc) ordered schedule(dynamic, 1) private(i) shared(openSet, maxVal, current, fScores)
    for(i = 0;i < openSet.size();i++){
        // NOTE(review): maxVal is read here outside the ordered region.
        if(fScores[openSet[i].x * dim + openSet[i].y] < maxVal){ // <-- you meant '>' not '<'
            // Without braces the ordered region is only the next statement.
            #pragma omp ordered
            maxVal = fScores[openSet[i].x * dim + openSet[i].y];
            // NOTE(review): this write to the shared current is not inside
            // the ordered region.
            current = openSet[i];
        }
    }

将按照循环迭代的顺序执行

 // Quoted loop: updates maxVal and current from openSet entries whose fScores
 // value compares below maxVal, using tc threads and one iteration per chunk.
 #pragma omp parallel for num_threads(tc) ordered schedule(dynamic, 1) private(i) shared(openSet, maxVal, current, fScores)
    for(i = 0;i < openSet.size();i++){
        // NOTE(review): maxVal is read here outside the ordered region.
        if(fScores[openSet[i].x * dim + openSet[i].y] < maxVal){ // <-- you meant '>' not '<'
            // Without braces the ordered region is only the next statement.
            #pragma omp ordered
            maxVal = fScores[openSet[i].x * dim + openSet[i].y];
            // NOTE(review): this write to the shared current is not inside
            // the ordered region.
            current = openSet[i];
        }
    }

    int shared_maxVal[tc] = {INT32_MAX};
    int shared_current[tc] = {0};
    #pragma omp parallel num_threads(tc)
    { 
        int threadID = omp_get_thread_num();
        #pragma omp for shared(openSet, fScores)
        for(int i = 0;i < openSet.size();i++){ 
           if(fScores[openSet[i].x * dim + openSet[i].y] > shared_maxVal[threadID]){
              shared_maxVal[threadID] = fScores[openSet[i].x * dim + openSet[i].y];
              shared_current[threadID] = openSet[i];
           }
        }
    }
 
    for(int i = 0; i < tc; i++){
       if(maxVal < shared_maxVal[i]){
          maxVal = shared_maxVal[i];
          current = shared_current[i];
       }
    }

// For every neighbour, compares a tentative gScore (gScore of current plus 1)
// with the neighbour's stored gScore; when it is lower, records current in
// cameFrom, updates gScores and fScores, and appends the neighbour to openSet
// unless contains() reports it is already there.
#pragma omp parallel for num_threads(tc) ordered schedule(dynamic, 1) private(i) shared(neighbours, openSet, gScores, fScores, tentative_gScore)
for(i = 0;i < neighbours.size();i++){
    // Without braces the ordered region is only the assignment below.
    // NOTE(review): tentative_gScore is listed as shared, so all threads
    // write the same variable.
    #pragma omp ordered
    tentative_gScore = gScores[current.x * dim + current.y] + 1;

    // Everything from here on runs outside the ordered region.
    if(tentative_gScore < gScores[neighbours[i].x * dim + neighbours[i].y]){
        cameFrom[neighbours[i].x * dim + neighbours[i].y] = current;
        gScores[neighbours[i].x * dim + neighbours[i].y] = tentative_gScore;
        fScores[neighbours[i].x * dim + neighbours[i].y] = tentative_gScore + hScore(); //(p.x, p.y, xEnd, yEnd)
        if(contains(openSet, neighbours[i]) == false){
            // NOTE(review): several threads can reach this push_back on the
            // shared openSet at the same time — presumably unsafe if openSet
            // is a std::vector; confirm.
            openSet.push_back(neighbours[i]);
        }
    }
}

    // Create an array of "openSets" let us named "shared_openSet"
    #pragma omp parallel num_threads(tc)
    {  
       int threadID = omp_get_thread_num();
       #pragma omp for shared(neighbours, gScores, fScores)
       for(int i = 0;i < neighbours.size();i++){
        // I just assume the type in but you can change if for the real type
        int tentative_gScore = gScores[current.x * dim + current.y] + 1;

        if(tentative_gScore < gScores[neighbours[i].x * dim + neighbours[i].y]){
            cameFrom[neighbours[i].x * dim + neighbours[i].y] = current;
            gScores[neighbours[i].x * dim + neighbours[i].y] = tentative_gScore;
            fScores[neighbours[i].x * dim + neighbours[i].y] = tentative_gScore + hScore();
            if(contains(openSet, neighbours[i]) == false){
                shared_openSet[threadID].push_back(neighbours[i]);
            }
        }
      }
   }
  // merge all the elements from shared_openSet into openSet.