C 在没有竞争条件或错误共享的情况下,如何使用OpenMP并行化此函数?

C 在没有竞争条件或错误共享的情况下,如何使用OpenMP并行化此函数?,c,performance,parallel-processing,openmp,C,Performance,Parallel Processing,Openmp,我需要并行化一个函数,没有竞争条件,也没有错误的共享。我已经尝试了很多方法,但我还没有做到。功能是: __inline static void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) { int * clusterMemberCo

我需要并行化一个函数,没有竞争条件,也没有错误的共享。我已经尝试了很多方法,但我还没有做到。功能是:

__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /*
     * Compute the centroid (mean position) of every cluster.
     *
     * dataSetMatrix           : numObjs x numCoords, row-major point coordinates
     * clusterAssignmentCurrent: cluster ID of each of the numObjs points
     * clustersCentroID        : numClusters x numCoords output, accumulated into
     *                           (assumes the caller zero-initialized it -- TODO confirm)
     *
     * Fixed: calloc element size was sizeof(float) for an int array (works only
     * because sizeof(int) == sizeof(float) on common platforms), and the scratch
     * buffer was never freed.
     */
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof *clusterMemberCount);
    if (clusterMemberCount == NULL)
        return;  /* allocation failed; leave centroids untouched */

    // Sum all points: accumulate each point's coordinates into its cluster's row
    // and count the members of each cluster.
    for (int i = 0; i < numObjs; ++i) {
        // which cluster is this point in?
        int activeCluster = clusterAssignmentCurrent[i];

        ++clusterMemberCount[activeCluster];

        for (int j = 0; j < numCoords; ++j)
            clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
    }

    // Divide each coordinate sum by the member count to obtain the mean.
    // Empty clusters are skipped, so no division by zero can occur.
    for (int i = 0; i < numClusters; ++i) {
        if (clusterMemberCount[i] != 0)
            for (int j = 0; j < numCoords; ++j)
                clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
    }

    free(clusterMemberCount);  // was leaked in the original
}
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float *dataSetMatrix, int *clusterAssignmentCurrent, float *clustersCentroID) {
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof(float));
    // 把所有的点加起来
    // 对每一个点
    for (int i = 0; i < numObjs; ++i) { /* …(此处为上方英文代码的译文,原文在抓取时已损坏)… */ }
你知道我怎样才能做到吗


谢谢。

这很直截了当

// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
    // which cluster is it in?
    int activeCluster = clusterAssignmentCurrent[i];

    // update count of members in that cluster
    // (this outer loop runs serially, so the increment is race-free)
    ++clusterMemberCount[activeCluster];

    // sum point coordinates for finding centroid
    // NOTE(review): only the inner loop is parallel, so a thread team is
    // forked/joined once per object -- presumably acceptable when numCoords
    // is large; verify the overhead when numCoords is small (e.g. 3-D points).
    #pragma omp parallel for
    for (int j = 0; j < numCoords; ++j)
        clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
仅当简单解决方案不能产生足够的性能时才执行此操作

另一个循环也很简单

// Each iteration of i touches only clusterMemberCount[i] and row i of
// clustersCentroID, so iterations are fully independent: no race condition.
// False sharing is possible only where adjacent rows share a cache line.
#pragma omp parallel for
for (int i = 0; i < numClusters; ++i) {
    if (clusterMemberCount[i] != 0)
        // for each coordinate
        for (int j = 0; j < numCoords; ++j)
            clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
}
#pragma omp parallel for
for (int i = 0; i < numClusters; ++i) { /* …(同上方英文代码,译文在抓取时被截断)… */ }

同样,数据访问在正确性和(除边缘情况外)错误共享方面都是完全隔离的。

这是非常直截了当的

// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
    // which cluster is it in?
    int activeCluster = clusterAssignmentCurrent[i];

    // update count of members in that cluster
    // (the outer loop is serial here, so this increment is race-free)
    ++clusterMemberCount[activeCluster];

    // sum point coordinates for finding centroid
    // NOTE(review): parallelizing only the inner loop creates a parallel
    // region per object -- verify this pays off for the expected numCoords.
    #pragma omp parallel for
    for (int j = 0; j < numCoords; ++j)
        clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif


__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /*
     * Parallel centroid computation.
     *
     * Race-freedom: cluster IDs are statically partitioned so that each thread
     * owns the disjoint range [clustFrom, clustTo).  A thread only writes
     * clusterMemberCount[] and clustersCentroID[] entries for clusters it
     * owns, so no two threads ever write the same element.  False sharing is
     * limited to the boundaries between adjacent threads' ranges.
     *
     * Assumes clustersCentroID is zero-initialized by the caller -- TODO confirm.
     *
     * Fixed: calloc element size was sizeof(float) for an int array, and a
     * failed allocation was dereferenced unconditionally.
     */
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof *clusterMemberCount);
    if (clusterMemberCount == NULL)
        return;  /* allocation failed; leave centroids untouched */

    #pragma omp parallel
    {
        int nbOfThreads = omp_get_num_threads();
        int thisThread = omp_get_thread_num();
        // Static schedule for the first step: this thread processes only
        // clusters with ID in the [clustFrom, clustTo) range.
        int clustFrom = (thisThread*numClusters)/nbOfThreads;
        int clustTo   = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;

        // Each thread scans all numObjs points but accumulates only those
        // assigned to a cluster it owns.  Skipped entirely if this thread
        // was assigned no cluster (more threads than clusters).
        if (clustTo>clustFrom){
            for (int i = 0; i < numObjs; ++i) {
                // which cluster is this point in?
                int activeCluster = clusterAssignmentCurrent[i];

                if (activeCluster>=clustFrom && activeCluster<clustTo){
                    ++clusterMemberCount[activeCluster];

                    for (int j = 0; j < numCoords; ++j)
                        clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
                }
            }
        }

        // All sums must be complete before any thread starts dividing.
        #pragma omp barrier

        // Divide each coordinate sum by the member count to get the mean.
        // Empty clusters are skipped, so no division by zero can occur.
        #pragma omp for
        for (int i = 0; i < numClusters; ++i) {
            if (clusterMemberCount[i] != 0)
                for (int j = 0; j < numCoords; ++j)
                    clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
        }
    }
    free(clusterMemberCount);
}
仅当简单解决方案不能产生足够的性能时才执行此操作

另一个循环也很简单

// Each iteration of i touches only clusterMemberCount[i] and row i of
// clustersCentroID, so iterations are fully independent: no race condition.
// False sharing can occur only where adjacent rows share a cache line.
#pragma omp parallel for
for (int i = 0; i < numClusters; ++i) {
    if (clusterMemberCount[i] != 0)
        // for each coordinate
        for (int j = 0; j < numCoords; ++j)
            clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
}
#pragma omp parallel for
对于(int i=0;i

同样,数据访问在正确性和错误共享(边缘情况除外)方面都是完全隔离的。

您应该给出
numCoords
numObjs
numClusters
的预期值的数量级,因为并行化的最佳方式取决于此。尤其是
numCoords
是重要的是要了解在坐标上并行化/向量化内部循环是否有意义;例如,您是采用三维坐标还是1000维

#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif


__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /* Accumulate per-cluster coordinate sums in parallel, then turn each sum
     * into a mean.  Every thread owns a contiguous, disjoint slice of cluster
     * IDs, so no two threads ever write the same array element: race-free by
     * construction, with false sharing confined to slice boundaries. */
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof(float));

    #pragma omp parallel
    {
        const int numThreads = omp_get_num_threads();
        const int tid        = omp_get_thread_num();
        /* Slice of cluster IDs owned by this thread: [first, last). */
        const int first = (tid * numClusters) / numThreads;
        const int last  = (tid + 1 == numThreads)
                              ? numClusters
                              : ((tid + 1) * numClusters) / numThreads;

        if (first < last) {
            /* Scan every object; skip those whose cluster belongs to
             * another thread.  (Empty slice => nothing to scan.) */
            for (int obj = 0; obj < numObjs; ++obj) {
                const int c = clusterAssignmentCurrent[obj];
                if (c < first || c >= last)
                    continue;

                clusterMemberCount[c] += 1;
                for (int d = 0; d < numCoords; ++d)
                    clustersCentroID[c * numCoords + d] += dataSetMatrix[obj * numCoords + d];
            }
        }

        /* Every thread must finish summing before any thread divides. */
        #pragma omp barrier

        /* Sum -> mean; clusters with no members are left untouched. */
        #pragma omp for
        for (int c = 0; c < numClusters; ++c) {
            if (clusterMemberCount[c] == 0)
                continue;
            for (int d = 0; d < numCoords; ++d)
                clustersCentroID[c * numCoords + d] /= clusterMemberCount[c];
        }
    }
    free(clusterMemberCount);
}
另一次尝试的缺点是第一个循环中的
if
语句(对性能有害)、静态调度(可能的负载不平衡),但每个线程都会增加
clusterMemberCount
clustersCentroID
的相邻部分,从而限制错误共享的风险

#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif


__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /*
     * Parallel centroid computation.  Cluster IDs are statically partitioned
     * into disjoint per-thread ranges [clustFrom, clustTo); each thread writes
     * only elements of clusters it owns, so there is no race condition and
     * false sharing is confined to range boundaries.
     *
     * Assumes clustersCentroID is zero-initialized by the caller -- TODO confirm.
     *
     * Fixed: calloc element size was sizeof(float) for an int array, and a
     * failed allocation was dereferenced unconditionally.
     */
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof *clusterMemberCount);
    if (clusterMemberCount == NULL)
        return;  /* allocation failed; leave centroids untouched */

    #pragma omp parallel
    {
        int nbOfThreads = omp_get_num_threads();
        int thisThread = omp_get_thread_num();
        // Static schedule: this thread owns clusters in [clustFrom, clustTo).
        int clustFrom = (thisThread*numClusters)/nbOfThreads;
        int clustTo   = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;

        // Each thread scans all points but accumulates only those assigned to
        // a cluster it owns; skipped if this thread owns no cluster.
        if (clustTo>clustFrom){
            for (int i = 0; i < numObjs; ++i) {
                int activeCluster = clusterAssignmentCurrent[i];

                if (activeCluster>=clustFrom && activeCluster<clustTo){
                    ++clusterMemberCount[activeCluster];

                    for (int j = 0; j < numCoords; ++j)
                        clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
                }
            }
        }

        // All sums must be complete before any thread starts dividing.
        #pragma omp barrier

        // Mean = sum / member count; empty clusters are skipped (no div by 0).
        #pragma omp for
        for (int i = 0; i < numClusters; ++i) {
            if (clusterMemberCount[i] != 0)
                for (int j = 0; j < numCoords; ++j)
                    clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
        }
    }
    free(clusterMemberCount);
}
#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif
__内联静态
void calculateClusterCentroIDs(int numCoords、int numObjs、int numClusters、float*dataSetMatrix、int*clusterAssignmentCurrent、float*clustersCentroID){
int*clusterMemberCount=(int*)calloc(numClusters,sizeof(float));
//把所有的分数加起来
//每一点
#pragma-omp并行
{
int nbOfThreads=omp_get_num_threads();
int thisThread=omp_get_thread_num();
//计划第一步:仅处理ID在[from,to]范围内的群集
int-clustFrom=(thisThread*numClusters)/nbOfThreads;
int clustTo=(thisThread+1==nbOfThreads)?numClusters:((thisThread+1)*numClusters)/nbOfThreads;
//每个线程将循环遍历numObjs的所有值,但仅根据activeCluster处理它们
//仅当线程未分配任何群集时,才会跳过循环
如果(clustTo>clustFrom){
对于(int i=0;i如果(activeCluster>=clustFrom&&activeCluster您应该给出
numCoords
numObjs
numClusters
的预期值的数量级,因为并行化的最佳方式取决于此。特别是,
numCoords
对于查看是否在坐标上并行化/向量化内部循环非常重要s是有意义的;比如,你是采用三维坐标还是1000维

另一次尝试的缺点是第一个循环中的
if
语句(对性能有害)、静态调度(可能的负载不平衡),但每个线程都会增加
clusterMemberCount
clustersCentroID
的相邻部分,从而限制错误共享的风险

#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif


__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /*
     * Parallel centroid computation.  Cluster IDs are statically partitioned
     * into disjoint per-thread ranges [clustFrom, clustTo); each thread writes
     * only elements of clusters it owns, so there is no race condition and
     * false sharing is confined to range boundaries.
     *
     * Assumes clustersCentroID is zero-initialized by the caller -- TODO confirm.
     *
     * Fixed: calloc element size was sizeof(float) for an int array, and a
     * failed allocation was dereferenced unconditionally.
     */
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof *clusterMemberCount);
    if (clusterMemberCount == NULL)
        return;  /* allocation failed; leave centroids untouched */

    #pragma omp parallel
    {
        int nbOfThreads = omp_get_num_threads();
        int thisThread = omp_get_thread_num();
        // Static schedule: this thread owns clusters in [clustFrom, clustTo).
        int clustFrom = (thisThread*numClusters)/nbOfThreads;
        int clustTo   = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;

        // Each thread scans all points but accumulates only those assigned to
        // a cluster it owns; skipped if this thread owns no cluster.
        if (clustTo>clustFrom){
            for (int i = 0; i < numObjs; ++i) {
                int activeCluster = clusterAssignmentCurrent[i];

                if (activeCluster>=clustFrom && activeCluster<clustTo){
                    ++clusterMemberCount[activeCluster];

                    for (int j = 0; j < numCoords; ++j)
                        clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
                }
            }
        }

        // All sums must be complete before any thread starts dividing.
        #pragma omp barrier

        // Mean = sum / member count; empty clusters are skipped (no div by 0).
        #pragma omp for
        for (int i = 0; i < numClusters; ++i) {
            if (clusterMemberCount[i] != 0)
                for (int j = 0; j < numCoords; ++j)
                    clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
        }
    }
    free(clusterMemberCount);
}
#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif
__内联静态
void calculateClusterCentroIDs(int numCoords、int numObjs、int numClusters、float*dataSetMatrix、int*clusterAssignmentCurrent、float*clustersCentroID){
int*clusterMemberCount=(int*)calloc(numClusters,sizeof(float));
//把所有的分数加起来
//每一点
#pragma-omp并行
{
int nbOfThreads=omp_get_num_threads();
int thisThread=omp_get_thread_num();
//计划第一步:仅处理ID在[from,to]范围内的群集
int-clustFrom=(thisThread*numClusters)/nbOfThreads;
int clustTo=(thisThread+1==nbOfThreads)?numClusters:((thisThread+1)*numClusters)/nbOfThreads;
//每小时