C 在没有竞争条件或错误共享的情况下,如何使用OpenMP并行化此函数?
我需要并行化一个函数,没有竞争条件,也没有错误的共享。我已经尝试了很多方法,但我还没有做到。功能是:C 在没有竞争条件或错误共享的情况下,如何使用OpenMP并行化此函数?,c,performance,parallel-processing,openmp,C,Performance,Parallel Processing,Openmp,我需要并行化一个函数,没有竞争条件,也没有错误的共享。我已经尝试了很多方法,但我还没有做到。功能是: __inline static void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) { int * clusterMemberCo
/*
 * Compute each cluster's centroid as the arithmetic mean of the points
 * currently assigned to it (serial reference implementation).
 *
 * Preconditions:
 *   - clustersCentroID must be zeroed by the caller: coordinate sums are
 *     accumulated into it with +=.
 *   - clusterAssignmentCurrent[i] is a valid cluster ID in [0, numClusters).
 */
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /* One counter per cluster; fix: size by the element type (int), not sizeof(float). */
    int * clusterMemberCount = (int *) calloc (numClusters, sizeof *clusterMemberCount);
    if (clusterMemberCount == NULL)
        return; /* allocation failed: leave centroids untouched */
    /* Pass 1: accumulate member counts and per-coordinate sums. */
    for (int i = 0; i < numObjs; ++i) {
        /* which cluster is point i in? */
        int activeCluster = clusterAssignmentCurrent[i];
        ++clusterMemberCount[activeCluster];
        for (int j = 0; j < numCoords; ++j)
            clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
    }
    /* Pass 2: divide each coordinate sum by the member count to get the mean.
     * Empty clusters are skipped, so there is no division by zero; their
     * (zero) sums are left as-is. */
    for (int i = 0; i < numClusters; ++i) {
        if (clusterMemberCount[i] != 0)
            for (int j = 0; j < numCoords; ++j)
                clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
    }
    free(clusterMemberCount); /* fix: the original snippet leaked this buffer */
}
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float *dataSetMatrix, int *clusterAssignmentCurrent, float *clustersCentroID) {
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof(float));
//把所有的分数加起来
//每一点
对于(int i=0;i
你知道我怎样才能做到吗
谢谢。这很直截了当
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
int activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++clusterMemberCount[activeCluster];
// sum point coordinates for finding centroid
#pragma omp parallel for
for (int j = 0; j < numCoords; ++j)
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
仅当简单解决方案不能产生足够的性能时才执行此操作
另一个循环也很简单
#pragma omp parallel for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount[i] != 0)
// for each coordinate
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
}
#pragma omp parallel for
对于(int i=0;i
同样,数据访问在正确性和(除边缘情况外)错误共享方面都是完全隔离的。这是非常直截了当的
// sum all points
// for every point
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
int activeCluster = clusterAssignmentCurrent[i];
// update count of members in that cluster
++clusterMemberCount[activeCluster];
// sum point coordinates for finding centroid
#pragma omp parallel for
for (int j = 0; j < numCoords; ++j)
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
#ifdef _OPENMP
#include <omp.h>
#else
/* Serial fallback so the code builds and runs without OpenMP support. */
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
/*
 * Compute cluster centroids in parallel without races or false sharing:
 * each thread statically owns a disjoint, contiguous range of cluster IDs
 * [clustFrom, clustTo) and only writes counters/sums for clusters in that
 * range, so no element of the shared arrays has two writers, and writers
 * touch adjacent regions, limiting false sharing to range boundaries.
 *
 * Preconditions:
 *   - clustersCentroID must be zeroed by the caller (sums use +=).
 *   - clusterAssignmentCurrent[i] is in [0, numClusters) for every point.
 */
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    /* Fix: size by the element type (int), not sizeof(float) — same byte count
     * on common ABIs, but the original expression was type-incorrect. */
    int * clusterMemberCount = (int *) calloc (numClusters, sizeof *clusterMemberCount);
    if (clusterMemberCount == NULL)
        return; /* allocation failed: leave centroids untouched */
    #pragma omp parallel
    {
        int nbOfThreads = omp_get_num_threads();
        int thisThread = omp_get_thread_num();
        /* Static partition of cluster IDs; the last thread absorbs the
         * remainder so the whole [0, numClusters) range is covered. */
        int clustFrom = (thisThread*numClusters)/nbOfThreads;
        int clustTo = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;
        /* Every thread scans all points but processes only those assigned to
         * a cluster it owns; skipped entirely if it owns no cluster. */
        if (clustTo > clustFrom) {
            for (int i = 0; i < numObjs; ++i) {
                int activeCluster = clusterAssignmentCurrent[i];
                if (activeCluster >= clustFrom && activeCluster < clustTo) {
                    ++clusterMemberCount[activeCluster];
                    for (int j = 0; j < numCoords; ++j)
                        clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
                }
            }
        }
        /* All sums must be complete before any thread divides: the omp-for
         * below may hand a thread clusters that a different thread summed. */
        #pragma omp barrier
        #pragma omp for
        for (int i = 0; i < numClusters; ++i) {
            /* Guard skips empty clusters, so no division by zero occurs;
             * their zero sums are left untouched. */
            if (clusterMemberCount[i] != 0)
                for (int j = 0; j < numCoords; ++j)
                    clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
        }
    }
    free(clusterMemberCount);
}
仅当简单解决方案不能产生足够的性能时才执行此操作
另一个循环也很简单
#pragma omp parallel for
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount[i] != 0)
// for each coordinate
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
}
#pragma omp parallel for
对于(int i=0;i
同样,数据访问在正确性和错误共享(边缘情况除外)方面都是完全隔离的。您应该给出
numCoords
、numObjs
和numClusters
的预期值的数量级,因为并行化的最佳方式取决于此。尤其是numCoords
是重要的是要了解在坐标上并行化/向量化内部循环是否有意义;例如,您是采用三维坐标还是1000维
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(float));
// sum all points
// for every point
#pragma omp parallel
{
int nbOfThreads = omp_get_num_threads();
int thisThread = omp_get_thread_num();
// Schedule for the first step : process only cluster with ID in the [from , to[ range
int clustFrom = (thisThread*numClusters)/nbOfThreads;
int clustTo = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;
// Each thread will loop through all values of numObjs but only process them depending on activeCluster
// The loop is skipped only if the thread was assigned no cluster
if (clustTo>clustFrom){
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
int activeCluster = clusterAssignmentCurrent[i];
if (activeCluster>=clustFrom && activeCluster<clustTo){
// update count of members in that cluster
++clusterMemberCount[activeCluster];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
}
}
#pragma omp barrier
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for // straightforward
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount[i] != 0)
// for each coordinate
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i]; /// XXXX will divide by zero here for any empty clusters!
}
}
free(clusterMemberCount);
}
另一次尝试的缺点是第一个循环中的if
语句(对性能有害)、静态调度(可能的负载不平衡),但每个线程都会增加clusterMemberCount
和clustersCentroID
的相邻部分,从而限制错误共享的风险
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
/* Accumulate per-cluster coordinate sums and member counts, then turn the
 * sums into means.  Threads are given disjoint, contiguous cluster-ID
 * ranges, so no element of the shared arrays is written by two threads
 * and false sharing is confined to range boundaries. */
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof(float));
    #pragma omp parallel
    {
        const int nthreads = omp_get_num_threads();
        const int tid = omp_get_thread_num();
        /* This thread owns cluster IDs in [lo, hi); the last thread takes
         * whatever remains after integer division. */
        const int lo = (tid * numClusters) / nthreads;
        int hi;
        if (tid + 1 == nthreads)
            hi = numClusters;
        else
            hi = ((tid + 1) * numClusters) / nthreads;
        if (lo < hi) {
            /* Scan every point; touch only the clusters this thread owns. */
            for (int obj = 0; obj < numObjs; ++obj) {
                const int c = clusterAssignmentCurrent[obj];
                if (c < lo || c >= hi)
                    continue;
                clusterMemberCount[c] += 1;
                const float *point = dataSetMatrix + c * 0 + obj * numCoords;
                float *sum = clustersCentroID + c * numCoords;
                for (int dim = 0; dim < numCoords; ++dim)
                    sum[dim] += point[dim];
            }
        }
        /* Wait until every thread finished summing before dividing:
         * the worksharing loop below may redistribute the clusters. */
        #pragma omp barrier
        #pragma omp for
        for (int c = 0; c < numClusters; ++c) {
            const int members = clusterMemberCount[c];
            if (members != 0) {
                for (int dim = 0; dim < numCoords; ++dim)
                    clustersCentroID[c * numCoords + dim] /= members;
            }
        }
    }
    free(clusterMemberCount);
}
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float *dataSetMatrix, int *clusterAssignmentCurrent, float *clustersCentroID) {
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof(float));
//把所有的分数加起来
//每一点
#pragma-omp并行
{
int nbOfThreads=omp_get_num_threads();
int thisThread=omp_get_thread_num();
//计划第一步:仅处理ID在[from,to]范围内的群集
int-clustFrom=(thisThread*numClusters)/nbOfThreads;
int clustTo=(thisThread+1==nbOfThreads)?numClusters:((thisThread+1)*numClusters)/nbOfThreads;
//每个线程将循环遍历numObjs的所有值,但仅根据activeCluster处理它们
//仅当线程未分配任何群集时,才会跳过循环
如果(clustTo>clustFrom){
对于(int i=0;i 如果(activeCluster>=clustFrom&&activeCluster您应该给出numCoords
、numObjs
和numClusters
的预期值的数量级,因为并行化的最佳方式取决于此。特别是,numCoords
对于查看是否在坐标上并行化/向量化内部循环非常重要s是有意义的;比如,你是采用三维坐标还是1000维
另一次尝试的缺点是第一个循环中的if
语句(对性能有害)、静态调度(可能的负载不平衡),但每个线程都会增加clusterMemberCount
和clustersCentroID
的相邻部分,从而限制错误共享的风险
#ifdef _OPENMP
#include <omp.h>
#else
/* Serial fallback: without OpenMP there is a single thread with ID 0. */
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
/*
 * Parallel centroid recomputation for k-means.  Each thread owns a static,
 * disjoint, contiguous range of cluster IDs and updates only those entries
 * of clusterMemberCount / clustersCentroID, so there are no write races and
 * false sharing is limited to the boundaries between per-thread regions.
 * NOTE(review): coordinate sums use +=; clustersCentroID appears to be
 * expected zeroed on entry — verify against the caller.
 */
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
/* NOTE(review): correct size expression would be sizeof(int) (or
 * sizeof *clusterMemberCount); sizeof(float) matches only by coincidence. */
int * clusterMemberCount = (int *) calloc (numClusters,sizeof(float));
// sum all points
// for every point
#pragma omp parallel
{
int nbOfThreads = omp_get_num_threads();
int thisThread = omp_get_thread_num();
// Schedule for the first step : process only cluster with ID in the [from , to[ range
int clustFrom = (thisThread*numClusters)/nbOfThreads;
// The last thread covers the remainder up to numClusters.
int clustTo = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;
// Each thread will loop through all values of numObjs but only process them depending on activeCluster
// The loop is skipped only if the thread was assigned no cluster
if (clustTo>clustFrom){
for (int i = 0; i < numObjs; ++i) {
// which cluster is it in?
int activeCluster = clusterAssignmentCurrent[i];
if (activeCluster>=clustFrom && activeCluster<clustTo){
// update count of members in that cluster
++clusterMemberCount[activeCluster];
// sum point coordinates for finding centroid
for (int j = 0; j < numCoords; ++j)
clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
}
}
}
// Synchronize: every thread's sums must be final before the division phase,
// since the omp-for below may assign clusters across thread boundaries.
#pragma omp barrier
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for // straightforward
for (int i = 0; i < numClusters; ++i) {
if (clusterMemberCount[i] != 0) // empty clusters are skipped, so no divide-by-zero happens here
// for each coordinate
for (int j = 0; j < numCoords; ++j)
clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
}
}
free(clusterMemberCount);
}
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
__inline static
void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float *dataSetMatrix, int *clusterAssignmentCurrent, float *clustersCentroID) {
    int *clusterMemberCount = (int *) calloc(numClusters, sizeof(float));
//把所有的分数加起来
//每一点
#pragma-omp并行
{
int nbOfThreads=omp_get_num_threads();
int thisThread=omp_get_thread_num();
//计划第一步:仅处理ID在[from,to]范围内的群集
int-clustFrom=(thisThread*numClusters)/nbOfThreads;
int clustTo=(thisThread+1==nbOfThreads)?numClusters:((thisThread+1)*numClusters)/nbOfThreads;
//每小时