有没有办法避免这种隐式的MPI_Allreduce（）同步？_Mpi

有没有办法避免这种隐式的MPI_Allreduce（）同步？

mpi

有没有办法避免这种隐式的MPI_Allreduce（）同步？,mpi,Mpi,我正在编写一个MPI程序，它使用了一个自身会进行MPI调用的库。在我的程序中，有一个循环会调用该库中的函数，而被调用的库函数使用了MPI_Allreduce。这里的问题是，在我的程序中，某些进程（rank）可能先于其他进程退出循环，这会导致MPI_Allreduce调用挂起，因为并非所有进程都会再次调用MPI_Allreduce。有没有办法在不修改所用库源代码的情况下，通过编程绕过这个问题？下面是演示执行模式的示例代码 #include <stdio.h> #include <

我正在编写一个 MPI 程序，它使用了一个自身会进行 MPI 调用的库。在我的程序中，有一个循环会调用该库中的函数，而被调用的库函数使用了 `MPI_Allreduce`。

这里的问题是，在我的程序中，某些进程（rank）可能先于其他进程退出循环，这会导致 `MPI_Allreduce` 调用挂起，因为并非所有进程都会再次调用 `MPI_Allreduce`。

有没有办法在不修改所用库源代码的情况下，通过编程绕过这个问题？

下面是演示执行模式的示例代码

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <mpi.h>

#define N_ITEMS 100000
#define ITERATIONS 32 

/*
 * Allocate an array of num_elements floats and fill each slot with a
 * pseudo-random value in [0, 1] drawn from rand().
 *
 * The caller owns the returned buffer and must release it with free().
 * The function never returns NULL: a negative count or an allocation
 * failure terminates the process with a diagnostic on stderr.
 */
float *create_rand_nums(int num_elements) {
  if (num_elements < 0) {
    fprintf(stderr, "create_rand_nums: invalid element count %d\n",
            num_elements);
    abort();
  }

  size_t count = (size_t)num_elements;

  /*
   * Always request at least one element: an allocation of size 0 may
   * legally yield NULL, which would be indistinguishable from a real
   * failure. calloc() also rejects a count * size product that overflows.
   */
  float *rand_nums = calloc(count > 0 ? count : 1, sizeof *rand_nums);
  if (rand_nums == NULL) {
    /* Explicit check instead of assert(): it must survive -DNDEBUG. */
    fprintf(stderr, "create_rand_nums: out of memory (%d elements)\n",
            num_elements);
    abort();
  }

  for (size_t i = 0; i < count; i++) {
    rand_nums[i] = (rand() / (float)RAND_MAX);
  }
  return rand_nums;
}

/*
 * Compute the mean and standard deviation of num_elements_per_proc random
 * numbers per rank, combined across all ranks of MPI_COMM_WORLD.
 *
 * world_rank            - rank of the calling process
 * world_size            - number of processes in MPI_COMM_WORLD
 * num_elements_per_proc - how many random values each rank contributes
 *
 * The function performs two collective operations on MPI_COMM_WORLD
 * (MPI_Allreduce, then MPI_Reduce rooted at rank 0). Every rank of the
 * communicator therefore has to call it the same number of times; a rank
 * that calls it while another rank does not will block in the collective.
 * Rank 0 prints the result.
 */
void reduce_stddev(int world_rank, int world_size, int num_elements_per_proc)
{
  fprintf(stdout, "Calling %s: %d\n", __func__, world_rank);
  fflush(stdout);

  /*
   * Offset the seed by the rank instead of multiplying by it: with a
   * multiplication rank 0 was always seeded with 0, regardless of the time.
   */
  srand((unsigned int)time(NULL) + (unsigned int)world_rank);
  float *rand_nums = create_rand_nums(num_elements_per_proc);

  /* Sum of this rank's samples. */
  float local_sum = 0;
  int i;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sum += rand_nums[i];
  }

  /* Combine the per-rank sums; every rank receives the global total. */
  float global_sum;
  fprintf(stdout, "%d: About to call all reduce\n", world_rank);
  fflush(stdout);
  MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM,
                MPI_COMM_WORLD);
  fprintf(stdout, "%d: done calling all reduce\n", world_rank);
  fflush(stdout);

  /*
   * Total sample count across all ranks. The product is formed in double so
   * that it cannot overflow the way an int * int multiplication could.
   */
  const float total_count =
      (float)((double)num_elements_per_proc * (double)world_size);
  float mean = global_sum / total_count;

  /* Sum of squared deviations from the global mean for this rank. */
  float local_sq_diff = 0;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sq_diff += (rand_nums[i] - mean) * (rand_nums[i] - mean);
  }

  /* Only the root (rank 0) receives the combined squared differences. */
  float global_sq_diff;
  MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0,
             MPI_COMM_WORLD);

  if (world_rank == 0) {
    float stddev = sqrt(global_sq_diff / total_count);
    printf("Mean - %f, Standard deviation = %f\n", mean, stddev);
  }

  free(rand_nums);
}

/*
 * Demo driver: calls reduce_stddev() in a loop and lets the even-numbered
 * ranks leave the loop early.
 *
 * Usage: avg num_elements_per_proc
 *
 * NOTE: the early exit is intentional. It reproduces the problem being
 * demonstrated: once the even ranks have left the loop, the odd ranks keep
 * calling reduce_stddev(), whose MPI_Allreduce on MPI_COMM_WORLD is never
 * matched by the ranks that are already waiting in MPI_Barrier, so a run
 * with more than one rank hangs.
 */
int main(int argc, char* argv[]) {
  if (argc != 2) {
    fprintf(stderr, "Usage: avg num_elements_per_proc\n");
    exit(1);
  }

  /*
   * Parse the element count strictly. atoi() cannot report errors, and a
   * zero or negative count would make reduce_stddev() divide by zero when
   * it computes the mean.
   */
  errno = 0;
  char *end = NULL;
  long parsed = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || errno == ERANGE ||
      parsed <= 0 || parsed > INT_MAX) {
    fprintf(stderr, "num_elements_per_proc must be a positive integer\n");
    exit(1);
  }
  int num_elements_per_proc = (int)parsed;

  MPI_Init(NULL, NULL);

  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  unsigned long long j = 0;

  for(j = 0; j < ITERATIONS; j++)
  {
    /* Function which calls MPI_Allreduce */
    reduce_stddev(world_rank, world_size, num_elements_per_proc);

    /* Simulates some processes leaving the loop early */
    if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
    {
      fprintf(stdout, "%d exiting\n", world_rank);
      fflush(stdout);
      break;
    }
  }

  /* Ranks that left the loop early wait here for the remaining ranks. */
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();

  return EXIT_SUCCESS;
}

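#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#include <math.h>
#include <assert.h>
#define N_ITEMS 100000
#define ITERATIONS 32
float *create_rand_nums(int num_elements) {
float *rand_nums = (float *)malloc(sizeof(float) * num_elements);
assert(rand_nums != NULL);
int i;
for (i = 0; i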

这在 MPI 中一直是个问题——当一个进程（rank）完成时，如何通知所有其他进程？最简单的方法是让每个进程设置一个真/假标志，然后执行一次 allreduce，看看是否有进程已经完成。最后改用下面的代码，似乎可以正常工作：

  /*
   * Revised driver loop: after each iteration all ranks combine a per-rank
   * "finished" flag with a logical-OR MPI_Allreduce. Every rank receives the
   * same result, so as soon as any rank wants to stop, all ranks leave the
   * loop in the same iteration and none is left calling reduce_stddev()
   * alone. Note that this stops every rank; it does not let the remaining
   * ranks keep computing.
   */
  for(j = 0; j < ITERATIONS; j++)
    {
      /* Function which calls MPI_Allreduce */
      reduce_stddev(world_rank, world_size, num_elements_per_proc);

      /* Per-rank flag: becomes 1 when this rank wants to leave the loop */
      int finished = 0;

      /* Simulates some processes leaving the loop early */
      if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
      {
        fprintf(stdout, "%d finished\n", world_rank);
        fflush(stdout);
        finished = 1;
      }

      /* Check to see if anyone has finished */

      /* Receives the logical OR of the finished flags of all ranks */
      int anyfinished;

      MPI_Allreduce(&finished, &anyfinished, 1, MPI_INT, MPI_LOR,
        MPI_COMM_WORLD);

      /* Same value on every rank, so all ranks break together */
      if (anyfinished)
      {
         fprintf(stdout, "%d exiting\n", world_rank);
         break;
      }
   }

for (j = 0; j


好的，我刚刚重读了你的问题，也许是我误解了。你是希望其他进程继续计算吗？
您是否能够将通信器传递到库函数中，或者它是否完全如示例所示-即，在MPI_COMM_WORLD上硬编码调用MPI_Allreduce？@timdykes否，我无法将通信器传递到函数中。不幸的是，该函数是硬编码的。不退出循环，只需使用中性元素进行集合操作（例如，0
ifMPI\u SUM
）。也就是说，调用在引擎盖下执行集体操作但不是所有级别的子程序有意义吗？在意识到我误解了这个问题后，我按照@Gilles的思路思考。然而，reduce（而不是allreduce）可能存在一个问题，它挑出世界排名0的人可能没有参与……是的，其他人应该继续计算。谢谢你。