C: Segmentation fault when using MPI_Isend


The purpose of my program is to calculate the electrostatic potential between an inner and an outer conductor by splitting the domain into a grid, and then into grid slices. Each processor gets one slice and runs the calculation on it. I use MPI_Isend and MPI_Irecv to send data between the processors. While testing the code I ran into a segmentation fault:

[physnode5:81440] *** Process received signal ***
[physnode5:81440] Signal: Segmentation fault (11)
[physnode5:81440] Signal code: Address not mapped (1)
[physnode5:81440] Failing at address: 0x58
[physnode5:81440] [ 0] /lib64/libpthread.so.0(+0xf5d0)[0x2ab8069df5d0]
[physnode5:81440] [ 1] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(ompi_request_default_wait+0xd)[0x2ab8066495ed]
[physnode5:81440] [ 2] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(MPI_Wait+0x5d)[0x2ab80667a00d]
[physnode5:81440] [ 3] ./mpi_tezt.exe[0x400ffc]
[physnode5:81440] [ 4] /lib64/libc.so.6(__libc_start_main+0xf5)[0x2ab806c0e3d5]
[physnode5:81440] [ 5] ./mpi_tezt.exe[0x4009b9]
[physnode5:81440] *** End of error message ***
It happens when this bit of code executes. Note that I have ssh'ed into the cluster. The file name is mpi_tezt.exe (yes, I misspelled it). I have checked that the arrays I want to send are allocated correctly, and that the sends and receives do not touch data that doesn't exist (i.e. nothing outside the bounds of the arrays). My MPI_Isend and MPI_Irecv code is as follows:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
  /*MPI Specific Variables*/
  int my_size, my_rank, up, down;
  MPI_Request reqU, reqD, sreqU, sreqD;
  MPI_Status rUstatus, rDstatus, sUstatus, sDstatus;

   /*Physical Dimensions*/
  double Linner = 5.0;/*mm*/
  double Rinner = 1.0;/*mm*/
  double phi_0 = 1000.0;/*V*/

  /*Other Variables*/
  int grid_size = 100;
  int slice;
  int x,y;
  double grid_res_y = 0.2;
  double grid_res_x = 0.1;
  int xboundary, yboundary;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &my_size);

  /*Determining neighbours*/
  if (my_rank != 0) /*if statemets used to stop highest and lowest rank neighbours arent outside 0 - my_size-1 range of ranks*/
    {
      up = my_rank-1;
    }
  else
    {
      up = 0;
    }

  if(my_rank != my_size-1)
    {
      down = my_rank+1;
    }
  else
    {
      down = my_size-1;
    }

  /*cross-check: presumed my_size is a factor of gridsize else there are odd sized slices and this is not coded for*/
  if (grid_size%my_size != 0)
    {
      printf("ERROR - number of procs =  %d, this is not a factor of grid_size %d\n", my_size, grid_size);
      exit(0);
    }

  /*Set Up Distributed Data Approach*/
  slice = grid_size/my_size;

  yboundary = Linner/grid_res_y; /*y grid index of inner conductor wall*/ 
  xboundary = Rinner/grid_res_x; /*x grid and individual array index of inner conductor wall*/


  double phi[slice+2][grid_size]; /*extra 2 rows to allow for halo data*/

  for (y=0; y < slice+2; y++)
    {
      for (x=0; x < grid_size; x++)
        { 
          phi[y][x] = 0.0;
        }
    }

  if(my_rank == 0) /*Boundary Containing rank does 2 loops. One over part with inner conductor and one over part without inner conductor*/
    {
      for(y=0; y < slice+1; y++)
        {
          for(x=xboundary; x < grid_size; x++)
            {
              phi[y][x] = phi_0;
            }
        }   
    }


  if (my_rank < my_size-1)
    {
      /*send top most strip up one node to be recieved as bottom halo*/
      MPI_Isend(&phi[1][0], grid_size  , MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &sreqU);  
      /*recv top halo from up one node*/
      MPI_Irecv(&phi[slice+1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqU);
    }

  if (my_rank > 0)
    {
      /*recv top halo from down one node*/
      MPI_Irecv(&phi[0][0], grid_size , MPI_DOUBLE, up, 2, MPI_COMM_WORLD, &reqD);
      /*send bottom most strip down one node to be recieved as top halo*/
      MPI_Isend(&phi[slice][0], grid_size , MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &sreqD);
    }

  if (my_rank<my_size-1)
    {
      /*Wait for send to down one rank to complete*/
      MPI_Wait(&sreqD, &sDstatus);
      /*Wait for recieve from up one rank to complete*/
      MPI_Wait(&reqD, &rDstatus);
    }

  if (my_rank>0)
    {
      /*Wait for send to up down one rank to complete*/
      MPI_Wait(&sreqU, &sUstatus);
      /*Wait for recieve from down one rank to complete*/
      MPI_Wait(&reqU, &rUstatus);
    }


  MPI_Finalize();

  return 0;
}
I have been testing this on 2 processors (ranks 0 and 1) so far, but I hope to scale it up to more later.

Any ideas where the error might be?

You are getting the error in your first MPI_Wait (for rank 0). This is step 7 in the sample code below.

Using mpirun -np 2 ./whatever:

It appears that sreqD is not being set correctly. It is set at step 5 by rank 1.

But step 7 is executed by rank 0, which does not set sreqD.

So, you need to adjust your if statements to correctly match up which rank does which MPI_Wait, etc. A minimal sketch of that pairing follows.

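Concretely, the waits have to sit under the same conditions as the requests they refer to. Here is a minimal sketch of that pairing, reusing the request and status variables from the posted code (only the wait block, not the whole program):

/* Wait only on the requests this rank actually started:
   a rank with my_rank < my_size-1 issued sreqU/reqU (exchange with `down`),
   a rank with my_rank > 0 issued reqD/sreqD (exchange with `up`). */
if (my_rank < my_size - 1) {
    MPI_Wait(&sreqU, &sUstatus);  /* send of phi[1] to `down` has completed */
    MPI_Wait(&reqU, &rUstatus);   /* receive of phi[slice+1] from `down` has completed */
}

if (my_rank > 0) {
    MPI_Wait(&reqD, &rDstatus);   /* receive of phi[0] from `up` has completed */
    MPI_Wait(&sreqD, &sDstatus);  /* send of phi[slice] to `up` has completed */
}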

Below is your code with some debug printf statements added:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

int
main(int argc, char *argv[])
{
    /* MPI Specific Variables */
    int my_size,
     my_rank,
     up,
     down;
    MPI_Request reqU,
     reqD,
     sreqU,
     sreqD;
    MPI_Status rUstatus,
     rDstatus,
     sUstatus,
     sDstatus;

    /* Physical Dimensions */
    double Linner = 5.0;                /* mm */
    double Rinner = 1.0;                /* mm */
    double phi_0 = 1000.0;

    /*V*/
        /* Other Variables */
    int grid_size = 100;
    int slice;
    int x,
     y;
    double grid_res_y = 0.2;
    double grid_res_x = 0.1;

    int xboundary,
     yboundary;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &my_size);

    /* Determining neighbours */
    /* if statemets used to stop highest and lowest rank neighbours arent
    outside 0 - my_size-1 range of ranks */
    if (my_rank != 0) {
        up = my_rank - 1;
    }
    else {
        up = 0;
    }

    if (my_rank != my_size - 1) {
        down = my_rank + 1;
    }
    else {
        down = my_size - 1;
    }

    printf("my_rank=%d my_size=%d up=%d down=%d\n",my_rank,my_size,up,down);

    /* cross-check: presumed my_size is a factor of gridsize else there are
    odd sized slices and this is not coded for */
    if (grid_size % my_size != 0) {
        printf("ERROR - number of procs =  %d, this is not a factor of grid_size %d\n", my_size, grid_size);
        exit(0);
    }

    /* Set Up Distributed Data Approach */
    slice = grid_size / my_size;

    /* y grid index of inner conductor wall */
    yboundary = Linner / grid_res_y;
    /* x grid and individual array index of inner conductor wall */
    xboundary = Rinner / grid_res_x;

    if (my_rank == 0) {
        printf("Linner=%g grid_res_y=%g yboundary=%d\n",
            Linner,grid_res_y,yboundary);
        printf("Rinner=%g grid_res_x=%g xboundary=%d\n",
            Rinner,grid_res_x,xboundary);
        printf("slice=%d grid_size=%d phi=%ld\n",
            slice,grid_size,sizeof(double) * (slice + 2) * grid_size);
    }

    /* extra 2 rows to allow for halo data */
    double phi[slice + 2][grid_size];

    for (y = 0; y < slice + 2; y++) {
        for (x = 0; x < grid_size; x++) {
            phi[y][x] = 0.0;
        }
    }

    /* Boundary Containing rank does 2 loops. One over part with inner
    conductor and one over part without inner conductor */
    if (my_rank == 0) {
        for (y = 0; y < slice + 1; y++) {
            for (x = xboundary; x < grid_size; x++) {
                phi[y][x] = phi_0;
            }
        }
    }

    if (my_rank < my_size - 1) {
        /* send top most strip up one node to be recieved as bottom halo */
        printf("1: my_rank=%d MPI_Isend\n",my_rank);
        MPI_Isend(&phi[1][0], grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD,
            &sreqU);

        /* recv top halo from up one node */
        printf("2: my_rank=%d MPI_Irecv\n",my_rank);
        MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2,
            MPI_COMM_WORLD, &reqU);

        printf("3: my_rank=%d\n",my_rank);
    }

    if (my_rank > 0) {
        /* recv top halo from down one node */
        printf("4: my_rank=%d MPI_Irecv\n",my_rank);
        MPI_Irecv(&phi[0][0], grid_size, MPI_DOUBLE, up, 2, MPI_COMM_WORLD,
            &reqD);

        /* send bottom most strip down one node to be recieved as top halo */
        printf("5: my_rank=%d MPI_Isend\n",my_rank);
        MPI_Isend(&phi[slice][0], grid_size, MPI_DOUBLE, up, 1, MPI_COMM_WORLD,
            &sreqD);

        printf("6: my_rank=%d\n",my_rank);
    }

    if (my_rank < my_size - 1) {
        /* Wait for send to down one rank to complete */
        printf("7: my_rank=%d\n",my_rank);
        MPI_Wait(&sreqD, &sDstatus);
        printf("8: my_rank=%d\n",my_rank);

        /* Wait for recieve from up one rank to complete */
        printf("9: my_rank=%d\n",my_rank);
        MPI_Wait(&reqD, &rDstatus);
        printf("10: my_rank=%d\n",my_rank);
    }

    if (my_rank > 0) {
        /* Wait for send to up down one rank to complete */
        printf("11: my_rank=%d\n",my_rank);
        MPI_Wait(&sreqU, &sUstatus);
        printf("12: my_rank=%d\n",my_rank);

        /* Wait for recieve from down one rank to complete */
        printf("12: my_rank=%d\n",my_rank);
        MPI_Wait(&reqU, &rUstatus);
        printf("13: my_rank=%d\n",my_rank);
    }

    MPI_Finalize();

    return 0;
}

By the way, that is WAAAY too much code. Have you tried running it under a debugger? It will usually drop you right where the segmentation fault occurred and give you a full snapshot of the program state at that point. If for some reason you can't do that, you can set ulimit -c to allow core dumps and then debug the core file, which will likewise show where the error occurred.

(From the comments: check that slice and grid_size come out to sensible values, and even after that, look for any array references that could stray out of bounds; note that lines like if (my_rank = A_proc) will not do what you want them to. Try to focus the code you post on the specific question you are asking, even if you have to cut it up to illustrate. The poster replied: "@Bolloir I added print statements before each wait statement. The one before the wait on sreqD never prints. Could it be a problem with MPI_Wait?")

As a matter of style, you could declare an array of 4 requests at the start of each iteration, all set to MPI_REQUEST_NULL, and then finish with a single MPI_Waitall(4, ...). Or you could set up and down to MPI_PROC_NULL on the first and last ranks and always call MPI_Isend() and MPI_Irecv().
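Here is a rough sketch of that second suggestion, reusing my_rank, my_size, phi, slice and grid_size from the posted code. One detail worth flagging: the tags below are chosen so that matching send/receive pairs agree, which is not the case in the posted code:

/* Sketch: make the missing neighbours MPI_PROC_NULL so every rank can post
   all four halo-exchange operations unconditionally. Communication with
   MPI_PROC_NULL completes immediately, and MPI_Waitall ignores any request
   that is still MPI_REQUEST_NULL. */
int up   = (my_rank > 0)           ? my_rank - 1 : MPI_PROC_NULL;
int down = (my_rank < my_size - 1) ? my_rank + 1 : MPI_PROC_NULL;

MPI_Request reqs[4] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL,
                        MPI_REQUEST_NULL, MPI_REQUEST_NULL };
MPI_Status stats[4];

/* send first interior row to `down`, where it arrives as the top halo phi[0] */
MPI_Isend(&phi[1][0],         grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &reqs[0]);
/* receive our bottom halo phi[slice+1] from `down` */
MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqs[1]);
/* receive our top halo phi[0] from `up` */
MPI_Irecv(&phi[0][0],         grid_size, MPI_DOUBLE, up,   1, MPI_COMM_WORLD, &reqs[2]);
/* send last interior row to `up`, where it arrives as the bottom halo */
MPI_Isend(&phi[slice][0],     grid_size, MPI_DOUBLE, up,   2, MPI_COMM_WORLD, &reqs[3]);

MPI_Waitall(4, reqs, stats);

The output of the instrumented run on two ranks looks like this: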
my_rank=0 my_size=2 up=0 down=1
Linner=5 grid_res_y=0.2 yboundary=25
Rinner=1 grid_res_x=0.1 xboundary=10
slice=50 grid_size=100 phi=41600
1: my_rank=0 MPI_Isend
2: my_rank=0 MPI_Irecv
3: my_rank=0
7: my_rank=0
my_rank=1 my_size=2 up=0 down=1
4: my_rank=1 MPI_Irecv
5: my_rank=1 MPI_Isend
6: my_rank=1
11: my_rank=1
[manderly:230404] *** Process received signal ***
[manderly:230403] *** Process received signal ***
[manderly:230403] Signal: Segmentation fault (11)
[manderly:230403] Signal code: Address not mapped (1)
[manderly:230403] Failing at address: 0x58
[manderly:230404] Signal: Segmentation fault (11)
[manderly:230404] Signal code: Address not mapped (1)
[manderly:230404] Failing at address: 0x58
[manderly:230403] [ 0] [manderly:230404] [ 0] /lib64/libpthread.so.0(+0x121c0)/lib64/libpthread.so.0(+0x121c0)[0x7fa5478341c0]
[0x7fa0ebe951c0]
[manderly:230404] [ 1] [manderly:230403] [ 1] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa0ec0e9a81]
[manderly:230404] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa547a88a81]
[manderly:230403] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa0ec12c350]
[manderly:230404] [ 3] ./fix2[0x400f93]
[manderly:230404] [ 4] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa547acb350]
[manderly:230403] [ 3] ./fix2[0x400ef7]
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa0ebaedfea]
[manderly:230404] [ 5] ./fix2[0x40081a[manderly:230403] [ 4] ]
[manderly:230404] *** End of error message ***
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa54748cfea]
[manderly:230403] [ 5] ./fix2[0x40081a]
[manderly:230403] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node manderly exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------