C:多个节点上的 MPI_Bcast 错误

C 多个节点上的 MPI_Bcast 错误,c,linux,segmentation-fault,mpi,C,Linux,Segmentation Fault,Mpi,背景:我正在编写基于项目的I/O系统调用的MPI版本 代码在单个节点上的多个处理器上运行时不会出错 但是,在多个节点上运行会导致段错误。使用 2 个进程(每个节点 1 个进程)运行时的错误消息如下所示: $ qsub test.sub $ cat test.e291810 0: pasc_open(./libSDL.so, 0, 0) 1: pasc_open(./libSDL.so, 0, 0) 1: mptr[0]=0 mptr[len-1]=0 1: MPI_Bcast(mptr=eed11000

背景:我正在编写基于项目的I/O系统调用的MPI版本

代码在单个节点上的多个处理器上运行时不会出错

但是,在多个节点上运行会导致段错误。使用 2 个进程(每个节点 1 个进程)运行时的错误消息如下所示:

$ qsub test.sub
$ cat test.e291810
0: pasc_open(./libSDL.so, 0, 0)
1: pasc_open(./libSDL.so, 0, 0)
1: mptr[0]=0 mptr[len-1]=0
1: MPI_Bcast(mptr=eed11000, len=435104, MPI_BYTE, 0, MPI_COMM_WORLD)
0: mptr[0]=127 mptr[len-1]=0
0: MPI_Bcast(mptr=eeb11000, len=435104, MPI_BYTE, 0, MPI_COMM_WORLD)
_pmiu_daemon(SIGCHLD): [NID 00632] [c3-0c0s14n0] [Sun May 18 13:10:30 2014] PE RANK 0 exit signal Segmentation fault
[NID 00632] 2014-05-18 13:10:30 Apid 8283706: initiated application termination
/* Counter behind the descriptors handed out by next_fd(); starts at BASE_FD. */
static int nextfd = BASE_FD;

/* Return the current counter value, then advance the counter by one. */
static inline int next_fd(void)
{
    int handed_out = nextfd;

    nextfd = handed_out + 1;
    return handed_out;
}

int pasc_open(const char *pathname, int flags, mode_t mode)
{
    int rank;
    int err;

    if(!init)
        return ((pasc_open_fp) def.open)(pathname, flags, mode);

    if(MPI_Comm_rank(MPI_COMM_WORLD, &rank) != MPI_SUCCESS)
        return -1;
    dprintf("%d: %s(%s, %x, %x)\n", rank, __FUNCTION__, pathname, flags, mode);

    /* Handle just read-only access for now. */
    if(flags == O_RDONLY || flags == (O_RDONLY | O_CLOEXEC)) {
        int fd, len, xlen, mptr_is_null;
        void *mptr;
        struct mpi_buf { int len, en; } buf;
        struct file_entry *file;

        if(rank == 0) {
            len = -1;
            fd = ((pasc_open_fp) def.open)(pathname, flags, mode);
            /* Call stat to get file size and check for errors */
            if(fd >= 0) {
                struct stat st;
                if(fstat(fd, &st) >= 0)
                    len = st.st_size;
                else
                    ((pasc_close_fp) def.close)(fd);
            }
            /* Record them */
            buf.len = len;
            buf.en = errno;
        }
        /* Propagate file size and errno */
        if(MPI_Bcast(&buf, 2, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
            return -1;
        len = buf.len;
        if(len < 0) {
            dprintf("error opening file, len < 0");
            return -1;
        }
        /* Get the page-aligned size */
        xlen = page_extend(len);
        /* `mmap` the file into memory */
        if(rank == 0) {
            mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ, MAP_PRIVATE,
                    fd, 0);
        } else {
            fd = next_fd();
            mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
        }
        ((pasc_lseek_fp) def.lseek)(fd, 0, SEEK_SET);
        /* Ensure success on all aux. processes */
        if(rank != 0)
            mptr_is_null = !mptr;
        MPI_Allreduce(MPI_IN_PLACE, &mptr_is_null, 1, MPI_INT, MPI_LAND,
                MPI_COMM_WORLD);
        if(mptr_is_null) {
            if(mptr)
                ((pasc_munmap_fp) def.munmap)(mptr, xlen);
            dprintf("%d: error: mmap/malloc error\n", rank);
            return -1;
        }
        dprintf("%d: mptr[0]=%d mptr[len-1]=%d\n", rank, ((char*)mptr)[0], ((char*)mptr)[len-1]);
        /* Propagate file contents */
        dprintf("%d: MPI_Bcast(mptr=%x, len=%d, MPI_BYTE, 0, MPI_COMM_WORLD)\n",
        rank, mptr, len);
        if(MPI_Bcast(mptr, len, MPI_BYTE, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
            return -1;
        if(rank != 0)
            fd = next_fd();
        /* Register the file in the linked list */
        file = malloc(sizeof(struct file_entry));
        file->fd = fd;
        file->refcnt = 1;
        strncpy(file->fn, pathname, PASC_FNMAX);
        file->mptr = mptr;
        file->len = len;
        file->xlen = xlen;
        file->offset = 0;
        /* Reverse stack */
        file->next = open_files;
        open_files = file;
        return fd;

    }
    /* Fall back to independent access */
    return ((pasc_open_fp) def.open)(pathname, flags, mode);
}
发生错误的函数如下所示:

$ qsub test.sub
$ cat test.e291810
0: pasc_open(./libSDL.so, 0, 0)
1: pasc_open(./libSDL.so, 0, 0)
1: mptr[0]=0 mptr[len-1]=0
1: MPI_Bcast(mptr=eed11000, len=435104, MPI_BYTE, 0, MPI_COMM_WORLD)
0: mptr[0]=127 mptr[len-1]=0
0: MPI_Bcast(mptr=eeb11000, len=435104, MPI_BYTE, 0, MPI_COMM_WORLD)
_pmiu_daemon(SIGCHLD): [NID 00632] [c3-0c0s14n0] [Sun May 18 13:10:30 2014] PE RANK 0 exit signal Segmentation fault
[NID 00632] 2014-05-18 13:10:30 Apid 8283706: initiated application termination
/* Counter behind the descriptors handed out by next_fd(); starts at BASE_FD. */
static int nextfd = BASE_FD;

/* Return the current counter value, then advance the counter by one. */
static inline int next_fd(void)
{
    int handed_out = nextfd;

    nextfd = handed_out + 1;
    return handed_out;
}

int pasc_open(const char *pathname, int flags, mode_t mode)
{
    int rank;
    int err;

    if(!init)
        return ((pasc_open_fp) def.open)(pathname, flags, mode);

    if(MPI_Comm_rank(MPI_COMM_WORLD, &rank) != MPI_SUCCESS)
        return -1;
    dprintf("%d: %s(%s, %x, %x)\n", rank, __FUNCTION__, pathname, flags, mode);

    /* Handle just read-only access for now. */
    if(flags == O_RDONLY || flags == (O_RDONLY | O_CLOEXEC)) {
        int fd, len, xlen, mptr_is_null;
        void *mptr;
        struct mpi_buf { int len, en; } buf;
        struct file_entry *file;

        if(rank == 0) {
            len = -1;
            fd = ((pasc_open_fp) def.open)(pathname, flags, mode);
            /* Call stat to get file size and check for errors */
            if(fd >= 0) {
                struct stat st;
                if(fstat(fd, &st) >= 0)
                    len = st.st_size;
                else
                    ((pasc_close_fp) def.close)(fd);
            }
            /* Record them */
            buf.len = len;
            buf.en = errno;
        }
        /* Propagate file size and errno */
        if(MPI_Bcast(&buf, 2, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
            return -1;
        len = buf.len;
        if(len < 0) {
            dprintf("error opening file, len < 0");
            return -1;
        }
        /* Get the page-aligned size */
        xlen = page_extend(len);
        /* `mmap` the file into memory */
        if(rank == 0) {
            mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ, MAP_PRIVATE,
                    fd, 0);
        } else {
            fd = next_fd();
            mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
        }
        ((pasc_lseek_fp) def.lseek)(fd, 0, SEEK_SET);
        /* Ensure success on all aux. processes */
        if(rank != 0)
            mptr_is_null = !mptr;
        MPI_Allreduce(MPI_IN_PLACE, &mptr_is_null, 1, MPI_INT, MPI_LAND,
                MPI_COMM_WORLD);
        if(mptr_is_null) {
            if(mptr)
                ((pasc_munmap_fp) def.munmap)(mptr, xlen);
            dprintf("%d: error: mmap/malloc error\n", rank);
            return -1;
        }
        dprintf("%d: mptr[0]=%d mptr[len-1]=%d\n", rank, ((char*)mptr)[0], ((char*)mptr)[len-1]);
        /* Propagate file contents */
        dprintf("%d: MPI_Bcast(mptr=%x, len=%d, MPI_BYTE, 0, MPI_COMM_WORLD)\n",
        rank, mptr, len);
        if(MPI_Bcast(mptr, len, MPI_BYTE, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
            return -1;
        if(rank != 0)
            fd = next_fd();
        /* Register the file in the linked list */
        file = malloc(sizeof(struct file_entry));
        file->fd = fd;
        file->refcnt = 1;
        strncpy(file->fn, pathname, PASC_FNMAX);
        file->mptr = mptr;
        file->len = len;
        file->xlen = xlen;
        file->offset = 0;
        /* Reverse stack */
        file->next = open_files;
        open_files = file;
        return fd;

    }
    /* Fall back to independent access */
    return ((pasc_open_fp) def.open)(pathname, flags, mode);
}
static int nextfd = BASE_FD;
#define next_fd() (nextfd++)

int pasc_open(const char *pathname, int flags, mode_t mode)
{
    int rank;
    int err;

    if(!init)
        return ((pasc_open_fp) def.open)(pathname, flags, mode);

    if(MPI_Comm_rank(MPI_COMM_WORLD, &rank) != MPI_SUCCESS)
        return -1;
    dprintf("%d: %s(%s, %x, %x)\n", rank, __FUNCTION__, pathname, flags, mode);

    /* Handle just read-only access for now. */
    if(flags == O_RDONLY || flags == (O_RDONLY | O_CLOEXEC)) {
        int fd, len, xlen, mptr_is_null;
        void *mptr;
        struct mpi_buf { int len, en; } buf;
        struct file_entry *file;

        if(rank == 0) {
            len = -1;
            fd = ((pasc_open_fp) def.open)(pathname, flags, mode);
            /* Call stat to get file size and check for errors */
            if(fd >= 0) {
                struct stat st;
                if(fstat(fd, &st) >= 0)
                    len = st.st_size;
                else
                    ((pasc_close_fp) def.close)(fd);
            }
            /* Record them */
            buf.len = len;
            buf.en = errno;
        }
        /* Propagate file size and errno */
        if(MPI_Bcast(&buf, 2, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
            return -1;
        len = buf.len;
        if(len < 0) {
            dprintf("error opening file, len < 0");
            return -1;
        }
        /* Get the page-aligned size */
        xlen = page_extend(len);
        /* `mmap` the file into memory */
        if(rank == 0) {
            mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ, MAP_PRIVATE,
                    fd, 0);
        } else {
            fd = next_fd();
            mptr = ((pasc_mmap_fp) def.mmap)(0, xlen, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
        }
        ((pasc_lseek_fp) def.lseek)(fd, 0, SEEK_SET);
        /* Ensure success on all aux. processes */
        if(rank != 0)
            mptr_is_null = !mptr;
        MPI_Allreduce(MPI_IN_PLACE, &mptr_is_null, 1, MPI_INT, MPI_LAND,
                MPI_COMM_WORLD);
        if(mptr_is_null) {
            if(mptr)
                ((pasc_munmap_fp) def.munmap)(mptr, xlen);
            dprintf("%d: error: mmap/malloc error\n", rank);
            return -1;
        }
        dprintf("%d: mptr[0]=%d mptr[len-1]=%d\n", rank, ((char*)mptr)[0], ((char*)mptr)[len-1]);
        /* Propagate file contents */
        dprintf("%d: MPI_Bcast(mptr=%x, len=%d, MPI_BYTE, 0, MPI_COMM_WORLD)\n",
        rank, mptr, len);
        if(MPI_Bcast(mptr, len, MPI_BYTE, 0, MPI_COMM_WORLD) != MPI_SUCCESS)
            return -1;
        if(rank != 0)
            fd = next_fd();
        /* Register the file in the linked list */
        file = malloc(sizeof(struct file_entry));
        file->fd = fd;
        file->refcnt = 1;
        strncpy(file->fn, pathname, PASC_FNMAX);
        file->mptr = mptr;
        file->len = len;
        file->xlen = xlen;
        file->offset = 0;
        /* Reverse stack */
        file->next = open_files;
        open_files = file;
        return fd;

    }
    /* Fall back to independent access */
    return ((pasc_open_fp) def.open)(pathname, flags, mode);
}
该错误发生在最后一次 `MPI_Bcast` 调用中。我不知道为什么会发生这种情况:无论是被复制的源内存,还是复制到的目标内存,我都可以正常解引用。

我正在运行SUSE Linux x86_64的定制Cray XC30机器上使用MPICH

谢谢



编辑:我尝试用一对 `MPI_Send`/`MPI_Recv` 来替换 `MPI_Bcast` 调用,结果是一样的。

Cray MPI实现可能出于性能原因而发挥了一些魔力。在不了解内部情况的情况下,大部分答案都是猜测

节点内通信可能不使用网络堆栈,而是依赖某种共享内存通信(这也解释了为什么单节点运行不会出错)。当您尝试通过网络堆栈发送 `mmap` 映射的缓冲区时,某个环节出了故障——DMA 引擎(这里纯属我的猜测)无法处理这种情况。

您可以尝试对 `mmap` 映射的缓冲区进行页面锁定——也许 `mlock` 可以解决问题。如果不行,就把数据复制到用 `malloc` 分配的缓冲区中。

在其他进程中调用 `fd = next_fd()` 时,是否可能最终处于争用状态?我不这么认为,因为 `fd` 是线程相关的,而 `next_fd` 被定义为全局变量的简单增量。一旦进程产生,全局变量在每个进程中都是相同的(在那里有自己唯一的内存,所有的东西都被复制了!)。如果您增加它,并且不与所有其他进程通信,那么除了秩 0 之外的每个进程都将具有相同的 `fd`(当然,`fd` 的内存是进程唯一的)。文件描述符将增加,并为每个线程分别维护——希望我对代码所做的编辑澄清了这一点。因此,秩 0 将具有实际系统调用返回的“real” `fd`,而其他秩将具有我生成的 `fd`。由于 `fd` 是特定于线程的,并且操作文件描述符的系统调用都被覆盖,这还会是一个问题吗?如果将 `mmap` 映射的数据(`mptr`)复制到用 `malloc` 分配的缓冲区,会发生什么情况,你仍然有 segfault 吗?
很遗憾,`mlock` 不起作用——我现在不得不使用 `malloc`/`memcpy` 的解决方案。Cray uGNI 可能无法对这样的缓冲区执行 RDMA,在这种情况下,它应该在内部进行复制。