Scatter and Gather of 2D arrays using C MPI

So, I am trying to set up a communication routine in which I use `MPI_Gatherv` to transfer some 2D arrays from multiple processors back to root.

I have been trying to implement this with scatterv and gatherv, starting with a simple example: a 4x4 array that I am trying to split into four 2x2 arrays and then scatter.

I have been trying to use the Scatterv function to divide my 4x4 among 4 processors. I have got to the point where the root processor can print out its 2x2 array, but as soon as the next processor tries to print out its local data I get a segfault. If I don't try to print the local arrays, I get no errors at all. Here is my code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>

    int main (int argc, char** argv) {

    int rank, numprocs;

    MPI_Init(&argc, &argv);

    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int ndim = 4;
    int **ga = NULL;

    // Create global array. size is 4x4, Only for Root.
    if (rank == 0) {

            ga = malloc(ndim*sizeof(int*));

            for (int i=0; i < ndim; i++)
                    ga[i] = malloc(ndim*sizeof(int));

            for (int i=0; i < ndim; i++) {

                    for (int j=0; j < ndim; j++) {

                            if (i > 0) {
                                    ga[i][j] = i*ndim + j;

                            } else {
                                    ga[i][j] =i+j;

                            }
                    }
            }
    }

    //print global array.
    if (rank == 0) {

            printf("Send array:\n");
            for (int i=0; i < ndim; i++) {

                    for (int j=0; j < ndim; j++) {

                            printf(" %d ", ga[i][j]);
                    }

                    printf("\n");
            }
    }

    //Create local arrays on all procs.
    int **la = NULL;
    //local array size is 2x2.
    int ndim_loc = ((ndim*ndim)/numprocs)/2;

    la = (int**)malloc(ndim_loc*sizeof(int*));
    for (int i=0; i< ndim_loc; i++)
            la[i] = (int*)malloc(ndim_loc*sizeof(int));

     if (rank == 0) {

            printf("recieve array:\n");

            for (int i=0; i <ndim_loc; i++) {

                    for(int j=0; j<ndim_loc; j++) {

                            la[i][j] = 0;

                            printf(" %d ", la[i][j]);
                    }

                    printf("\n");
            }
    }

    // global size  
    int sizes[2] = {ndim, ndim};

    //local size, 4 procs, ndim = 4. each proc has a 2x2.
    int subsizes[2] = {ndim_loc, ndim_loc};
    int starts[2] = {0, 0};               //Set starting point of subarray in global array.

    if (rank == 0) {

            printf("Global arr dims = [%d,%d]\n", sizes[0], sizes[1]);
            printf("Sub arr dims = [%d,%d]\n", subsizes[0], subsizes[1]);
            printf("start point in global = [%d,%d]\n", starts[0], starts[1]);
    }

    //Preparing MPI send types.
    MPI_Datatype sub_arr, type;
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub_arr);
    MPI_Type_create_resized(sub_arr, 0, 2*sizeof(int), &type); 
    //Re-sizing block extent from one int to two ints, i.e. 1 block extent is one row of the sub array       
    MPI_Type_commit(&type);

    //Setting up arrays for sendcounts (each processor receives 1 sub array).
    int scounts[numprocs];

    //Displacements relative to global array[0][0].
    // .___.___.___.___.
    // |[0]|   |[1]|   |  [i] marks the starting position of the sub array in the global one for processor i.
    // |___|___|___|___|  So, the displacements (in units of the new block extent) are: {0,1,4,5}
    // |   |   |   |   |
    // |___|___|___|___|
    // |[2]|   |[3]|   |
    // |___|___|___|___|
    // |   |   |   |   |
    // |___|___|___|___|

    int displs[numprocs];

    for (int i=0; i<numprocs; i++) {

            scounts[i] = 1;

            if (i > 0 && i%2 == 0) {

                    displs[i] = displs[i-1] + 3;

            } else if (i == 0) {

                    displs[i] = 0;

            } else {

                    displs[i] = displs[i-1] + 1;

            }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    printf("I AM RANK %d, displ = %d, scount = %d\n", rank, displs[rank], scounts[rank]);

    //Sending uses the newly defined MPI_TYPE, receiving side is 4 MPI_INTs.
    MPI_Scatterv(&ga, scounts, displs, type, &la, (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);

    //print local array.    
    printf("RANK = %d, local data:\n", rank);

    for (int i=0; i<ndim_loc; i++) {

            for (int j=0; j<ndim_loc; j++) {

                    printf("  %d  ", la[i][j]);
            }

            printf("\n");
    }

    MPI_Finalize();

    return 0;
    }
Any help on this would be greatly appreciated.

EDIT: Following a user's suggestion, I allocated the memory contiguously instead of in a "jagged" fashion. Here is the memory allocation on the send side:

    int ndim = 4;
    int **ga = NULL;
    int *ga_pre = NULL;

    // Create global array. size is 4x4
    if (rank == 0) {

            //Contiguous allocation.
            ga_pre = malloc((ndim*ndim)*sizeof(int));
            ga = malloc(ndim*sizeof(int*));

            for (int i=0; i<ndim; i++)
                    ga[i] = &(ga_pre[ndim*i]);
My new output:

    mpirun -np 4 ./a.out 
    Send array:
     0  1  2  3 
     4  5  6  7 
     8  9  10  11 
     12  13  14  15 
    recieve array:
     0  0 
     0  0 
    start point in global = [0,0]
    Global arr dims = [4,4]
    Sub arr dims = [2,2]
    I AM RANK 0, displ = 0, scount = 1
    I AM RANK 1, displ = 1, scount = 1
    I AM RANK 2, displ = 4, scount = 1
    I AM RANK 3, displ = 5, scount = 1

    ===================================================================================
    =   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
    =   PID 23484 RUNNING AT login04
    =   EXIT CODE: 11
    =   CLEANING UP REMAINING PROCESSES
    =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
UPDATE: I have tried the contiguous memory allocation function `Create2D` from another user, @Ryker:

    int ndim = 4;
    int **ga = NULL;

    // Create global array. size is 4x4
    if (rank == 0) {

            //contiguous allocation
            ga = Create2D(ndim, ndim);

            for (int i=0; i < ndim; i++) {

                    for (int j=0; j < ndim; j++) {

                            if (i > 0) {
                                    ga[i][j] = i*ndim + j;

                            } else {
                                    ga[i][j] =i+j;

                            }
                    }
            }
    }

    //print global array.
    if (rank == 0) {

            printf("Send array:\n");
            for (int i=0; i < ndim; i++) {

                    for (int j=0; j < ndim; j++) {

                            printf(" %d ", ga[i][j]);
                    }

                    printf("\n");
            }
    }

    //Create local arrays on all procs.
    int **la = NULL;
    int ndim_loc = ((ndim*ndim)/numprocs)/2;

    //Contiguous allocation
    la = Create2D(ndim_loc, ndim_loc);

    if (rank == 0) {

            printf("recieve array:\n");

            for (int i=0; i <ndim_loc; i++) {

                    for(int j=0; j<ndim_loc; j++) {

                            la[i][j] = 0;

                            printf(" %d ", la[i][j]);
                    }

                    printf("\n");
            }
    }
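
This run ended with the same bad termination:

    mpirun -np 4 ./a.out 
    Send array:
     0  1  2  3 
     4  5  6  7 
     8  9  10  11 
     12  13  14  15 
    recieve array:
     0  0 
     0  0 
    start point in global = [0,0]
    Global arr dims = [4,4]
    Sub arr dims = [2,2]
    I AM RANK 0, displ = 0, scount = 1
    I AM RANK 2, displ = 4, scount = 1
    I AM RANK 1, displ = 1, scount = 1
    I AM RANK 3, displ = 5, scount = 1

    ===================================================================================
    =   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
    =   PID 27303 RUNNING AT login04
    =   EXIT CODE: 11
    =   CLEANING UP REMAINING PROCESSES
    =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
    ===================================================================================
       Intel(R) MPI Library troubleshooting guide:
          https://software.intel.com/node/561764
    ===================================================================================
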
UPDATE: Thanks for all the suggestions. I am going to see whether I can get this working with static arrays first, and then start working through different memory allocation routines. I will update the post once I find out what the problem is.

UPDATE (12 May 2019):

OK, so the problem was with my memory allocation, as expected. It turns out that the kind of memory allocation I was using before (in the snippets above) does not create a truly contiguous block of memory; the array is still "jagged". That is why the call to scatterv did not raise an error (4 ints were sent and 4 ints received on each processor), but it segfaulted when I tried to print the arrays. The data was placed in memory, but the way the memory addresses were referenced in the print statements implied a jagged array rather than a contiguous one, as per the web page @Ryker pointed me to.
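As an illustration (a minimal sketch, not taken from the original post): printing the row start addresses of the two layouts shows the difference. With a pointer-to-pointer allocation each row is a separate malloc block, so &a[0][0] is not the start of one contiguous ndim x ndim region; with a single flat allocation consecutive rows are exactly ndim*sizeof(int) bytes apart:

    #include <stdio.h>
    #include <stdlib.h>

    int main (void) {

            int ndim = 4;

            // "Jagged" layout: an array of row pointers, each row its own malloc block.
            int **jag = malloc(ndim*sizeof(int*));
            for (int i=0; i < ndim; i++)
                    jag[i] = malloc(ndim*sizeof(int));

            // Flat layout: one contiguous block, indexed as flat[i*ndim + j].
            int *flat = malloc((ndim*ndim)*sizeof(int));

            // The jagged row starts are unrelated addresses, so &jag[0][0] is not
            // the start of a 4x4 block; the flat rows are ndim*sizeof(int) bytes apart.
            printf("jagged rows: %p %p\n", (void*)jag[0], (void*)jag[1]);
            printf("flat rows:   %p %p\n", (void*)&flat[0], (void*)&flat[ndim]);

            for (int i=0; i < ndim; i++)
                    free(jag[i]);
            free(jag);
            free(flat);

            return 0;
    }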

So, the way to allocate and reference the memory contiguously is basically to allocate a 1D array and to reference its elements in the print/initialise statements using the row and column indices (i, j) together with the "stride" of the array (i.e. the number of elements in a row). The code is below:

    int main (int argc, char** argv) {

    int rank, numprocs;

    MPI_Init(&argc, &argv);

    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int ndim = 4;
    int *ga = NULL;

    // Create global array. size is 4x4
    if (rank == 0) {

            //contiguous allocation
            ga = malloc((ndim*ndim)*sizeof(int));

            for (int i=0; i < ndim; i++) {

                    for (int j=0; j < ndim; j++) {

                            //ndim is the 'stride'
                            if (i > 0) {
                                    ga[i*ndim +j] = i*ndim +j;

                            } else {
                                    ga[i*ndim +j] =i+j;

                            }
                    }
            }
    }

    //print global array.
    if (rank == 0) {

            printf("Send array:\n");
            for (int i=0; i < ndim; i++) {

                    for (int j=0; j < ndim; j++) {

                            printf(" %d ", ga[i*ndim +j]);
                    }

                    printf("\n");
            }
    }

    //Create local arrays on all procs.
    int *la = NULL;

    int ndim_loc = ((ndim*ndim)/numprocs)/2;

    //Contiguous allocation
    la = malloc((ndim_loc*ndim_loc)*sizeof(int));

    if (rank == 0) {

            printf("recieve array:\n");

            for (int i=0; i <ndim_loc; i++) {

                    //ndim_loc is the 'stride' of the sub-array
                    for(int j=0; j<ndim_loc; j++) {

                            la[i*ndim_loc + j] = 0;

                            printf(" %d ", la[i*ndim_loc +j]);
                    }

                    printf("\n");
            }
    }

    // global size  
    int sizes[2] = {ndim, ndim};

    //local size, 4 procs, ndim = 4. each proc has a 2x2.
    int subsizes[2] = {ndim_loc, ndim_loc};
    int starts[2] = {0, 0};

    int scounts[numprocs];
    int displs[numprocs];

    for (int i=0; i<numprocs; i++) {

            scounts[i] = 1;

            if (i > 0 && i%2 == 0) {

                    displs[i] = displs[i-1] + 3;

            } else if (i == 0) {

                    displs[i] = 0;

            } else {

                    displs[i] = displs[i-1] + 1;
            }
    }

    if (rank == 0) {

            printf("start point in global = [%d,%d]\n", starts[0], starts[1]);
            printf("Global arr dims = [%d,%d]\n", sizes[0], sizes[1]);
            printf("Sub arr dims = [%d,%d]\n", subsizes[0], subsizes[1]);
    }

    MPI_Datatype sub_arr, type;
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub_arr);
    MPI_Type_create_resized(sub_arr, 0, 2*sizeof(int), &type);
    MPI_Type_commit(&type);

    MPI_Barrier(MPI_COMM_WORLD);

    printf("I AM RANK %d, displ = %d, scount = %d\n", rank, displs[rank], scounts[rank]);

    MPI_Scatterv(ga, scounts, displs, type, la, (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);

    //print local array.    
    printf("RANK = %d, local data:\n", rank);
    for (int i=0; i<ndim_loc; i++) {

            for (int j=0; j<ndim_loc; j++) {

                    printf("  %d  ", la[i*ndim_loc +j]);
            }

            printf("\n");
    }

    MPI_Finalize();

    return 0;
    }
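
The output now:

    mpirun -np 4 ./a.out 
    Send array:
     0  1  2  3 
     4  5  6  7 
     8  9  10  11 
     12  13  14  15 
     recieve array:  
     0  0 
     0  0 
    start point in global = [0,0]
    Global arr dims = [4,4]
    Sub arr dims = [2,2]
    I AM RANK 0, displ = 0, scount = 1
    I AM RANK 1, displ = 1, scount = 1
    I AM RANK 2, displ = 4, scount = 1
    I AM RANK 3, displ = 5, scount = 1
    RANK = 0, local data:
      0    1  
      4    5  
    RANK = 1, local data:
      2    3  
      6    7  
    RANK = 2, local data:
     8    9  
     12    13     
    RANK = 3, local data:
     10    11  
     14    15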

Comments:

You have to allocate the 2D array in contiguous memory, and pass the receive buffer correctly. Please update your code and see how it goes.

@GillesGouaillardet I just updated my code: the global and local arrays are now allocated contiguously (as in my snippets above), and I have also tried a contiguous allocation for the local array with a jagged allocation for the global one. They still give me the same output as before. Do you think it could be a problem with the `starts[2]` variables, which should perhaps refer to a unique starting point for each processor, or should they just be {0,0}? EDIT: I have also tried passing &ga[0][0]/&la[0][0] as the scatterv arguments, with no effect.

Please append the updated code to the question so I can review your changes. Where in your code do you print "recieve array:" (typo...)? If you need my help, please post the full code that you compile and run. If I cannot spot the error, I will track it down with a debugger.

I will first try to do this with static arrays. Something like:

    int ga[4][4] = {0};

    MPI_Scatterv(&ga[0][0], scounts, displs, type, &la[0][0], (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);
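
For completeness, here is a minimal, self-contained sketch of that static-array suggestion. It assumes exactly 4 ranks and reuses the same subarray type, resized extent and displacements {0,1,4,5} as in the code above; it is an illustration, not the original poster's program:

    #include <stdio.h>
    #include <mpi.h>

    int main (int argc, char** argv) {

            int rank, numprocs;

            MPI_Init(&argc, &argv);
            MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
            MPI_Comm_rank(MPI_COMM_WORLD, &rank);

            if (numprocs != 4)
                    MPI_Abort(MPI_COMM_WORLD, 1);   // the fixed displacements below assume 4 ranks

            int ga[4][4] = {0};                     // global 4x4, only filled on root
            int la[2][2] = {0};                     // local 2x2 block on every rank

            if (rank == 0)
                    for (int i=0; i < 4; i++)
                            for (int j=0; j < 4; j++)
                                    ga[i][j] = i*4 + j;

            int sizes[2]    = {4, 4};
            int subsizes[2] = {2, 2};
            int starts[2]   = {0, 0};

            MPI_Datatype sub_arr, type;
            MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub_arr);
            MPI_Type_create_resized(sub_arr, 0, 2*sizeof(int), &type);   // extent = 2 ints
            MPI_Type_commit(&type);

            int scounts[4] = {1, 1, 1, 1};
            int displs[4]  = {0, 1, 4, 5};          // block origins in units of the 2-int extent

            MPI_Scatterv(&ga[0][0], scounts, displs, type,
                         &la[0][0], 4, MPI_INT, 0, MPI_COMM_WORLD);

            printf("RANK = %d, local data: %d %d / %d %d\n",
                   rank, la[0][0], la[0][1], la[1][0], la[1][1]);

            MPI_Type_free(&type);
            MPI_Type_free(&sub_arr);
            MPI_Finalize();

            return 0;
    }

With static (or any truly contiguous) storage, passing &ga[0][0] as the send buffer and &la[0][0] as the receive buffer gives Scatterv the contiguous blocks it expects.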