使用 C MPI 对 2D 数组进行散射和聚集(Scatterv / Gatherv)
因此,我试图建立一个通信例程,在该例程中,我使用 MPI_Gatherv 将一些 2D 数组从多个处理器传输回 root。我一直在尝试使用 Scatterv 和 Gatherv 实现这一点,首先是一个简单的示例:一个 4x4 数组,我试图将其划分为四个 2x2 数组,然后用 Scatterv 函数在 4 个处理器之间分散。我目前已经达到了根处理器能够打印出其 2x2 数组的程度,但是一旦下一个处理器尝试打印出其本地数据,我就会得到一个 segfault 错误;如果我不尝试打印本地数组,就不会有任何错误。这是我的代码:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
/*
 * First attempt (the failing version from the question): scatter a 4x4
 * global array into four 2x2 blocks, one per rank, via MPI_Scatterv with
 * a resized subarray datatype.
 *
 * NOTE(review): this version segfaults on non-root ranks. Two problems are
 * visible below: the arrays are allocated row-by-row (jagged, NOT one
 * contiguous buffer), and &ga / &la -- the addresses of the int** pointer
 * variables, not of the matrix data -- are passed to MPI_Scatterv.
 */
int main (int argc, char** argv) {
int rank, numprocs;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int ndim = 4;
int **ga = NULL;
// Create global array. size is 4x4, Only for Root.
if (rank == 0) {
// NOTE(review): one malloc per row gives a jagged array; the 16 ints are
// not contiguous in memory, but the subarray type used below assumes a
// single contiguous ndim x ndim block.
ga = malloc(ndim*sizeof(int*));
for (int i=0; i < ndim; i++)
ga[i] = malloc(ndim*sizeof(int));
for (int i=0; i < ndim; i++) {
for (int j=0; j < ndim; j++) {
// Both branches yield i*ndim + j (for i == 0, i+j == j).
if (i > 0) {
ga[i][j] = i*ndim + j;
} else {
ga[i][j] =i+j;
}
}
}
}
//print global array.
if (rank == 0) {
printf("Send array:\n");
for (int i=0; i < ndim; i++) {
for (int j=0; j < ndim; j++) {
printf(" %d ", ga[i][j]);
}
printf("\n");
}
}
//Create local arrays on all procs.
int **la = NULL;
//local array size is 2x2.
// ndim_loc = (16/4)/2 = 2 for ndim = 4 and 4 processes.
int ndim_loc = ((ndim*ndim)/numprocs)/2;
// NOTE(review): jagged allocation again -- the receive buffer handed to
// MPI_Scatterv below must be contiguous.
la = (int**)malloc(ndim_loc*sizeof(int*));
for (int i=0; i< ndim_loc; i++)
la[i] = (int*)malloc(ndim_loc*sizeof(int));
if (rank == 0) {
printf("recieve array:\n");
for (int i=0; i <ndim_loc; i++) {
for(int j=0; j<ndim_loc; j++) {
la[i][j] = 0;
printf(" %d ", la[i][j]);
}
printf("\n");
}
}
// global size
int sizes[2] = {ndim, ndim};
//local size, 4 procs, ndim = 4. each proc has a 2x2.
int subsizes[2] = {ndim_loc, ndim_loc};
int starts[2] = {0, 0}; //Set starting point of subarray in global array.
if (rank == 0) {
printf("Global arr dims = [%d,%d]\n", sizes[0], sizes[1]);
printf("Sub arr dims = [%d,%d]\n", subsizes[0], subsizes[1]);
printf("start point in global = [%d,%d]\n", starts[0], starts[1]);
}
//Preparing MPI send types.
MPI_Datatype sub_arr, type;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub_arr);
// Extent shrunk to 2 ints so displacements step one block-column at a time.
MPI_Type_create_resized(sub_arr, 0, 2*sizeof(int), &type);
//Re-sizing block extent from one int to two ints, i.e. 1 block extent is one row of the sub array
MPI_Type_commit(&type);
//Setting up arrays for sendcounts (each processor receives 1 sub array).
int scounts[numprocs];
//Displacements relative to global array[0][0].
// .___.___.___.___.
// |[0]| |[1]| | [i] marks the starting position of the sub array in the global one for processor i.
// |___|___|___|___| So, the displacements (in units of the new block extent) are: {0,1,4,5}
// | | | | |
// |___|___|___|___|
// |[2]| |[3]| |
// |___|___|___|___|
// | | | | |
// |___|___|___|___|
int displs[numprocs];
for (int i=0; i<numprocs; i++) {
scounts[i] = 1;
if (i > 0 && i%2 == 0) {
// Even rank starting a new row of blocks: skip over the second half row.
displs[i] = displs[i-1] + 3;
} else if (i == 0) {
displs[i] = 0;
} else {
displs[i] = displs[i-1] + 1;
}
}
MPI_Barrier(MPI_COMM_WORLD);
printf("I AM RANK %d, displ = %d, scount = %d\n", rank, displs[rank], scounts[rank]);
//Sending uses the newly defined MPI_TYPE, receiving side is 4 MPI_INTs.
// NOTE(review): &ga and &la are the addresses of the int** variables, not
// of the matrix data. With contiguous storage these should be ga[0] (or
// &ga[0][0]) and la[0]; with the jagged arrays above no single base address
// exists at all -- this is why non-root ranks crash when touching la.
MPI_Scatterv(&ga, scounts, displs, type, &la, (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
//print local array.
printf("RANK = %d, local data:\n", rank);
for (int i=0; i<ndim_loc; i++) {
for (int j=0; j<ndim_loc; j++) {
printf(" %d ", la[i][j]);
}
printf("\n");
}
// NOTE(review): the snippet is truncated in the post -- the closing brace
// of main, MPI_Type_free and MPI_Finalize() are missing here.
在此方面的任何帮助都将不胜感激
编辑:
有用户建议以连续方式分配内存,而不是"锯齿状"地逐行分配。以下是发送端(root)的内存分配:
int ndim = 4;
int **ga = NULL;
int *ga_pre = NULL;
// Create global array. size is 4x4
if (rank == 0) {
//Contiguous allocation.
// ga_pre holds all 16 ints back-to-back; ga[i] are row pointers into it,
// so ga_pre (== &ga[0][0]) is a valid single base address for MPI calls.
ga_pre = malloc((ndim*ndim)*sizeof(int));
ga = malloc(ndim*sizeof(int*));
for (int i=0; i<ndim; i++)
ga[i] = &(ga_pre[ndim*i]);
// NOTE(review): fragment is truncated here (the if-block is not closed).
// The send buffer passed to MPI_Scatterv must then be ga_pre, not &ga.
我的新输出:
mpirun -np 4 ./a.out
Send array:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
recieve array:
0 0
0 0
start point in global = [0,0]
Global arr dims = [4,4]
Sub arr dims = [2,2]
I AM RANK 0, displ = 0, scount = 1
I AM RANK 1, displ = 1, scount = 1
I AM RANK 2, displ = 4, scount = 1
I AM RANK 3, displ = 5, scount = 1
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 23484 RUNNING AT login04
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
更新:
我已尝试使用其他用户@Ryker的连续内存分配函数:
int ndim = 4;
int **ga = NULL;
// Create global array. size is 4x4
if (rank == 0) {
//contiguous allocation
// NOTE(review): Create2D is not shown in the post; whether this attempt
// works depends entirely on it returning one contiguous block with row
// pointers into it -- TODO confirm its implementation.
ga = Create2D(ndim, ndim);
for (int i=0; i < ndim; i++) {
for (int j=0; j < ndim; j++) {
// Both branches yield i*ndim + j (for i == 0, i+j == j).
if (i > 0) {
ga[i][j] = i*ndim + j;
} else {
ga[i][j] =i+j;
}
}
}
}
//print global array.
if (rank == 0) {
printf("Send array:\n");
for (int i=0; i < ndim; i++) {
for (int j=0; j < ndim; j++) {
printf(" %d ", ga[i][j]);
}
printf("\n");
}
}
//Create local arrays on all procs.
int **la = NULL;
// 2x2 per rank: (16/4)/2 = 2.
int ndim_loc = ((ndim*ndim)/numprocs)/2;
//Contiguous allocation
la = Create2D(ndim_loc, ndim_loc);
if (rank == 0) {
printf("recieve array:\n");
for (int i=0; i <ndim_loc; i++) {
for(int j=0; j<ndim_loc; j++) {
la[i][j] = 0;
printf(" %d ", la[i][j]);
}
printf("\n");
}
}
更新:感谢所有的建议,我将先看看是否可以使用静态数组完成此任务。在这一点上,我将开始使用不同类型的内存分配例程。一旦我发现问题所在,我会更新帖子
更新(2019年5月12日):
好的,问题出在我的内存分配上,正如预期的那样。事实证明,我之前使用的那种内存分配方式(在上面的代码段中)并不能创建真正连续的内存块,数组仍然是"锯齿状"的。这就是为什么对 Scatterv 的调用本身没有报错(每个处理器发送了 4 个整数,也接收了 4 个整数),但当我试图打印数组时却导致了 segfault:数据确实被放进了内存,但 print 语句引用内存地址的方式假定的是一个锯齿数组,而不是一个连续数组——正如 @Ryker 给我引用的网页所解释的那样。
因此,连续分配和引用内存的方法基本上是在print/initialise语句中使用行和列标记(i,j)以及数组的“跨步”(即一行中的元素)分配1D数组和引用元素。守则如下:
/*
 * Working version: root scatters a contiguous 4x4 int array into four
 * 2x2 blocks, one per rank, using MPI_Scatterv with a resized subarray
 * datatype, then each rank prints its block.
 *
 * Fixes relative to the posted snippet:
 *  - allocations use sizeof(int) (was sizeof(int*), which over-allocates),
 *  - the local-array initialisation indexes with the local stride ndim_loc
 *    (was ndim, which wrote past the end of the 4-int buffer),
 *  - the displacement loop is closed before the datatype is created
 *    (matching the single set of dimension printouts in the shown output),
 *  - datatypes and buffers are released and MPI_Finalize() is called.
 */
int main (int argc, char** argv) {
    int rank, numprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int ndim = 4;
    int *ga = NULL;
    // Create global array. size is 4x4, one contiguous buffer, root only.
    if (rank == 0) {
        ga = malloc((ndim*ndim)*sizeof(int));
        for (int i = 0; i < ndim; i++) {
            for (int j = 0; j < ndim; j++) {
                // ndim is the row 'stride'; stored value == linear index.
                ga[i*ndim + j] = i*ndim + j;
            }
        }
    }
    // Print global array.
    if (rank == 0) {
        printf("Send array:\n");
        for (int i = 0; i < ndim; i++) {
            for (int j = 0; j < ndim; j++) {
                printf(" %d ", ga[i*ndim + j]);
            }
            printf("\n");
        }
    }

    // Local receive buffer on every rank: 2x2 when ndim = 4, numprocs = 4.
    int ndim_loc = ((ndim*ndim)/numprocs)/2;
    int *la = malloc((ndim_loc*ndim_loc)*sizeof(int));
    if (rank == 0) {
        printf("recieve array:\n");
        for (int i = 0; i < ndim_loc; i++) {
            for (int j = 0; j < ndim_loc; j++) {
                // ndim_loc is the sub-array stride (the original used ndim
                // here -- an out-of-bounds write on the 4-int buffer).
                la[i*ndim_loc + j] = 0;
                printf(" %d ", la[i*ndim_loc + j]);
            }
            printf("\n");
        }
    }

    // Global size, per-rank block size, and subarray origin.
    int sizes[2] = {ndim, ndim};
    int subsizes[2] = {ndim_loc, ndim_loc};
    int starts[2] = {0, 0};
    int scounts[numprocs];
    int displs[numprocs];
    // Displacements in units of the resized extent (2 ints): {0, 1, 4, 5}.
    for (int i = 0; i < numprocs; i++) {
        scounts[i] = 1;                   // one subarray per rank
        if (i == 0) {
            displs[i] = 0;
        } else if (i % 2 == 0) {
            displs[i] = displs[i-1] + 3;  // jump down to the next row of blocks
        } else {
            displs[i] = displs[i-1] + 1;  // next block in the same row
        }
    }
    if (rank == 0) {
        printf("start point in global = [%d,%d]\n", starts[0], starts[1]);
        printf("Global arr dims = [%d,%d]\n", sizes[0], sizes[1]);
        printf("Sub arr dims = [%d,%d]\n", subsizes[0], subsizes[1]);
    }

    MPI_Datatype sub_arr, type;
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub_arr);
    // Shrink the extent to 2 ints so successive displacements advance by one
    // block-column (half a global row).
    MPI_Type_create_resized(sub_arr, 0, 2*sizeof(int), &type);
    MPI_Type_commit(&type);
    MPI_Barrier(MPI_COMM_WORLD);
    printf("I AM RANK %d, displ = %d, scount = %d\n", rank, displs[rank], scounts[rank]);
    // Send one resized subarray per rank; receive as ndim_loc^2 plain ints.
    MPI_Scatterv(ga, scounts, displs, type, la, (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    // Print local array.
    printf("RANK = %d, local data:\n", rank);
    for (int i = 0; i < ndim_loc; i++) {
        for (int j = 0; j < ndim_loc; j++) {
            printf(" %d ", la[i*ndim_loc + j]);
        }
        printf("\n");
    }

    MPI_Type_free(&type);
    MPI_Type_free(&sub_arr);
    free(la);
    free(ga);   // free(NULL) is a no-op on non-root ranks
    MPI_Finalize();
    return 0;
}
(评论区)— 您必须在连续内存中分配 2D 数组,并正确传递接收缓冲区。请更新您的代码,看看情况如何。
— @GillesGouaillardet 我刚刚更新了我的代码:全局数组和本地数组都是连续分配的(就像我的代码片段中那样);我还尝试过本地数组连续分配、全局数组锯齿分配的组合,它们仍然给出和以前一样的输出。您认为问题是否可能出在变量 starts[2] 上——它应该指向每个处理器各自的起点,还是保持 {0,0} 即可?编辑:我也尝试过将 &ga[0][0] / &la[0][0] 作为 Scatterv 的参数,但没有效果。
— 请将更新后的代码附加到问题中,以便我检查您的更改。您在代码中的何处打印 "recieve array:"(有拼写错误……)?如果您需要我的帮助,请发布您编译和运行的完整代码;如果我不能发现错误,我将使用调试器来跟踪它。
— 我将首先尝试使用静态数组来实现这一点,类似于:
int ga[4][4] = {0};
MPI_Scatterv(&ga[0][0], scounts, displs, type, &la[0][0], (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);
mpirun -np 4 ./a.out
Send array:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
recieve array:
0 0
0 0
start point in global = [0,0]
Global arr dims = [4,4]
Sub arr dims = [2,2]
I AM RANK 0, displ = 0, scount = 1
I AM RANK 1, displ = 1, scount = 1
I AM RANK 2, displ = 4, scount = 1
I AM RANK 3, displ = 5, scount = 1
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 23484 RUNNING AT login04
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
int ndim = 4;
int **ga = NULL;
// Create global array. size is 4x4
if (rank == 0) {
//contiguous allocation
// NOTE(review): Create2D is not shown in the post; this attempt is only
// correct if Create2D returns one contiguous block with row pointers into
// it -- TODO confirm its implementation.
ga = Create2D(ndim, ndim);
for (int i=0; i < ndim; i++) {
for (int j=0; j < ndim; j++) {
// Both branches yield i*ndim + j (for i == 0, i+j == j).
if (i > 0) {
ga[i][j] = i*ndim + j;
} else {
ga[i][j] =i+j;
}
}
}
}
//print global array.
if (rank == 0) {
printf("Send array:\n");
for (int i=0; i < ndim; i++) {
for (int j=0; j < ndim; j++) {
printf(" %d ", ga[i][j]);
}
printf("\n");
}
}
//Create local arrays on all procs.
int **la = NULL;
// 2x2 per rank: (16/4)/2 = 2.
int ndim_loc = ((ndim*ndim)/numprocs)/2;
//Contiguous allocation
la = Create2D(ndim_loc, ndim_loc);
if (rank == 0) {
printf("recieve array:\n");
for (int i=0; i <ndim_loc; i++) {
for(int j=0; j<ndim_loc; j++) {
la[i][j] = 0;
printf(" %d ", la[i][j]);
}
printf("\n");
}
}
mpirun -np 4 ./a.out
Send array:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
recieve array:
0 0
0 0
start point in global = [0,0]
Global arr dims = [4,4]
Sub arr dims = [2,2]
I AM RANK 0, displ = 0, scount = 1
I AM RANK 2, displ = 4, scount = 1
I AM RANK 1, displ = 1, scount = 1
I AM RANK 3, displ = 5, scount = 1
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 27303 RUNNING AT login04
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
Intel(R) MPI Library troubleshooting guide:
https://software.intel.com/node/561764
===================================================================================
/*
 * Final version: root scatters a contiguous 4x4 int array into four 2x2
 * blocks, one per rank, via MPI_Scatterv with a resized subarray datatype;
 * each rank then prints its local block.
 *
 * Fixes relative to the posted snippet:
 *  - allocations use sizeof(int) (was sizeof(int*), which over-allocates),
 *  - the local-array initialisation indexes with the local stride ndim_loc
 *    (was ndim, which wrote past the end of the 4-int buffer),
 *  - the displacement loop is closed before the datatype is created
 *    (matching the single set of dimension printouts in the shown output),
 *  - datatypes and buffers are released and MPI_Finalize() is called.
 */
int main (int argc, char** argv) {
    int rank, numprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int ndim = 4;
    int *ga = NULL;
    // Create global array. size is 4x4, one contiguous buffer, root only.
    if (rank == 0) {
        ga = malloc((ndim*ndim)*sizeof(int));
        for (int i = 0; i < ndim; i++) {
            for (int j = 0; j < ndim; j++) {
                // ndim is the row 'stride'; stored value == linear index.
                ga[i*ndim + j] = i*ndim + j;
            }
        }
    }
    // Print global array.
    if (rank == 0) {
        printf("Send array:\n");
        for (int i = 0; i < ndim; i++) {
            for (int j = 0; j < ndim; j++) {
                printf(" %d ", ga[i*ndim + j]);
            }
            printf("\n");
        }
    }

    // Local receive buffer on every rank: 2x2 when ndim = 4, numprocs = 4.
    int ndim_loc = ((ndim*ndim)/numprocs)/2;
    int *la = malloc((ndim_loc*ndim_loc)*sizeof(int));
    if (rank == 0) {
        printf("recieve array:\n");
        for (int i = 0; i < ndim_loc; i++) {
            for (int j = 0; j < ndim_loc; j++) {
                // ndim_loc is the sub-array stride (the original used ndim
                // here -- an out-of-bounds write on the 4-int buffer).
                la[i*ndim_loc + j] = 0;
                printf(" %d ", la[i*ndim_loc + j]);
            }
            printf("\n");
        }
    }

    // Global size, per-rank block size, and subarray origin.
    int sizes[2] = {ndim, ndim};
    int subsizes[2] = {ndim_loc, ndim_loc};
    int starts[2] = {0, 0};
    int scounts[numprocs];
    int displs[numprocs];
    // Displacements in units of the resized extent (2 ints): {0, 1, 4, 5}.
    for (int i = 0; i < numprocs; i++) {
        scounts[i] = 1;                   // one subarray per rank
        if (i == 0) {
            displs[i] = 0;
        } else if (i % 2 == 0) {
            displs[i] = displs[i-1] + 3;  // jump down to the next row of blocks
        } else {
            displs[i] = displs[i-1] + 1;  // next block in the same row
        }
    }
    if (rank == 0) {
        printf("start point in global = [%d,%d]\n", starts[0], starts[1]);
        printf("Global arr dims = [%d,%d]\n", sizes[0], sizes[1]);
        printf("Sub arr dims = [%d,%d]\n", subsizes[0], subsizes[1]);
    }

    MPI_Datatype sub_arr, type;
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub_arr);
    // Shrink the extent to 2 ints so successive displacements advance by one
    // block-column (half a global row).
    MPI_Type_create_resized(sub_arr, 0, 2*sizeof(int), &type);
    MPI_Type_commit(&type);
    MPI_Barrier(MPI_COMM_WORLD);
    printf("I AM RANK %d, displ = %d, scount = %d\n", rank, displs[rank], scounts[rank]);
    // Send one resized subarray per rank; receive as ndim_loc^2 plain ints.
    MPI_Scatterv(ga, scounts, displs, type, la, (ndim_loc*ndim_loc), MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    // Print local array.
    printf("RANK = %d, local data:\n", rank);
    for (int i = 0; i < ndim_loc; i++) {
        for (int j = 0; j < ndim_loc; j++) {
            printf(" %d ", la[i*ndim_loc + j]);
        }
        printf("\n");
    }

    MPI_Type_free(&type);
    MPI_Type_free(&sub_arr);
    free(la);
    free(ga);   // free(NULL) is a no-op on non-root ranks
    MPI_Finalize();
    return 0;
}
mpirun -np 4 ./a.out
Send array:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
recieve array:
0 0
0 0
start point in global = [0,0]
Global arr dims = [4,4]
Sub arr dims = [2,2]
I AM RANK 0, displ = 0, scount = 1
I AM RANK 1, displ = 1, scount = 1
I AM RANK 2, displ = 4, scount = 1
I AM RANK 3, displ = 5, scount = 1
RANK = 0, local data:
0 1
4 5
RANK = 1, local data:
2 3
6 7
RANK = 2, local data:
8 9
12 13
RANK = 3, local data:
10 11
14 15