MPI "ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING"(ORTED_CMD 处理器:卡在无限循环中 - 正在中止)
在进行最终归约(我的程序中一组矩阵的求和)时,如下所示 MPI "ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING"(ORTED_CMD 处理器:卡在无限循环中 - 正在中止),mpi,eigen,Mpi,Eigen,在进行最终归约(我的程序中一组矩阵的求和)时,如下所示 struct Tomo { typedef Eigen::Matrix<int, HISTOGRAM_BOXES, HISTOGRAM_BOXES, Eigen::RowMajor> HistoMtx; HistoMtx exp_val; HistoMtx u; [……] 我不明白这意味着什么,也不明白什么会触发它。这似乎是OpenMPI内部的问题。可能是底层网络的问题或其他需要管理员注意的问题。例
struct Tomo {
typedef Eigen::Matrix<int, HISTOGRAM_BOXES, HISTOGRAM_BOXES, Eigen::RowMajor> HistoMtx;
HistoMtx exp_val;
HistoMtx u;
[……]
我不明白这意味着什么,也不明白什么会触发它。这似乎是OpenMPI内部的问题。可能是底层网络的问题或其他需要管理员注意的问题。例如:
/*
 * One set of per-axis Tomo arrays. Each pointer is indexed as an array
 * (X[i], Y[i], Z[i]) by the gather code that uses this structure.
 * The struct only holds the pointers; allocation happens elsewhere.
 */
struct buffer_set {
    Tomo * X;   /* array of Tomo entries for the X axis */
    Tomo * Y;   /* array of Tomo entries for the Y axis */
    Tomo * Z;   /* array of Tomo entries for the Z axis */
};

/*
 * Two buffer sets:
 *   buffers[0] - data the worker ranks send to rank 0
 *   buffers[1] - receive target on rank 0 for one sender's data
 * NOTE(review): presumably buffers[0] is also the accumulator that
 * merge_buffers(0, 1) folds buffers[1] into - confirm against merge_buffers.
 */
buffer_set buffers[2];
/*
 * Final gather step: every non-zero rank sends its local histograms to
 * rank 0, which receives them one sender at a time, calls merge_buffers
 * after each sender, and finally writes the HDF5 file.
 *
 * All messages use tag 0, so the receive sequence on rank 0 has to mirror
 * the send sequence exactly: for each environment index, exp_val of X, Y, Z
 * and then u of X, Y, Z.
 *
 * NOTE(review): if merge_buffers(0, 1) is a plain element-wise sum, a
 * collective such as MPI_Reduce with MPI_SUM could replace this hand-rolled
 * loop - confirm what merge_buffers does before changing it.
 */
if (rank != 0) {
    /* WORKER RANKS: ship the local results held in buffers[0] to rank 0. */
    Tomo * const outgoing[3] = { buffers[0].X, buffers[0].Y, buffers[0].Z };

    for (int env = 0; env < env_count; ++env) {
        /* exp_val matrices for X, Y, Z ... */
        for (int axis = 0; axis < 3; ++axis) {
            MPI_Send(outgoing[axis][env].exp_val.data(),
                     outgoing[axis][env].exp_val.size(),
                     MPI_INT, 0, 0, MPI_COMM_WORLD);
        }
        /* ... followed by the u matrices for X, Y, Z. */
        for (int axis = 0; axis < 3; ++axis) {
            MPI_Send(outgoing[axis][env].u.data(),
                     outgoing[axis][env].u.size(),
                     MPI_INT, 0, 0, MPI_COMM_WORLD);
        }
    }
} else {
    /* MASTER RANK: pull each worker's data into buffers[1], then merge it. */
    for (int sender = 1; sender < size; ++sender) {
        printf("Reducing from %i\n", sender);

        /* Pointers are re-read for every sender, after the previous merge. */
        Tomo * const incoming[3] = { buffers[1].X, buffers[1].Y, buffers[1].Z };

        for (int env = 0; env < env_count; ++env) {
            /* Same order as the sender: exp_val for X, Y, Z ... */
            for (int axis = 0; axis < 3; ++axis) {
                MPI_Recv(incoming[axis][env].exp_val.data(),
                         incoming[axis][env].exp_val.size(),
                         MPI_INT, sender, 0, MPI_COMM_WORLD, &status);
            }
            /* ... then u for X, Y, Z. */
            for (int axis = 0; axis < 3; ++axis) {
                MPI_Recv(incoming[axis][env].u.data(),
                         incoming[axis][env].u.size(),
                         MPI_INT, sender, 0, MPI_COMM_WORLD, &status);
            }
        }

        /* Combine this sender's data (set 1) with set 0 before the next sender overwrites set 1. */
        merge_buffers(0, 1);
    }

    WriteH5File("h5file.h5", 0);
}
[compute-35-3.local:01139] [[33012,0],2] ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING
[compute-35-3:01139] *** Process received signal ***