C MPI io按进程按行平均读取文件（而不是按块大小）_C_Mpi

C MPI io按进程按行平均读取文件（而不是按块大小）

c mpi

C MPI io按进程按行平均读取文件（而不是按块大小）,c,mpi,C,Mpi,我是MPI新手，我有这个问题。我想读取一个超过20000行的文件的内容，然后将这些行平均分配给所有进程进行进一步处理。文件中每一行的内容如下（两列数字）我需要在运行时将这些行等分为任意数量的进程（进程的数量可以是2,3,4,5，…，128）我知道如何将文件分割成块，但我需要保留每行中的值，因此我需要逐行读取这是我用来完成这项工作的MPI代码和串行代码，但我遇到了分段错误 /* Open the file */ MPI_File_open (MPI_COMM_WORLD, "small.tx

我是MPI新手，我有这个问题。我想读取一个超过20000行的文件的内容，然后将这些行平均分配给所有进程进行进一步处理。文件中每一行的内容如下（两列数字）

我需要在运行时将这些行等分为任意数量的进程（进程的数量可以是2,3,4,5，…，128）

我知道如何将文件分割成块，但我需要保留每行中的值，因此我需要逐行读取

这是我用来完成这项工作的MPI代码和串行代码，但我遇到了分段错误

/*
 * Fragment from the question: open a file with MPI-IO, derive a per-process
 * byte count, set a per-process file view, then try to count lines.
 * NOTE(review): the declarations of myfile, filesize, bufsize, buf, np and
 * myid are not part of this fragment.
 */
/* Open the file collectively on MPI_COMM_WORLD, read-only */
MPI_File_open (MPI_COMM_WORLD, "small.txt", MPI_MODE_RDONLY, MPI_INFO_NULL, &myfile);
/* Get the size of the file */
MPI_File_get_size(myfile, &filesize);
/* Convert the size to a count of char elements (divides by sizeof(char)) */
filesize = filesize/sizeof(char);

/* Elements per process: integer division by np, so any remainder is not assigned here */
bufsize = filesize/np;
/* Allocate the buffer to read to, one extra for terminating null char */
/* NOTE(review): buf is not used again within this fragment */
buf = (char *) malloc((bufsize+1)*sizeof(char));


/* Set the file view: this process's view starts myid*bufsize chars into the file */
MPI_File_set_view(myfile, myid*bufsize*sizeof(char), MPI_CHAR, MPI_CHAR,"native",MPI_INFO_NULL);


/*
 * NOTE(review): count_lines() as shown below is declared to take a FILE *,
 * whereas myfile is the handle filled in by MPI_File_open above; the answer
 * on this page identifies this call as the likely source of the segfault.
 */
Nooflines_Real = count_lines(myfile);
/* Prints argv[1] as the file name, although the file opened above is the literal "small.txt" */
printf("%s contains %d lines\n", argv[1], Nooflines_Real);


/*
 * Count the lines remaining in a stdio stream, then rewind it.
 *
 * A line is a run of characters terminated by '\n'; a final run that is not
 * terminated by '\n' also counts as one line. Lines of any length are counted
 * exactly once (a fixed-size fgets buffer would count a long line several
 * times).
 *
 * infile: an open, readable FILE * (not an MPI_File handle).
 * Returns the number of lines read; the stream is rewound before returning.
 */
int count_lines (FILE *infile) {
  int lines = 0;
  int prev = '\n';   /* treat "nothing read yet" like the start of a line */
  int c;

  while ((c = fgetc(infile)) != EOF) {
    if (c == '\n') lines++;
    prev = c;
  }
  /* a last line without a trailing newline still counts */
  if (prev != '\n') lines++;

  rewind(infile);
  return lines;
}

您传入的参数 `myfile` 是 `MPI_File` 类型的变量，而不是 `FILE *` 类型，因此不能把它用于 `fgets()`、`rewind()` 等标准 I/O 函数。这很可能就是错误的根源。

我的建议是采用这种方法：让每个进程读取文件中相互重叠的数据块（因为事先并不知道一行有多长），然后各进程在自己读到的块中找出完整的行并加以处理。如果您真正在意的是每个进程得到（尽可能）完全相同的行数，那么可以让各进程在读取之后彼此交换数据，使行数相等。

更新：如果您确实想这样做（请注意，如果输入全是数字，采用二进制格式会容易得多），下面的代码让每个进程像读取其他数据一样读入文本文件中属于自己的那一部分，然后逐行处理（例如把两列数字相加）。它是我上面链接的那个答案的简单扩展：

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/*
 * Read this rank's share of a text file and split it into whole lines.
 *
 * The file is divided into `size` equal byte ranges. Every rank except the
 * last also reads `overlap` extra bytes past its range so that the line
 * straddling the boundary can be completed. Ownership of lines:
 *   - a rank other than 0 starts after the first '\n' at or after its offset;
 *   - a rank other than the last stops after the first '\n' at or after the
 *     next rank's offset;
 * so each line is handed to exactly one rank, provided `overlap` is at least
 * as large as the longest line.
 *
 * in      : open MPI file handle (the read is collective: all ranks must call).
 * rank    : this process's rank; size: number of ranks sharing the file.
 * overlap : extra bytes read past the nominal range (expected >= 0).
 * lines   : out - array of pointers to NUL-terminated lines. Blank lines are
 *           kept as empty strings. (*lines)[0] is always the start of the
 *           single text buffer, even when *nlines is 0.
 * nlines  : out - number of lines stored in *lines.
 *
 * Ownership: the caller releases the result with free((*lines)[0]) followed by
 * free(*lines). Allocation or read failure is fatal (abort()).
 */
void readlines(MPI_File *in, const int rank, const int size, const int overlap,
               char ***lines, int *nlines) {
    MPI_Offset filesize = 0;
    MPI_Offset share;
    MPI_Offset start;
    MPI_Offset end;        /* exclusive file offset of the last byte to read */
    MPI_Offset nread;
    MPI_Offset locstart;
    MPI_Offset locend;     /* exclusive index into chunk */
    char *chunk;

    /* figure out who reads what */
    MPI_File_get_size(*in, &filesize);
    share = filesize / size;
    start = rank * share;

    if (rank == size - 1) {
        /* the last rank takes everything up to end of file */
        end = filesize;
    } else {
        /* everyone else reads into the next rank's range, but never past EOF */
        end = start + share + overlap;
        if (end > filesize) end = filesize;
    }
    if (end < start) end = start;   /* guards against a negative overlap */
    nread = end - start;

    /* zero-filled so a short read can never expose uninitialised bytes */
    chunk = calloc((size_t)nread + 1, sizeof *chunk);
    if (chunk == NULL) {
        fprintf(stderr, "readlines: rank %d: out of memory\n", rank);
        abort();
    }

    /* everyone reads in their part (collective call, even for zero bytes) */
    if (MPI_File_read_at_all(*in, start, chunk, (int)nread, MPI_CHAR,
                             MPI_STATUS_IGNORE) != 0) {
        fprintf(stderr, "readlines: rank %d: read failed\n", rank);
        abort();
    }
    chunk[nread] = '\0';

    /* skip the partial first line: it belongs to the previous rank */
    locstart = 0;
    if (rank != 0) {
        while (locstart < nread && chunk[locstart] != '\n') locstart++;
        if (locstart < nread) locstart++;          /* step past the newline */
    }

    /* finish the line that straddles the boundary with the next rank */
    locend = nread;
    if (rank != size - 1) {
        locend = (share < nread) ? share : nread;  /* next rank's start offset */
        while (locend < nread && chunk[locend] != '\n') locend++;
        if (locend < nread) locend++;              /* keep the newline */
    }
    if (locend < locstart) locend = locstart;

    /* Now copy our actual data over into a new array, with no overlaps */
    const size_t datalen = (size_t)(locend - locstart);
    char *data = malloc(datalen + 1);
    if (data == NULL) {
        fprintf(stderr, "readlines: rank %d: out of memory\n", rank);
        abort();
    }
    memcpy(data, chunk + locstart, datalen);
    data[datalen] = '\0';
    free(chunk);

    /* count lines: one per newline, plus a final line lacking a newline */
    int count = 0;
    for (size_t i = 0; i < datalen; i++)
        if (data[i] == '\n') count++;
    if (datalen > 0 && data[datalen - 1] != '\n') count++;

    /* one spare slot so (*lines)[0] exists even when there are no lines */
    char **index = malloc(((size_t)count + 1) * sizeof *index);
    if (index == NULL) {
        fprintf(stderr, "readlines: rank %d: out of memory\n", rank);
        abort();
    }
    index[0] = data;

    /* point each entry at the start of a line and terminate it in place */
    int n = 0;
    char *p = data;
    char *const stop = data + datalen;
    while (p < stop && n < count) {
        index[n++] = p;
        char *nl = memchr(p, '\n', (size_t)(stop - p));
        if (nl == NULL) break;                     /* unterminated last line */
        *nl = '\0';
        p = nl + 1;
    }

    *lines  = index;
    *nlines = n;
}

/*
 * Parse two floats from each line and print them together with their sum.
 *
 * lines  : array of nlines NUL-terminated strings; NULL entries are skipped.
 * nlines : number of entries in lines.
 * rank   : rank number used as the prefix of every output line.
 *
 * Lines that do not contain two numbers are reported on stderr and skipped,
 * instead of printing uninitialised values.
 */
void processlines(char **lines, const int nlines, const int rank) {
    for (int i=0; i<nlines; i++) {
        float a, b;

        if (lines[i] == NULL)
            continue;

        /* sscanf reports how many fields it converted; both are required */
        if (sscanf(lines[i],"%f %f", &a, &b) != 2) {
            fprintf(stderr, "%d: skipping malformed line <%s>\n", rank, lines[i]);
            continue;
        }
        printf("%d: <%s>: %f + %f = %f\n", rank, lines[i], a, b, a+b);
    }
}

/*
 * Driver: every rank opens the file named on the command line, reads its
 * share of lines via readlines(), reports how many it got, and hands them to
 * processlines().
 *
 * Exit status: 0 on success, 1 on a usage error, 2 if the file cannot be
 * opened.
 */
int main(int argc, char **argv) {
    /* extra bytes each rank reads past its nominal share (passed to readlines) */
    const int overlap = 100;

    int rank, size;
    MPI_File in;
    char **lines;
    int nlines;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* only rank 0 prints diagnostics, so messages are not repeated per rank */
    const int is_root = (rank == 0);

    /* exactly one argument is expected: the input file name */
    if (argc != 2) {
        if (is_root)
            fprintf(stderr, "Usage: %s infilename\n", argv[0]);
        MPI_Finalize();
        exit(1);
    }

    if (MPI_File_open(MPI_COMM_WORLD, argv[1], MPI_MODE_RDONLY,
                      MPI_INFO_NULL, &in) != 0) {
        if (is_root)
            fprintf(stderr, "%s: Couldn't open file %s\n", argv[0], argv[1]);
        MPI_Finalize();
        exit(2);
    }

    readlines(&in, rank, size, overlap, &lines, &nlines);
    printf("Rank %d has %d lines\n", rank, nlines);
    processlines(lines, nlines, rank);

    /* the text (reached through lines[0]) and the pointer array are freed separately */
    free(lines[0]);
    free(lines);

    MPI_File_close(&in);
    MPI_Finalize();
    return 0;
}

#包括
#包括
#包括
#包括
#包括
无效读线（MPI_文件*in、常量整数秩、常量整数大小、常量整数重叠、，
字符***行，整数*行）{
MPI_偏移量文件大小；
MPI_偏移量localsize；
MPI_偏移开始；
MPI_偏置端；
字符*块；
/*找出谁在读什么*/
MPI文件大小（*in，&filesize）；
localsize=filesize/size；
开始=排名*本地大小；
end=start+localsize-1；
/*在每个人的区块末尾添加重叠*/
结束+=重叠；
/*当然，除了最后一个处理器*/
如果（rank==size-1）end=filesize；
localsize=end-start+1；
/*分配内存*/
chunk=malloc（（localsize+1）*sizeof（char））；
/*每个人都读自己的部分*/
MPI文件在所有位置读取（*输入、开始、块、本地大小、MPI字符、MPI状态忽略）；
区块[localsize]='\0'；
/*
*每个人都会计算他们的起点和终点
*从开始后的第一个换行到结束后的第一个换行
*重叠区域开始（例如，结束后-重叠+1）
*/
int locstart=0，locend=localsize；
如果（秩！=0）{
while（chunk[locstart]！='\n'）locstart++；
locstart++；
}
如果（等级！=尺寸-1）{
locend-=重叠；
while（chunk[locend]！='\n'）locend++；
}
localsize=locend-locstart+1；
/*现在，让我们将实际数据复制到一个新数组中，没有重叠*/
char*data=（char*）malloc（（localsize+1）*sizeof（char））；
memcpy（数据和块[locstart]），localsize；
数据[localsize]='\0'；
自由（块）；
/*现在我们来计算行数*/
*nlines=0；
对于（int i=0；i 1*/
*行=（char**）malloc（（*nlines）*sizeof（char*）；
（*行）[0]=strtok（数据，“\n”）；
对于（int i=1；i
您是否尝试过用调试器查看这个段错误发生在代码的哪个位置？
段错误是在加入 Nooflines_Real = count_lines(myfile); 这行代码之后出现的。
我已经在下面的答案中补充了可运行的代码。
顺便问一下，由文件指针类型不同导致的段错误该如何解决？
解决方法是不要再用那些期望 FILE * 的 POSIX 文件函数去处理 MPI_File 类型的对象。如果您手里是一个 MPI_File，就对它调用 MPI-IO 例程，例如 MPI_File_read、MPI_File_seek 等。
谢谢，您能告诉我在哪里可以找到 MPI-IO 例程的参考资料吗？
@user1733911，MPI 标准文档是参考信息以及（有时）有用示例的重要来源，它有 PDF 和 HTML 两种版本。目前请先使用 MPI-2.2，因为 MPI-3.0 还很新，大多数 MPI 实现仍只符合 2.2 版。
@JonathanDursi 我们如何把秩 0 和秩 1 的输出合并起来？
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/*
 * Read this rank's share of a text file and split it into whole lines.
 *
 * The file is divided into `size` equal byte ranges. Every rank except the
 * last also reads `overlap` extra bytes past its range so that the line
 * straddling the boundary can be completed. Ownership of lines:
 *   - a rank other than 0 starts after the first '\n' at or after its offset;
 *   - a rank other than the last stops after the first '\n' at or after the
 *     next rank's offset;
 * so each line is handed to exactly one rank, provided `overlap` is at least
 * as large as the longest line.
 *
 * in      : open MPI file handle (the read is collective: all ranks must call).
 * rank    : this process's rank; size: number of ranks sharing the file.
 * overlap : extra bytes read past the nominal range (expected >= 0).
 * lines   : out - array of pointers to NUL-terminated lines. Blank lines are
 *           kept as empty strings. (*lines)[0] is always the start of the
 *           single text buffer, even when *nlines is 0.
 * nlines  : out - number of lines stored in *lines.
 *
 * Ownership: the caller releases the result with free((*lines)[0]) followed by
 * free(*lines). Allocation or read failure is fatal (abort()).
 */
void readlines(MPI_File *in, const int rank, const int size, const int overlap,
               char ***lines, int *nlines) {
    MPI_Offset filesize = 0;
    MPI_Offset share;
    MPI_Offset start;
    MPI_Offset end;        /* exclusive file offset of the last byte to read */
    MPI_Offset nread;
    MPI_Offset locstart;
    MPI_Offset locend;     /* exclusive index into chunk */
    char *chunk;

    /* figure out who reads what */
    MPI_File_get_size(*in, &filesize);
    share = filesize / size;
    start = rank * share;

    if (rank == size - 1) {
        /* the last rank takes everything up to end of file */
        end = filesize;
    } else {
        /* everyone else reads into the next rank's range, but never past EOF */
        end = start + share + overlap;
        if (end > filesize) end = filesize;
    }
    if (end < start) end = start;   /* guards against a negative overlap */
    nread = end - start;

    /* zero-filled so a short read can never expose uninitialised bytes */
    chunk = calloc((size_t)nread + 1, sizeof *chunk);
    if (chunk == NULL) {
        fprintf(stderr, "readlines: rank %d: out of memory\n", rank);
        abort();
    }

    /* everyone reads in their part (collective call, even for zero bytes) */
    if (MPI_File_read_at_all(*in, start, chunk, (int)nread, MPI_CHAR,
                             MPI_STATUS_IGNORE) != 0) {
        fprintf(stderr, "readlines: rank %d: read failed\n", rank);
        abort();
    }
    chunk[nread] = '\0';

    /* skip the partial first line: it belongs to the previous rank */
    locstart = 0;
    if (rank != 0) {
        while (locstart < nread && chunk[locstart] != '\n') locstart++;
        if (locstart < nread) locstart++;          /* step past the newline */
    }

    /* finish the line that straddles the boundary with the next rank */
    locend = nread;
    if (rank != size - 1) {
        locend = (share < nread) ? share : nread;  /* next rank's start offset */
        while (locend < nread && chunk[locend] != '\n') locend++;
        if (locend < nread) locend++;              /* keep the newline */
    }
    if (locend < locstart) locend = locstart;

    /* Now copy our actual data over into a new array, with no overlaps */
    const size_t datalen = (size_t)(locend - locstart);
    char *data = malloc(datalen + 1);
    if (data == NULL) {
        fprintf(stderr, "readlines: rank %d: out of memory\n", rank);
        abort();
    }
    memcpy(data, chunk + locstart, datalen);
    data[datalen] = '\0';
    free(chunk);

    /* count lines: one per newline, plus a final line lacking a newline */
    int count = 0;
    for (size_t i = 0; i < datalen; i++)
        if (data[i] == '\n') count++;
    if (datalen > 0 && data[datalen - 1] != '\n') count++;

    /* one spare slot so (*lines)[0] exists even when there are no lines */
    char **index = malloc(((size_t)count + 1) * sizeof *index);
    if (index == NULL) {
        fprintf(stderr, "readlines: rank %d: out of memory\n", rank);
        abort();
    }
    index[0] = data;

    /* point each entry at the start of a line and terminate it in place */
    int n = 0;
    char *p = data;
    char *const stop = data + datalen;
    while (p < stop && n < count) {
        index[n++] = p;
        char *nl = memchr(p, '\n', (size_t)(stop - p));
        if (nl == NULL) break;                     /* unterminated last line */
        *nl = '\0';
        p = nl + 1;
    }

    *lines  = index;
    *nlines = n;
}

/*
 * Parse two floats from each line and print them together with their sum.
 *
 * lines  : array of nlines NUL-terminated strings; NULL entries are skipped.
 * nlines : number of entries in lines.
 * rank   : rank number used as the prefix of every output line.
 *
 * Lines that do not contain two numbers are reported on stderr and skipped,
 * instead of printing uninitialised values.
 */
void processlines(char **lines, const int nlines, const int rank) {
    for (int i=0; i<nlines; i++) {
        float a, b;

        if (lines[i] == NULL)
            continue;

        /* sscanf reports how many fields it converted; both are required */
        if (sscanf(lines[i],"%f %f", &a, &b) != 2) {
            fprintf(stderr, "%d: skipping malformed line <%s>\n", rank, lines[i]);
            continue;
        }
        printf("%d: <%s>: %f + %f = %f\n", rank, lines[i], a, b, a+b);
    }
}

/*
 * Driver: every rank opens the file named on the command line, reads its
 * share of lines via readlines(), reports how many it got, and hands them to
 * processlines().
 *
 * Exit status: 0 on success, 1 on a usage error, 2 if the file cannot be
 * opened.
 */
int main(int argc, char **argv) {
    /* extra bytes each rank reads past its nominal share (passed to readlines) */
    const int overlap = 100;

    int rank, size;
    MPI_File in;
    char **lines;
    int nlines;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* only rank 0 prints diagnostics, so messages are not repeated per rank */
    const int is_root = (rank == 0);

    /* exactly one argument is expected: the input file name */
    if (argc != 2) {
        if (is_root)
            fprintf(stderr, "Usage: %s infilename\n", argv[0]);
        MPI_Finalize();
        exit(1);
    }

    if (MPI_File_open(MPI_COMM_WORLD, argv[1], MPI_MODE_RDONLY,
                      MPI_INFO_NULL, &in) != 0) {
        if (is_root)
            fprintf(stderr, "%s: Couldn't open file %s\n", argv[0], argv[1]);
        MPI_Finalize();
        exit(2);
    }

    readlines(&in, rank, size, overlap, &lines, &nlines);
    printf("Rank %d has %d lines\n", rank, nlines);
    processlines(lines, nlines, rank);

    /* the text (reached through lines[0]) and the pointer array are freed separately */
    free(lines[0]);
    free(lines);

    MPI_File_close(&in);
    MPI_Finalize();
    return 0;
}

$ mpirun -np 2 ./textio foo2.in 
Rank 0 has 4 lines
0: <45.87   13.22>: 45.869999 + 13.220000 = 59.090000
0: <45.71   13.27>: 45.709999 + 13.270000 = 58.980000
0: <45.78   13.21>: 45.779999 + 13.210000 = 58.989998
0: <45.67   13.1>: 45.669998 + 13.100000 = 58.769997
Rank 1 has 3 lines
1: <45.7    13.24>: 45.700001 + 13.240000 = 58.940002
1: <45.81   13.28>: 45.810001 + 13.280000 = 59.090000
1: <45.85   13.32>: 45.849998 + 13.320000 = 59.169998