C++ 使用gzbuffer快速读取gzip文件，然后逐行拆分内容_C++_Zlib

C++ 使用gzbuffer快速读取gzip文件，然后逐行拆分内容

c++

C++ 使用gzbuffer快速读取gzip文件，然后逐行拆分内容,c++,zlib,C++,Zlib,我想要一个函数，它接受一个文件名和一个字符串向量作为输入，并通过高效地读取文件来逐行填充向量。以下是我迄今为止所做的： /** \brief Read the whole file in a vector of lines */ int readFile( const string & pathToFile, vector<string> & lines) { gzFile stream; openFile(pathToF

我想要一个函数，它接受一个文件名和一个字符串向量作为输入，并通过高效地读取文件来逐行填充向量。以下是我迄今为止所做的：

/** \brief Read the whole file in a vector of lines
 */
  int
  readFile(
    const string & pathToFile,
    vector<string> & lines)
  {
    gzFile stream;
    openFile(pathToFile, stream, "rb");

    int errnum;
    const char * error_msg = NULL;

    size_t nb_bytes_to_read = 256000; // 8192 is default for gzbuffer
    if(gzbuffer(stream, nb_bytes_to_read) == -1){
      error_msg = gzerror(stream, &errnum);
      if(errnum != Z_OK){
        cerr << "ERROR: gzbuffer failed with " << nb_bytes_to_read
             << " bytes" << endl;
        cerr << error_msg << endl;
        exit(EXIT_FAILURE);
      }
    }

    size_t buf_len = nb_bytes_to_read;
    char * buf = (char *) malloc(buf_len);
    if(buf == NULL){
      cerr << "ERROR: can't allocate " << nb_bytes_to_read
           << " bytes" << endl;
      exit(EXIT_FAILURE);
    }

    size_t nb_bytes_read = 0, tot_nb_bytes_read = 0;
    while(! gzeof(stream)){
      nb_bytes_read = gzread(stream, buf + tot_nb_bytes_read,
                             nb_bytes_to_read);
      tot_nb_bytes_read += nb_bytes_read;
      if(nb_bytes_read < nb_bytes_to_read && ! gzeof(stream)){
        error_msg = gzerror(stream, &errnum);
        if(errnum != Z_OK){
          cerr << "ERROR: gzread failed on " << pathToFile << endl;
          cerr << error_msg << endl;
          exit(EXIT_FAILURE);
        }
      }
      if(tot_nb_bytes_read == buf_len){
        buf_len += nb_bytes_to_read;
        buf = (char*) realloc(buf, buf_len);
        if(buf == NULL){
          cerr << "ERROR: can't allocate " << nb_bytes_to_read
               << " bytes" << endl;
          exit(EXIT_FAILURE);
        }
      }
    }

    closeFile(pathToFile, stream);

    lines = split(buf, "\n", lines);

    free(buf);

    return 0;
  }

（由于我不是专业程序员，欢迎您提供任何其他建议！）

strtok（）

对以null结尾的字符串进行操作。您提供了一个从文本文件读取的缓冲区。没有空值。因此，

strtok（）

正在读取超过缓冲区末尾的数据，直到它在内存中意外找到零为止

顺便说一下，

strtok（）

有问题，甚至不可重入。阅读

strtok

和

strep

的手册页，谢谢，我现在在

while

循环之后将“\0”分配给

buf

的第一个元素，它工作得很好。此外，由于指定的

strep

的可移植性不如

strtok

，因此我暂时使用后者。或者您可以使用两者都不使用并编写自己的代码来扫描字符。我不确定是否能够很好地做到这一点，尤其是当我看到to-strtok的外观时。但也许我错了？

  void
  openFile(
    const string & pathToFile,
    gzFile & fileStream,
    const char * mode)
  {
    fileStream = gzopen(pathToFile.c_str(), mode);
    if(fileStream == NULL){
      cerr << "ERROR: can't open file " << pathToFile
           << " with mode " << *mode
           << " (errno=" << errno << ")" << endl;
      exit(EXIT_FAILURE);
    }
  }

  void
  closeFile(
    const string & pathToFile,
    gzFile & fileStream)
  {
    int ret = gzclose(fileStream);
    if(ret != Z_OK){
      cerr << "ERROR: can't close the file " << pathToFile
           << ", gzclose() returned " << ret << endl;
      exit(EXIT_FAILURE);
    }
  }

  vector<string> &
  split(
    char * buf,
    const char * delim,
    vector<string> & tokens)
  {
    tokens.clear();
    char * pch;
    pch = strtok(buf, delim);
    while(pch != NULL){
      tokens.push_back(string(pch));
      pch = strtok(NULL, delim);
    }
    return tokens;
  }