C++ 如何加速包含图(graph)数据的文本文件的 IO/解析

C++ 如何加速包含图形数据的文本文件的io/解析,c++,parsing,io,ifstream,C++,Parsing,Io,Ifstream,我的任务是实现A*算法,但是我从文件中读取图形节点数据的速度非常慢,大约需要4分钟,我想知道是否有办法显著加快速度 kart对象只是一个节点向量,这个问题的焦点是io ifstream ifs_edge("<my path>", ios::binary | ios::ate); auto edges_size = ifs_edge.tellg(); ifs_edge.seekg(ios::beg); string str_edges(edges_size, 0

我的任务是实现 A* 算法,但是我从文件中读取图(graph)节点数据的速度非常慢,大约需要 4 分钟,我想知道是否有办法显著加快速度

kart对象只是一个节点向量,这个问题的焦点是io

// Slurp the whole file into one string, then tokenize it character by character.
// Opening with ios::ate positions the stream at the end so tellg() yields the file size.
ifstream ifs_edge("<my path>", ios::binary | ios::ate);
    auto edges_size = ifs_edge.tellg();
    // NOTE(review): ios::beg is a seek direction, not a position; presumably
    // seekg(0, ios::beg) was intended -- confirm.
    ifs_edge.seekg(ios::beg);
    string str_edges(edges_size, 0);
    ifs_edge.read(&str_edges[0], edges_size);
    cout << edges_size << endl;

    // counter: index of the current token within the line.
    // Token 1 is parsed as lon, token 2 as lat; token 0 is skipped
    // (presumably the node index -- verify against the file format).
    int counter = 0;
    double lon = 0, lat = 0;
    string substr;
    for (char c : str_edges)
    {
            // 0x20 is ' ' and 0x0A is '\n'; every other character extends the current token.
            if (c != 0x20 && c != 0x0A)
                    substr += c;
            else
            {
                    // A separator ends the token; empty tokens (runs of separators) are ignored.
                    if (substr.size())
                    {
                            if (counter == 1)
                                    lon = stod(substr);
                            if (counter == 2)
                                    lat = stod(substr);
                            substr.clear();
                            counter++;
                    }
            }
            // End of line: reset the token index and emit a node.
            if (c == 0x0A)
            {
                    counter = 0;
                    // NOTE(review): lon/lat are not reset per line, so a short line reuses the
                    // previous values; a coordinate of exactly 0 suppresses the node; and a final
                    // line without a trailing '\n' is never pushed.
                    if (lon && lat)
                    {
                            astar::kart_node kn;
                            kn.c = { lon, lat };
                            kart.push_back(kn);
                    }
            }
    }
编辑2:
已有回答提供了更快的解决方案,这很好,但出于某种奇怪的原因,它们在我这里仍然太慢,我也在几台计算机上试过。谜团依然存在,我愿意接受更多的建议;尽管我的解决方案有所改进,但这个问题暂时还不会被标记为已解决

您正在代码中执行大量字符串操作和单个转换。所有这些都可以避免,经度和纬度可以直接读入浮点变量。在这种情况下,看起来整个事情可以简化为

// Parse the file with formatted extraction instead of manual string handling.
// Assumes the file starts with the record count, followed by "index lon lat"
// records -- TODO confirm against the actual file.
ifstream ifs_edge("<my path>");
int eater; // receives (and discards) the leading index of each lon,lat record
double lon, lat;
std::size_t size;
ifs_edge >> size; //get size for vector
std::vector<astar::kart_node> kart;
kart.reserve(size); // NOTE(review): size is not validated; a failed or bogus read feeds reserve() directly
// The loop ends at the first record that fails to parse (including end of file).
while(ifs_edge >> eater >> lon >> lat)
    kart.emplace_back(lon, lat); // assumes kart_node is constructible from (double, double) -- TODO confirm

现在我们为向量预留了空间,以避免重复的内存分配和拷贝;不再有字符串转换;每个 kart_node 都是在向量中就地构造的,而不是先构造再复制。

您在代码中进行了大量的字符串操作和单个转换。所有这些都可以避免,经度和纬度可以直接读入浮点变量。在这种情况下,看起来整个事情可以简化为

// Parse the file with formatted extraction instead of manual string handling.
// Assumes the file starts with the record count, followed by "index lon lat"
// records -- TODO confirm against the actual file.
ifstream ifs_edge("<my path>");
int eater; // receives (and discards) the leading index of each lon,lat record
double lon, lat;
std::size_t size;
ifs_edge >> size; //get size for vector
std::vector<astar::kart_node> kart;
kart.reserve(size); // NOTE(review): size is not validated; a failed or bogus read feeds reserve() directly
// The loop ends at the first record that fails to parse (including end of file).
while(ifs_edge >> eater >> lon >> lat)
    kart.emplace_back(lon, lat); // assumes kart_node is constructible from (double, double) -- TODO confirm

现在我们为向量预留了空间,以避免重复的内存分配和拷贝;不再有字符串转换;每个 kart_node 都是在向量中就地构造的,而不是先构造再复制。

由于数据是标准化的,所以可以使用fstream或fscanf

以下是输出: fstream大约需要7.2秒 fscanf大约需要1.6秒

Reading from data.txt
1000000 entry read!
2000000 entry read!
3000000 entry read!
last entry: -116.239,89.5744
read_file_cpp 7.19992s
1000000 entry read!
2000000 entry read!
3000000 entry read!
last entry: -116.239,89.5744
read_file_c 1.55708s
下面是用 clang++ -O3 编译的代码

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cmath>
#include <ctime>
#include <cstdio>
using namespace std;

/**
 * Reads "index lon lat" records from a whitespace-separated text file using
 * iostream extraction and appends each (lon, lat) pair to *kart.
 *
 * The first token of the file is the declared record count n. At most n
 * records are read. Nothing is appended if the file cannot be opened or the
 * count is missing or not positive, and reading stops at the first record that
 * fails to parse, so only fully parsed records ever reach *kart.
 *
 * @param path  File to read.
 * @param kart  Output vector; parsed pairs are appended. Must not be null.
 */
void read_file_cpp(const string& path, vector<pair<double, double> >* kart) {
  ifstream in(path);
  int n = 0;
  if (!(in >> n) || n <= 0)
    return;  // unopened file, missing/invalid count, or nothing to read

  // Pre-size for the declared count to avoid repeated reallocation, but cap it:
  // the count comes from the file and must not be able to force a huge allocation.
  const int kMaxReserve = 1 << 22;
  kart->reserve(kart->size() + static_cast<size_t>(n < kMaxReserve ? n : kMaxReserve));

  int index;
  double lon, lat;
  for (int i = 1; i <= n; ++i) {
    // Stop on a truncated or malformed record instead of pushing stale values.
    if (!(in >> index >> lon >> lat))
      break;
    kart->push_back(make_pair(lon, lat));
    if (i % 1000000 == 0)
      cout << i << " entry read!" << endl;
  }
  // `in` is closed by its destructor.
}

/**
 * Reads "index lon lat" records from a whitespace-separated text file using
 * C stdio (fscanf) and appends each (lon, lat) pair to *kart.
 *
 * The first token of the file is the declared record count n. At most n
 * records are read. Nothing is appended if the file cannot be opened or the
 * count is missing or not positive, and reading stops at the first record that
 * fails to parse, so only fully parsed records ever reach *kart.
 *
 * @param path  File to read.
 * @param kart  Output vector; parsed pairs are appended. Must not be null.
 */
void read_file_c(const string& path, vector<pair<double, double> >* kart) {
  FILE* f = fopen(path.c_str(), "r");
  if (f == nullptr)
    return;  // fscanf/fclose on a null FILE* would be undefined behaviour

  int n = 0;
  // fscanf returns the number of converted fields; anything else is a parse failure.
  if (fscanf(f, "%d", &n) == 1 && n > 0) {
    // Pre-size for the declared count to avoid repeated reallocation, but cap it:
    // the count comes from the file and must not be able to force a huge allocation.
    const int kMaxReserve = 1 << 22;
    kart->reserve(kart->size() + static_cast<size_t>(n < kMaxReserve ? n : kMaxReserve));

    int index;
    double lon, lat;
    for (int i = 1; i <= n; ++i) {
      // Stop on a truncated or malformed record instead of pushing stale values.
      if (fscanf(f, "%d %lf %lf", &index, &lon, &lat) != 3)
        break;
      kart->push_back(make_pair(lon, lat));
      if (i % 1000000 == 0)
        cout << i << " entry read!" << endl;
    }
  }

  fclose(f);
}

/**
 * Writes a benchmark input file: the record count n on the first line,
 * followed by n lines of "index lon lat" with pseudo-random coordinates
 * drawn from rand().
 *
 * If the file cannot be opened for writing, an error is reported on cerr and
 * nothing is written.
 *
 * @param path  File to create or overwrite.
 * @param n     Number of records to write; n <= 0 writes only the count line.
 */
void generate_data(const string& path, int n) {

  cout << "generating data... " << endl;

  ofstream out(path);
  if (!out) {
    cerr << "cannot open " << path << " for writing" << endl;
    return;
  }

  // '\n' instead of endl: endl flushes the stream on every record, which is
  // needlessly slow when writing millions of lines.
  out << n << '\n';
  for (int i = 0; i < n; ++i) {
    // Both values are scaled to [-180, 180].
    // NOTE(review): latitude is conventionally limited to [-90, 90]; harmless
    // for a parsing benchmark, but confirm before reusing this as real test data.
    const float lon = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) * 360 - 180;
    const float lat = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) * 360 - 180;
    out << i << ' ' << lon << ' ' << lat << '\n';
  }
  out.close();

  cout << "done !" << endl;
}


// Signature shared by the two readers being compared.
typedef void (*reader_fn)(const string&, vector<pair<double, double> >*);

// Runs one reader on `path` into a freshly cleared *data, then prints the last
// parsed entry (or a notice if nothing was read) and the elapsed time.
// Timing uses clock(), i.e. processor time consumed by the program, not wall time.
static void run_benchmark(const char* label, reader_fn reader, const string& path,
                          vector<pair<double, double> >* data) {
  data->clear();

  const clock_t begin = clock();
  reader(path, data);
  const clock_t end = clock();
  const double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

  // back() on an empty vector is undefined behaviour, so guard the
  // missing-file / empty-file case explicitly.
  if (data->empty())
    cout << "no entries read from " << path << endl;
  else
    cout << "last entry: " << data->back().first << "," << data->back().second << endl;

  cout << label << " " << elapsed_secs << "s" << endl;
}

// Benchmarks the iostream-based reader against the fscanf-based reader on data.txt.
int main() {
  const string path = "data.txt";
  vector<pair<double, double> > data;

  // generate_data(path, 3901630);
  run_benchmark("read_file_cpp", read_file_cpp, path, &data);
  run_benchmark("read_file_c", read_file_c, path, &data);
}

因为您的数据是标准化的,所以可以使用fstream或fscanf

以下是输出: fstream大约需要7.2秒 fscanf大约需要1.6秒

Reading from data.txt
1000000 entry read!
2000000 entry read!
3000000 entry read!
last entry: -116.239,89.5744
read_file_cpp 7.19992s
1000000 entry read!
2000000 entry read!
3000000 entry read!
last entry: -116.239,89.5744
read_file_c 1.55708s
下面是用 clang++ -O3 编译的代码

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cmath>
#include <ctime>
#include <cstdio>
using namespace std;

/**
 * Reads "index lon lat" records from a whitespace-separated text file using
 * iostream extraction and appends each (lon, lat) pair to *kart.
 *
 * The first token of the file is the declared record count n. At most n
 * records are read. Nothing is appended if the file cannot be opened or the
 * count is missing or not positive, and reading stops at the first record that
 * fails to parse, so only fully parsed records ever reach *kart.
 *
 * @param path  File to read.
 * @param kart  Output vector; parsed pairs are appended. Must not be null.
 */
void read_file_cpp(const string& path, vector<pair<double, double> >* kart) {
  ifstream in(path);
  int n = 0;
  if (!(in >> n) || n <= 0)
    return;  // unopened file, missing/invalid count, or nothing to read

  // Pre-size for the declared count to avoid repeated reallocation, but cap it:
  // the count comes from the file and must not be able to force a huge allocation.
  const int kMaxReserve = 1 << 22;
  kart->reserve(kart->size() + static_cast<size_t>(n < kMaxReserve ? n : kMaxReserve));

  int index;
  double lon, lat;
  for (int i = 1; i <= n; ++i) {
    // Stop on a truncated or malformed record instead of pushing stale values.
    if (!(in >> index >> lon >> lat))
      break;
    kart->push_back(make_pair(lon, lat));
    if (i % 1000000 == 0)
      cout << i << " entry read!" << endl;
  }
  // `in` is closed by its destructor.
}

/**
 * Reads "index lon lat" records from a whitespace-separated text file using
 * C stdio (fscanf) and appends each (lon, lat) pair to *kart.
 *
 * The first token of the file is the declared record count n. At most n
 * records are read. Nothing is appended if the file cannot be opened or the
 * count is missing or not positive, and reading stops at the first record that
 * fails to parse, so only fully parsed records ever reach *kart.
 *
 * @param path  File to read.
 * @param kart  Output vector; parsed pairs are appended. Must not be null.
 */
void read_file_c(const string& path, vector<pair<double, double> >* kart) {
  FILE* f = fopen(path.c_str(), "r");
  if (f == nullptr)
    return;  // fscanf/fclose on a null FILE* would be undefined behaviour

  int n = 0;
  // fscanf returns the number of converted fields; anything else is a parse failure.
  if (fscanf(f, "%d", &n) == 1 && n > 0) {
    // Pre-size for the declared count to avoid repeated reallocation, but cap it:
    // the count comes from the file and must not be able to force a huge allocation.
    const int kMaxReserve = 1 << 22;
    kart->reserve(kart->size() + static_cast<size_t>(n < kMaxReserve ? n : kMaxReserve));

    int index;
    double lon, lat;
    for (int i = 1; i <= n; ++i) {
      // Stop on a truncated or malformed record instead of pushing stale values.
      if (fscanf(f, "%d %lf %lf", &index, &lon, &lat) != 3)
        break;
      kart->push_back(make_pair(lon, lat));
      if (i % 1000000 == 0)
        cout << i << " entry read!" << endl;
    }
  }

  fclose(f);
}

/**
 * Writes a benchmark input file: the record count n on the first line,
 * followed by n lines of "index lon lat" with pseudo-random coordinates
 * drawn from rand().
 *
 * If the file cannot be opened for writing, an error is reported on cerr and
 * nothing is written.
 *
 * @param path  File to create or overwrite.
 * @param n     Number of records to write; n <= 0 writes only the count line.
 */
void generate_data(const string& path, int n) {

  cout << "generating data... " << endl;

  ofstream out(path);
  if (!out) {
    cerr << "cannot open " << path << " for writing" << endl;
    return;
  }

  // '\n' instead of endl: endl flushes the stream on every record, which is
  // needlessly slow when writing millions of lines.
  out << n << '\n';
  for (int i = 0; i < n; ++i) {
    // Both values are scaled to [-180, 180].
    // NOTE(review): latitude is conventionally limited to [-90, 90]; harmless
    // for a parsing benchmark, but confirm before reusing this as real test data.
    const float lon = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) * 360 - 180;
    const float lat = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) * 360 - 180;
    out << i << ' ' << lon << ' ' << lat << '\n';
  }
  out.close();

  cout << "done !" << endl;
}


// Signature shared by the two readers being compared.
typedef void (*reader_fn)(const string&, vector<pair<double, double> >*);

// Runs one reader on `path` into a freshly cleared *data, then prints the last
// parsed entry (or a notice if nothing was read) and the elapsed time.
// Timing uses clock(), i.e. processor time consumed by the program, not wall time.
static void run_benchmark(const char* label, reader_fn reader, const string& path,
                          vector<pair<double, double> >* data) {
  data->clear();

  const clock_t begin = clock();
  reader(path, data);
  const clock_t end = clock();
  const double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

  // back() on an empty vector is undefined behaviour, so guard the
  // missing-file / empty-file case explicitly.
  if (data->empty())
    cout << "no entries read from " << path << endl;
  else
    cout << "last entry: " << data->back().first << "," << data->back().second << endl;

  cout << label << " " << elapsed_secs << "s" << endl;
}

// Benchmarks the iostream-based reader against the fscanf-based reader on data.txt.
int main() {
  const string path = "data.txt";
  vector<pair<double, double> > data;

  // generate_data(path, 3901630);
  run_benchmark("read_file_cpp", read_file_cpp, path, &data);
  run_benchmark("read_file_c", read_file_c, path, &data);
}


你能显示文件的外观吗?当然,我现在添加了。你能显示文件的外观吗?当然,我现在添加了。提示。如果只生成一行数据,这将破坏整个读取过程!您应该使它更健壮,以防止攻击等等!我得到了一个例外,std::length_error,我还注意到该文件的双精度和索引之间实际上有一些过大的间距,这有关系吗?@Jontahan间距不应该有关系,因为>>忽略了空白。@ThomasRoskop输入验证留给OP。为了通用性和简洁性,这里省略了它。@NathanOliver:没关系!暗示如果只生成一行数据,这将破坏整个读取过程!您应该使它更健壮,以防止攻击等等!我得到了一个例外,std::length_error,我还注意到该文件的双精度和索引之间实际上有一些过大的间距,这有关系吗?@Jontahan间距不应该有关系,因为>>忽略了空白。@ThomasRoskop输入验证留给OP。为了通用性和简洁性,这里省略了它。@NathanOliver:没关系!所以你的read_file_c是我迄今为止尝试过的最快的函数,但它仍然使用100s.read_file_cpp 94.965s,字面上是相同的代码,我也使用了你的文件生成器。在这种情况下,问题可能是你的,IO不再是瓶颈。所以你的read_file_c是我迄今为止尝试过的最快的函数,但它仍然使用100s.read_file_cpp 94.965s,字面上是相同的代码,我也使用了您的文件生成器。在这种情况下,问题可能是您的,IO不再是瓶颈。