C++ c++；多线程读取文件_C++_Multithreading_File_Fstream_Istream

C++ c++；多线程读取文件

c++ multithreading file

C++ c++；多线程读取文件,c++,multithreading,file,fstream,istream,C++,Multithreading,File,Fstream,Istream,我有一个大文件，超过8900万行。我想读取一个文件并将其转换为哈希表，然后进行一些计算问题是使用istream读取文件并将其传递到哈希表的速度太慢是否有可能使用更多线程读取文件？使用线程库或者我应该把锉刀切成小块，然后用一根线把每一块锉好吗散列函数不需要花很多时间来计算对于碰撞，我使用列表。这个表的数字是100万 // Adding_date_too_file.cpp : This file contains the 'main' function. Program ex

我有一个大文件，超过8900万行。我想读取一个文件并将其转换为哈希表，然后进行一些计算

问题是使用

istream

读取文件并将其传递到哈希表的速度太慢

是否有可能使用更多线程读取文件？使用线程库

或者我应该把文件切成小块，然后为每一块使用一个线程来读取吗？

散列函数不需要花很多时间来计算

对于哈希冲突，我使用链表。哈希表的大小是100万。

       // Adding_date_too_file.cpp : This file contains the 'main' function. Program execution begins and ends there.
    //

    #include "pch.h"
    #include <iostream>
    #include <string>
    #include "hash.h"
    #include <iostream>
    #include <fstream>

    using namespace std;
    int main()
    {

        hasho Hashy;
        string f1, f2, f3, f4, f5, f6, f7;
        bool is_first_line = true;
        fstream file_input;
        fstream  file_2_Collums;

        cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only two column\n which is going to be used for searching based on that file)" << flush;
        while (true)
        {

            string infilename;
            getline(cin, infilename);
            file_input.open(infilename.c_str());
            if (file_input)break;
            cout << "Invalid file. Please enter a valid input file name> " << flush;
        }



        cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only one column )" << flush;
        while (true)
        {

            string infilename;
            getline(cin, infilename);
            file_2_Collums.open(infilename.c_str());
            if (file_2_Collums)break;
            cout << "Invalid file. Please enter a valid input file name> " << flush;
        }
        //creating output file


        int * table;
        table = new int[2];

        int count_file_lines = 0;
        int line_counter_inventors = 0;

        if (file_input.is_open())
        {

            while (!file_input.eof())
            {
                if (is_first_line == true) {
                    getline(file_input, f1, '\n');
                    is_first_line = false;
                }


                getline(file_input, f1, '\t');// patent id

                getline(file_input, f2, '\t');// patent id

                getline(file_input, f3, '\t');// patent id

                getline(file_input, f3, '\t');// patent id

                getline(file_input, f6, '\t');// patent id
                getline(file_input, f3, '\n');//date


                //cout << "adding these items " << f1 << '\t' << f6 << endl;

                Hashy.AddItem(f2, f6);
            cout << count_file_lines << endl;
                count_file_lines++;
            //  cout << f2 << '\t' << f6 << endl;
            }

        }

        int lines_2 = 0;

            if (file_2_Collums.is_open())
            {
                Hashy.openOutputFile();
                while (!file_2_Collums.eof())
                {

                    getline(file_2_Collums, f1, '\t');//patent_id

                    getline(file_2_Collums, f4, '\n');//assignee_id
                    //cout << f1 << endl;


                    Hashy.FindDateId(f1, f4);

                    lines_2++;
                }

            }





    system("pause");
    return 0;}

//Adding_date_too_file.cpp：此文件包含“main”函数。程序执行从那里开始并结束。
//
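#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include <iostream>
#include <fstream>
using namespace std;
int main()
{
hasho Hashy;
string f1, f2, f3, f4, f5, f6, f7;
bool is_first_line = true;
fstream file_input;
fstream file_2_Collums;
cout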
我有一个超过8900万行的大文件
如果您考虑使用多个线程来处理它，您可能不应该这样做。您应该解释这个巨大的文件包含什么（什么样的数据：基因组学、时间序列等等）及其大小（以GB为单位）。您是只处理同一个文件一次，还是多次？处理它需要多少时间（例如用 time 命令测量）？仅仅统计行数（例如用 wc -l）又需要多少时间？
一种可能是将该文件拆分为几个较小的文件（例如，使用 split 命令）
我不知道这是否会对你有帮助（可能不会，除非你运行了几次程序来读取这些文件）
另一种可能是对文件进行两次传递。第一次传递将计算行数，并可能记住其中一些行的起始偏移量（例如，每1024行）。然后，您可以在第二次传递中并行处理文件（通过重用记住的偏移量）
顺便说一句，如果您的大文件太大，无法保存在页面缓存（page cache）中，那么您的问题是IO受限的（瓶颈是物理磁盘硬件），并且您无法通过尝试并行化来获得任何速度提升（即使将其拆分为较小的文件）
一种可能是一次性（缓慢地）读取和解析您的大型文件，并用其数据填充某个数据库（可能是一个 SQLite 数据库）。然后（如果您多次处理该数据）您就可以受益于访问该数据库（而不是该文件）
关于哈希表，考虑使用标准C++容器（例如 std::unordered_map）。
另外，我们不知道那个大文件是什么，它包含什么，以及如何处理它。
分析（profile）你的代码，看看哪里慢。在这里发布你的代码。也就是说，如果你有一个行长度可变的文本文件，很难使用线程同时读取文件的不同行。你可以让每个线程读取一个固定大小的块，然后将这些块拼接成一段连贯的文本。还要注意的是，在旋转盘片（而不是SSD）上使用线程实际上可能会降低性能，因为更多的线程试图同时使用磁盘，需要更多地移动读/写磁头。现在，您只需要对代码进行性能分析。如果您不熟悉该术语，请查一下性能分析（profiling）的含义。顺便说一句，您的代码不适合在此处发布，因为它应该是最小的且独立的，即一个最小可复现示例。听起来是学习如何使用数据库的时候了。您建议的这些小文件是否可以由多个线程读取并插入哈希表？拆分文件的意义是什么？硬盘驱动器的I/O速度（如SSD为0.5 GB/s）不会因为拆分文件而改变。我的想法是拆分文件并让更多线程读取（例如，每个线程读取400万行），因此我认为由于更多线程正在处理该文件，读取时间将减少。但我不确定。该文件包含8900万行和9列，我将从该文件中提取3列并存储到哈希表。然后我使用另一个包含3列的文件，我使用第一列将其与较大文件的列相匹配，然后在比较后创建一个新文件。这就是我使用HashTable的原因。不要评论此答案，而是编辑您的问题。请解释您是如何获得该文件的？您只处理它一次，还是处理多次？为什么不能使用std::unordered_map
或std::map？
  #include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"

#include "hash.h"
#include <fstream>
using namespace std;
static ofstream output_file;

// Builds the table: every bucket receives its own heap-allocated placeholder
// node whose pattent_id and date both hold the sentinel text "empty" and
// whose chain pointer is null.
hasho::hasho()
{
    int bucket = 0;
    while (bucket < tableSize) {
        item* placeholder = new item;
        HashTable[bucket] = placeholder;

        placeholder->pattent_id = "empty";
        placeholder->date = "empty";
        placeholder->next = NULL;

        ++bucket;
    }
}

void hasho::openOutputFile() {

    cout << "Please give the name of the output file: \n(The file should end with the format type (txt,csv etc.)) " << flush;
    while (true)
    {

        string infilename;
        getline(cin, infilename);
        output_file.open(infilename.c_str(), fstream::out);
        break;
    }
}

int hasho::NumberOfItemsInIndex(int index) {

    int count = 0;
    if (HashTable[index]->date == "empty") {

        return count;
    }
    else {

        count++;
        item* ptr = HashTable[index];
        while (ptr->next != NULL) {

            count++;
            ptr = ptr->next;

        }
    }

    return count;

}

void hasho::PrintTable() {

    int number;

    for (int i = 0; i < tableSize; i++) {


        number = NumberOfItemsInIndex(i);
        cout << "---------------------------------------\n";
        cout << "index= " << i << endl;

        cout << HashTable[i]->pattent_id << endl;
        cout << HashTable[i]->date << endl;
        cout << "# of items= " << number << endl;
        cout << "---------------------------------------\n";
    }

}


void hasho::PrintItemsInIndex(int index) {


    item* ptr = HashTable[index];

    if (ptr->date == "empty") {
        cout << "index  = " << index << " is empty." << endl;
    }
    else {
        cout << "index = " << index << " contains the following items\n";
        while (ptr != NULL) {



            cout << "-----------" << endl;
            cout << ptr->date << endl;
            cout << ptr->pattent_id << endl;
            cout << "-----------" << endl;

            ptr = ptr->next;

        }
    }

}



// Stores the pair (pattend_id, date) in the bucket selected by Hash.
//
// If the bucket's first node still holds the "empty" sentinel it is filled
// in place. Otherwise a new node is linked in directly behind the first
// node. Insertion therefore takes constant time; walking to the end of the
// chain on every insert makes loading cost grow with the square of the
// chain length, which is prohibitive for tens of millions of rows spread
// over tableSize buckets.
//
// Consequence: apart from the first node, the records of a bucket are
// chained newest-first rather than in insertion order.
//
// NOTE(review): a real id equal to the text "empty" cannot be told apart
// from an unused first node.
void hasho::AddItem(string pattend_id, string date)
{
    item* head = HashTable[Hash(pattend_id)];

    if (head->pattent_id == "empty")
    {
        head->pattent_id = pattend_id;
        head->date = date;
        return;
    }

    item* n = new item;
    n->pattent_id = pattend_id;
    n->date = date;

    // Splice the new node in between the first node and the rest of the chain.
    n->next = head->next;
    head->next = n;
}

// Looks up `pattend_id` and, for every stored record with that id, appends
// one line "<pattend_id>\t<assignee_id1>\t<date>" to the file-level
// `output_file` stream.
//
// The record terminator is written after the date. Writing it between the
// assignee and the date would push each date onto the following record's
// line. std::endl is kept on purpose so every record is flushed to disk as
// soon as it is produced.
//
// A short per-call summary ("found 0|1", "not found 0|1") is printed to
// stdout.
void hasho::FindDateId(string pattend_id, string assignee_id1) {

    int found = 0;

    for (const item* ptr = HashTable[Hash(pattend_id)]; ptr != NULL; ptr = ptr->next) {
        if (ptr->pattent_id != pattend_id) {
            continue;
        }

        output_file << pattend_id << "\t" << assignee_id1 << "\t" << ptr->date << endl;
        found = 1;
    }

    const int nfound = (found == 0) ? 1 : 0;

    cout << "found " << found << '\n';
    cout << "not found " << nfound << '\n';
    cout << '\n';
}

// Maps `key` to a bucket index in [0, tableSize).
//
// Each character code is multiplied by its 1-based position in the key, the
// products are accumulated in unsigned arithmetic (which wraps on overflow),
// and the sum is reduced modulo tableSize.
int hasho::Hash(string key)
{
    unsigned int sum = 0;
    unsigned int position = 0;

    while (position < key.length()) {
        const unsigned int weight = position + 1;
        sum += (int)key[position] * weight;
        ++position;
    }

    const unsigned int bucket = sum % tableSize;
    return bucket;
}

#pragma once

#include "pch.h"
#include <iostream>
#include <string>
//#include "hash.cpp"
using namespace std;

#pragma comment(linker, "/STACK:7000000")
#pragma comment(linker, "/HEAP:7000000")

#ifndef  HASH_H
#define HASH_H


// Fixed-size hash table with separate chaining that maps a patent id string
// to a date string.
//
// Each of the tableSize buckets points to a singly linked chain of `item`
// nodes. The constructor gives every bucket a placeholder first node marked
// with the text "empty"; AddItem fills that node first and chains further
// nodes behind it.
//
// The table owns all of its nodes: the destructor releases them, and copying
// is disabled because a member-wise copy would leave two tables deleting the
// same nodes.
//
// Standard names are spelled with std:: so the class does not depend on a
// using-directive being in effect where the header is included.
class hasho {
private:
    static const int tableSize = 300003;   // number of buckets

    struct item {
        std::string pattent_id;   // key; "empty" marks an unused first node
        std::string date;         // value stored for the key
        item* next;               // next node in the same bucket, or null
    };

    item* HashTable[tableSize];   // first node of every bucket

    // Declared but intentionally not defined: hasho is not copyable.
    hasho(const hasho&);
    hasho& operator=(const hasho&);

public:
    // Allocates the placeholder first node of every bucket.
    hasho();

    // Frees every node of every chain, placeholders included.
    ~hasho() {
        for (int i = 0; i < tableSize; i++) {
            item* node = HashTable[i];
            while (node) {
                item* following = node->next;
                delete node;
                node = following;
            }
        }
    }

    // Returns the bucket index for `key`.
    int Hash(std::string key);
    // Stores the pair (pattend_id, date).
    void AddItem(std::string pattend_id, std::string date);
    // Returns the number of records stored in bucket `index`.
    int NumberOfItemsInIndex(int index);
    // Prints a one-bucket-per-entry summary of the whole table to stdout.
    void PrintTable();
    // Prints every record stored in bucket `index` to stdout.
    void PrintItemsInIndex(int index);
    // Writes every record stored for `pattent_id` to the output file.
    void FindDateId(std::string pattent_id, std::string assgnee_id);
    // Prompts for the output file name and opens the output file.
    void openOutputFile();
};


#endif // ! HASH_H