C++ 按时间顺序合并N个日志文件

C++ 按时间顺序合并N个日志文件,c++,algorithm,external-sorting,C++,Algorithm,External Sorting,我有N个不同的日志文件,它们来自我们设备上运行的N个不同的服务。我想将N个文件合并成一个文件,并保持时间顺序。文件大小可以从几KB到GB不等 N个日志文件的格式相同,如下所示: ********** LOGGING SESSION STARTED ************ * Hmsoa Version: 2.4.0.12 * Exe Path: c:\program files (x86)\silicon biosystems\deparray300a_driver\deparray300a

我有N个不同的日志文件,它们来自我们设备上运行的N个不同的服务。我想将N个文件合并成一个文件,并保持时间顺序。文件大小可以从几KB到GB不等

N个日志文件的格式相同,如下所示:

**********  LOGGING SESSION STARTED ************
* Hmsoa Version: 2.4.0.12
* Exe Path: c:\program files (x86)\silicon biosystems\deparray300a_driver\deparray300a_driver.exe
* Exe Version: 1.6.0.154
************************************************


TIME = 2017/02/01 11:12:12,180 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'Connect'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'Connect'->Exit=0;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CCisProxyLocal CONNECT - ok'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CRecoveryAxesProxyLocal CONNECT - ok'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'CAmplifierProxyLocalV3 CONNECT - ok'->Enter;
TIME = 2017/02/01 11:12:12,196 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'SYSTEM_DIAGNOSIS_GET'->Enter;
TIME = 2017/02/01 11:12:12,211 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'SYSTEM_DIAGNOSIS_GET'->Exit=0;
TIME = 2017/02/01 11:12:12,211 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'LBL_SQUARE_SET'->Enter;
TIME = 2017/02/01 11:12:12,219 ; THID = 4924; CAT = ; LVL = 1000; LOG = API 'LBL_SQUARE_SET'->Exit=0;
因为我已经有N个不同的文件,所以到目前为止我所做的是应用一个外部排序算法,为每个文件读取一行:

#include "stdafx.h"
#include "boost/regex.hpp"
#include "boost/lexical_cast.hpp"
#include "boost\filesystem.hpp"
#include <string>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <sstream>
#include <climits>
#include <ctime>
namespace fs = boost::filesystem;

// Matches the start of a log line: either a "TIME = YYYY/MM/DD " prefix or a
// leading '@', followed by a run of digits, ':', '.' and ',' that is captured
// as group 1 (the time-of-day text handed to time2Milleseconds).
static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))");
// Matches a file name that starts with digits, '-' or '_' and captures the
// following one or two words (optionally separated by one whitespace) as
// group 1; main() stores that capture as the label of each input file.
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))");
// Directory that main() scans for input logs and where it writes TestLog.txt.
static const std::string path("E:\\2017-02-01"); 
//static const std::string path("E:\\TestLog");

/// Converts a time-of-day string "HH:MM:SS[,mmm]" into milliseconds since midnight.
///
/// The hour, minute and second fields are mandatory and separated by ':'.
/// The millisecond field is optional and may be introduced by ',' or '.'
/// (both characters are accepted by the timestamp regex); when it is absent
/// it counts as 0.
///
/// @param time  Text holding the time of day, e.g. "11:12:12,180".
/// @return      Milliseconds since midnight, or 0 when fewer than three
///              numeric fields could be parsed. Every path returns a
///              well-defined value.
unsigned long time2Milleseconds(const std::string & time)
{
    // parts[0..3] = hours, minutes, seconds, milliseconds (all default to 0).
    unsigned long parts[4] = { 0UL, 0UL, 0UL, 0UL };
    std::string::size_type pos = 0;
    int parsed = 0;

    while (parsed < 4)
    {
        // Every field must start with a decimal digit.
        if (pos >= time.size() || time[pos] < '0' || time[pos] > '9')
            break;

        unsigned long value = 0UL;
        for (; pos < time.size() && time[pos] >= '0' && time[pos] <= '9'; ++pos)
            value = value * 10UL + static_cast<unsigned long>(time[pos] - '0');
        parts[parsed++] = value;

        if (parsed == 4 || pos >= time.size())
            break;

        // ':' separates h/m/s; ',' or '.' introduces the milliseconds.
        const char sep = time[pos];
        const bool sepOk = (parsed < 3) ? (sep == ':') : (sep == ',' || sep == '.');
        if (!sepOk)
            break;
        ++pos;
    }

    if (parsed < 3)
        return 0UL; // malformed timestamp: defined fallback instead of falling off the end

    return parts[0] * 3600000UL + parts[1] * 60000UL + parts[2] * 1000UL + parts[3];
}

/// Advances every input stream past its first seven lines (or to end of
/// file, whichever comes first) so that subsequent reads start at line 8.
///
/// @param vifs  Pairs of (open input stream, label); only the streams are touched.
void readAllFilesUntilLine7(std::vector<std::pair<std::ifstream, std::string>> & vifs)
{
    std::string discarded;
    for (auto & entry : vifs)
    {
        // Stop early if the stream runs out before seven lines were consumed.
        for (int skipped = 0; skipped != 7 && std::getline(entry.first, discarded); ++skipped)
        {
        }
    }
}

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logTime, std::vector<std::string> & lines, int index, int & counter)
{
    std::string line;
    boost::smatch what;
    if (std::getline(vifs[index].first, line))
    {
        if (boost::regex_search(line, what, expression))
        {
            logTime[index] = time2Milleseconds(what[1]);
        }
        lines[index] = line;
    }
    else
    {
        --counter;
        logTime[index] = ULONG_MAX;
    }
}

/// Merges the buffered lines of all input files into `file`, repeatedly
/// emitting the line with the smallest time and refilling that slot.
///
/// Each output line has the form "<line> --> <label>". The loop runs while
/// `counter` is positive, so it also terminates (without writing anything)
/// when every input is already exhausted after priming, and it does nothing
/// for an empty input set.
///
/// @param vifs     Input streams paired with their labels.
/// @param logTime  Per-file time of the buffered line; same length as vifs.
/// @param lines    Per-file buffered line; same length as vifs.
/// @param file     Destination stream.
/// @param counter  Number of inputs still readable; maintained by checkRegex.
void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logTime, std::vector<std::string> & lines, std::ofstream & file, int & counter)
{
    if (vifs.empty())
        return; // nothing to merge; avoids indexing into empty vectors

    // Prime one buffered line per input.
    for (std::size_t i = 0; i < vifs.size(); ++i)
    {
        checkRegex(vifs, logTime, lines, static_cast<int>(i), counter);
    }

    // `> 0` rather than `!= 0`: the loop must stop even if the counter was
    // already zero after priming or is decremented past zero.
    while (counter > 0)
    {
        const int index = static_cast<int>(std::min_element(logTime.begin(), logTime.end()) - logTime.begin());
        file << lines[index] << " --> " << vifs[index].second << "\n";
        checkRegex(vifs, logTime, lines, index, counter);
    }
}

int main()
{
    clock_t begin = clock();
    int cnt = std::count_if(fs::directory_iterator(path),fs::directory_iterator(),static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));
    std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);
    int index = 0;
    boost::smatch what;
    std::string file;
    for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)
    {
        if (fs::is_regular_file(d->path()))
        {
            file = d->path().filename().string();
            if (boost::regex_search(file, what, nameFileEx))
            {
                vifs[index++] = std::make_pair(std::ifstream(d->path().string()), what[1]);
            }
        }
    }
    std::vector<unsigned long> logTime(cnt, ULONG_MAX);
    std::vector<std::string> lines(cnt);
    std::ofstream filename(path + "\\TestLog.txt");
    readAllFilesUntilLine7(vifs);
    mergeFiles(vifs, logTime, lines, filename, cnt);
    filename.close();
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    std::cout << "Elapsed time = " << elapsed_secs << "\n";
    return 0;
}
Main.cpp:

#include "stdafx.h"
#include "boost/regex.hpp"
#include "boost/lexical_cast.hpp"
#include "boost\filesystem.hpp"
#include <string>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <sstream>
#include <climits>
#include <ctime>
#include <queue>
#include "Data.h"
namespace fs = boost::filesystem;

// Matches the start of a log line: either a "TIME = YYYY/MM/DD " prefix or a
// leading '@', followed by a run of digits, ':', '.' and ',' that is captured
// as group 1 (the time-of-day text handed to time2Milleseconds).
static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))");
// Matches a file name that starts with digits, '-' or '_' and captures the
// following one or two words (optionally separated by one whitespace) as
// group 1; main() stores that capture as the label of each input file.
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))");
// Directory that main() scans for input logs and where it writes TestLog.txt.
static const std::string path("E:\\2017-02-01");
//static const std::string path("E:\\TestLog");

/// Converts a time-of-day string "HH:MM:SS[,mmm]" into milliseconds since midnight.
///
/// The hour, minute and second fields are mandatory and separated by ':'.
/// The millisecond field is optional and may be introduced by ',' or '.'
/// (both characters are accepted by the timestamp regex); when it is absent
/// it counts as 0.
///
/// @param time  Text holding the time of day, e.g. "11:12:12,180".
/// @return      Milliseconds since midnight, or 0 when fewer than three
///              numeric fields could be parsed. Every path returns a
///              well-defined value.
unsigned long time2Milleseconds(const std::string & time)
{
    // parts[0..3] = hours, minutes, seconds, milliseconds (all default to 0).
    unsigned long parts[4] = { 0UL, 0UL, 0UL, 0UL };
    std::string::size_type pos = 0;
    int parsed = 0;

    while (parsed < 4)
    {
        // Every field must start with a decimal digit.
        if (pos >= time.size() || time[pos] < '0' || time[pos] > '9')
            break;

        unsigned long value = 0UL;
        for (; pos < time.size() && time[pos] >= '0' && time[pos] <= '9'; ++pos)
            value = value * 10UL + static_cast<unsigned long>(time[pos] - '0');
        parts[parsed++] = value;

        if (parsed == 4 || pos >= time.size())
            break;

        // ':' separates h/m/s; ',' or '.' introduces the milliseconds.
        const char sep = time[pos];
        const bool sepOk = (parsed < 3) ? (sep == ':') : (sep == ',' || sep == '.');
        if (!sepOk)
            break;
        ++pos;
    }

    if (parsed < 3)
        return 0UL; // malformed timestamp: defined fallback instead of falling off the end

    return parts[0] * 3600000UL + parts[1] * 60000UL + parts[2] * 1000UL + parts[3];
}

void initializeHeap(std::ifstream & ifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, const int index)
{
    ULONG time;
    std::string line;
    boost::smatch what;
    bool match = false;
    while (!match && std::getline(ifs, line))
    {
        if (boost::regex_search(line, what, expression))
        {
            time = time2Milleseconds(what[1]);
            myHeap.push(Data(index, line, time));
            match = true;
        }
    }
}

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, ULONG time, const int index)
{
    std::string line;
    boost::smatch what;
    if (std::getline(vifs[index].first, line))
    {
        if (boost::regex_search(line, what, expression))
        {
            time = time2Milleseconds(what[1]);
        }
        myHeap.push(Data(index, line, time));
    }
}

void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, std::ofstream & file)
{
    int index = 0;
    ULONG time = 0;
    while (!myHeap.empty())
    {
        index = myHeap.top().getIndex();
        time = myHeap.top().getTime();
        file << myHeap.top().getLine() << " --> " << vifs[index].second << "\n";
        myHeap.pop();
        checkRegex(vifs, myHeap, time, index);
    }
}

int main()
{
    clock_t begin = clock();
    int cnt = std::count_if(fs::directory_iterator(path), fs::directory_iterator(), static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));
    std::priority_queue<Data, std::vector<Data>, Compare> myHeap;
    std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);
    int index = 0;
    boost::smatch what;
    std::string file;
    for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)
    {
        if (fs::is_regular_file(d->path()))
        {
            file = d->path().filename().string();
            if (boost::regex_search(file, what, nameFileEx))
            {
                vifs[index] = std::make_pair(std::ifstream(d->path().string()), what[1]);
                initializeHeap(vifs[index].first, myHeap, index);
                ++index;
            }
        }
    }
    std::ofstream filename(path + "\\TestLog.txt");
    mergeFiles(vifs, myHeap, filename);
    filename.close();
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    std::cout << "Elapsed time = " << elapsed_secs << "\n";
    return 0;
}
#包括“stdafx.h”
#包括“boost/regex.hpp”
#包括“boost/lexical_cast.hpp”
#包括“boost\filesystem.hpp”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括“Data.h”
名称空间fs=boost::filesystem;
静态常量boost::正则表达式(R“(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)(?:@))([0-9:,]+)”;
静态常量boost::regex namefilex(R“(^[\d\-\\\\\+(\w+\s?\w+\124; \ w+));
静态常量标准::字符串路径(“E:\\2017-02-01”);
//静态常量std::字符串路径(“E:\\TestLog”);
无符号长时间2毫秒(常量标准::字符串和时间)
{
INTA、b、c、d;
如果(sscanf_s(time.c_str(),%d:%d:%d,%d“,&a,&b,&c,&d)>=3)
返回a*3600000+b*60000+c*1000+d;
}
void initializeHeap(std::ifstream和ifs,std::priority_queue和myHeap,const int index)
{
乌龙时间;
std::字符串行;
刺激:smatch什么;
布尔匹配=假;
while(!match&&std::getline(ifs,line))
{
if(boost::regex_搜索(行、内容、表达式))
{
时间=time2millesons(什么[1]);
push(数据(索引、行、时间));
匹配=真;
}
}
}
void checkRegex(std::vector&vifs,std::priority_queue&myHeap,ULONG time,const int index)
{
std::字符串行;
刺激:smatch什么;
if(std::getline(vifs[index]。第一行))
{
if(boost::regex_搜索(行、内容、表达式))
{
时间=time2millesons(什么[1]);
}
push(数据(索引、行、时间));
}
}
void合并文件(std::vector和vifs,std::priority_queue和myHeap,std::ofstream和file)
{
int指数=0;
ULONG时间=0;
而(!myHeap.empty())
{
index=myHeap.top().getIndex();
time=myHeap.top().getTime();
文件路径().string()),什么[1];
初始化EAP(vifs[index]。首先,myHeap,index);
++指数;
}
}
}
std::of流文件名(路径+“\\TestLog.txt”);
合并文件(vifs、myHeap、文件名);
filename.close();
clock_t end=clock();
双倍运行秒=双倍(结束-开始)/时钟秒;

std::cout << "Elapsed time = " << elapsed_secs << "\n";

这可以在占用更少内存的情况下更快地完成。首先考虑:

  • 从每个文件中读取一行(因此在任何时间内存中只有
    N
    行)
  • 找到
    N
    行中最小的一行,将其输出
  • 在内存中,将刚刚输出的值替换为当前输出来自的文件的下一行(注意EOF情况)
如果
M
是输出文件的长度(即所有日志的组合长度),那么简单的实现将是
O(N*M)


但是,可以通过使用堆来改进上述情况,从而缩短到
O(M log N)
的时间。也就是说,将
N
内存中的元素放在堆上。从顶部弹出以输出最小的元素。然后,当您读取新行时,只需将该行放回堆中。

这可以更快地完成,而且内存较低。首先考虑:

  • 从每个文件中读取一行(因此在任何时间内存中只有
    N
    行)
  • 找到
    N
    行中最小的一行,将其输出
  • 在内存中,将刚刚输出的值替换为当前输出来自的文件的下一行(注意EOF情况)
如果
M
是输出文件的长度(即所有日志的组合长度),那么简单的实现将是
O(N*M)


但是,可以通过使用堆来改进上述情况,从而缩短到
O(M log N)
的时间。也就是说,将
N
内存中的元素放在堆上。从顶部弹出以输出最小的元素。然后,当您读取新行时,只需将该行放回堆中即可。

值得一提的是,`sort` 命令有一个
-m
/
--merge
选项,用于合并已排序的文件。使用它可能比编写新程序更容易。我认为,作为第一步,您应该尝试找出IO或处理(CPU)是这里的瓶颈。(iotop?
文件进一步说,如果您保留了一个包含更多输入的文件计数器,则不需要使用
std::all_of。当
checkRegex
中的
std::getline
失败时(由于EOF),每次都会减少。此外,我认为存在一个错误/极端情况:当regex在
checkRegex
中失败时,您最终会把上一行的时间用于新的一行:`lines[index] = line`。值得一提的是,`sort` 命令有一个
-m
/
--merge
选项来合并已排序的文件。使用它可能比编写新程序更容易。我认为,作为第一步,您应该尝试找出IO或处理(CPU)是这里的瓶颈。(iotop?
文件进一步说,如果您保留了一个包含更多输入的文件计数器,则不需要使用
std::all_of。当
checkRegex
中的
std::getline
失败时(由于EOF),每次都会减少。此外,我认为存在一个错误/极端情况:当regex在
checkRegex
中失败时,您最终会把上一行的时间用于新的一行:`lines[index] = line`。除了堆之外,这正是代码所做的,不是吗?@DanielJour 速度优化就是堆。内存优化是指在任意时刻内存中只有
N
#include "stdafx.h"
#include "Data.h"


// Builds a Data record by copying the three arguments into the members of
// the same name (index, line, time).
//   i_index - stored in `index`; the merge code passes the input file's index.
//   i_line  - stored in `line`; the merge code passes the log line text.
//   i_time  - stored in `time`; the merge code passes a time in milliseconds.
Data::Data(DWORD i_index,
           const std::string & i_line,
           ULONG i_time)
    : index(i_index)
    , line(i_line)
    , time(i_time)
{
}


// Destructor with an empty body: no explicit cleanup is performed here.
Data::~Data()
{
}
#include "stdafx.h"
#include "boost/regex.hpp"
#include "boost/lexical_cast.hpp"
#include "boost\filesystem.hpp"
#include <string>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <sstream>
#include <climits>
#include <ctime>
#include <queue>
#include "Data.h"
namespace fs = boost::filesystem;

// Matches the start of a log line: either a "TIME = YYYY/MM/DD " prefix or a
// leading '@', followed by a run of digits, ':', '.' and ',' that is captured
// as group 1 (the time-of-day text handed to time2Milleseconds).
static const boost::regex expression(R"(^(?:(?:TIME\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))");
// Matches a file name that starts with digits, '-' or '_' and captures the
// following one or two words (optionally separated by one whitespace) as
// group 1; main() stores that capture as the label of each input file.
static const boost::regex nameFileEx(R"(^[\d\-\_]+(\w+\s?\w+|\w+))");
// Directory that main() scans for input logs and where it writes TestLog.txt.
static const std::string path("E:\\2017-02-01");
//static const std::string path("E:\\TestLog");

/// Converts a time-of-day string "HH:MM:SS[,mmm]" into milliseconds since midnight.
///
/// The hour, minute and second fields are mandatory and separated by ':'.
/// The millisecond field is optional and may be introduced by ',' or '.'
/// (both characters are accepted by the timestamp regex); when it is absent
/// it counts as 0.
///
/// @param time  Text holding the time of day, e.g. "11:12:12,180".
/// @return      Milliseconds since midnight, or 0 when fewer than three
///              numeric fields could be parsed. Every path returns a
///              well-defined value.
unsigned long time2Milleseconds(const std::string & time)
{
    // parts[0..3] = hours, minutes, seconds, milliseconds (all default to 0).
    unsigned long parts[4] = { 0UL, 0UL, 0UL, 0UL };
    std::string::size_type pos = 0;
    int parsed = 0;

    while (parsed < 4)
    {
        // Every field must start with a decimal digit.
        if (pos >= time.size() || time[pos] < '0' || time[pos] > '9')
            break;

        unsigned long value = 0UL;
        for (; pos < time.size() && time[pos] >= '0' && time[pos] <= '9'; ++pos)
            value = value * 10UL + static_cast<unsigned long>(time[pos] - '0');
        parts[parsed++] = value;

        if (parsed == 4 || pos >= time.size())
            break;

        // ':' separates h/m/s; ',' or '.' introduces the milliseconds.
        const char sep = time[pos];
        const bool sepOk = (parsed < 3) ? (sep == ':') : (sep == ',' || sep == '.');
        if (!sepOk)
            break;
        ++pos;
    }

    if (parsed < 3)
        return 0UL; // malformed timestamp: defined fallback instead of falling off the end

    return parts[0] * 3600000UL + parts[1] * 60000UL + parts[2] * 1000UL + parts[3];
}

void initializeHeap(std::ifstream & ifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, const int index)
{
    ULONG time;
    std::string line;
    boost::smatch what;
    bool match = false;
    while (!match && std::getline(ifs, line))
    {
        if (boost::regex_search(line, what, expression))
        {
            time = time2Milleseconds(what[1]);
            myHeap.push(Data(index, line, time));
            match = true;
        }
    }
}

void checkRegex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, ULONG time, const int index)
{
    std::string line;
    boost::smatch what;
    if (std::getline(vifs[index].first, line))
    {
        if (boost::regex_search(line, what, expression))
        {
            time = time2Milleseconds(what[1]);
        }
        myHeap.push(Data(index, line, time));
    }
}

void mergeFiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<Data, std::vector<Data>, Compare> & myHeap, std::ofstream & file)
{
    int index = 0;
    ULONG time = 0;
    while (!myHeap.empty())
    {
        index = myHeap.top().getIndex();
        time = myHeap.top().getTime();
        file << myHeap.top().getLine() << " --> " << vifs[index].second << "\n";
        myHeap.pop();
        checkRegex(vifs, myHeap, time, index);
    }
}

int main()
{
    clock_t begin = clock();
    int cnt = std::count_if(fs::directory_iterator(path), fs::directory_iterator(), static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));
    std::priority_queue<Data, std::vector<Data>, Compare> myHeap;
    std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);
    int index = 0;
    boost::smatch what;
    std::string file;
    for (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)
    {
        if (fs::is_regular_file(d->path()))
        {
            file = d->path().filename().string();
            if (boost::regex_search(file, what, nameFileEx))
            {
                vifs[index] = std::make_pair(std::ifstream(d->path().string()), what[1]);
                initializeHeap(vifs[index].first, myHeap, index);
                ++index;
            }
        }
    }
    std::ofstream filename(path + "\\TestLog.txt");
    mergeFiles(vifs, myHeap, filename);
    filename.close();
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    std::cout << "Elapsed time = " << elapsed_secs << "\n";
    return 0;
}