C++ 如何使用Boost.Spirit.Qi增量解析(并处理)一个大文件?
我已经为自定义文本文件格式创建了一个Qi解析器。有成千上万个条目需要处理,每个条目通常有1-10个子条目。我给出了一个经过精简的解析器工作示例C++ 如何使用Boost.Spirit.Qi增量解析(并处理)一个大文件?,c++,parsing,boost,boost-spirit,boost-spirit-qi,C++,Parsing,Boost,Boost Spirit,Boost Spirit Qi,我已经为自定义文本文件格式创建了一个Qi解析器。有成千上万个条目需要处理,每个条目通常有1-10个子条目。我给出了一个经过精简的解析器工作示例 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #包括 #包括 使用std::string; 使用std::vector; 使用std::cout; 使用std::endl; 名称空间模型 { 名称空间qi=boost::spirit::qi; 结构谱 { 字符串注释; 字符串文件; 弦本地; 双前兆MZ; 国际
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
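using std::string;
using std::vector;
using std::cout;
using std::endl;
namespace model
{
namespace qi = boost::spirit::qi;
struct spectrum
{
string comment;
string file;
string nativeId;
double precursorMz;
int precursorCharge;
double precursorIntensity;
};
struct cluster
{
string id;
vector<spectrum> spectra;
};
struct clustering
{
string name;
vector<cluster> clusters;
};
}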
//告诉fusion有关数据结构的信息,使其成为一流的fusion公民。
//必须在全局范围内。
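BOOST_FUSION_ADAPT_STRUCT(
model::spectrum,
(string, comment)
(string, file)
(string, nativeId)
(double, precursorMz)
(int, precursorCharge)
(double, precursorIntensity)
)
BOOST_FUSION_ADAPT_STRUCT(
model::cluster,
(string, id)
(std::vector<model::spectrum>, spectra)
)
BOOST_FUSION_ADAPT_STRUCT(
model::clustering,
(string, name)
(std::vector<model::cluster>, clusters)
)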
名称空间{
结构报告错误
{
模板结构结果{typedef void type;};
//将字符串压缩到周围的新行字符
模板
void运算符()(Iter first_Iter,Iter last_Iter,
Iter错误\u Iter,常数boost::spirit::qi::info&what)常数
{
std::字符串优先(第一个iter,错误iter);
std::字符串last(错误,last);
auto first_pos=first.rfind('\n');
auto last_pos=last.find('\n');
自动错误行=((第一个位置==标准::字符串::npos)?第一个
:std::字符串(第一个,第一个位置+1))
+标准::字符串(最后,0,最后位置);
//自动错误位置=(错误位置-第一个位置)+1;
/*自动错误\u pos=错误
如果(第一个位置!=std::string::npos)
错误_pos-=(第一个_pos+1)*/
标准:cerr
“NativeID:”>引用的字符串>
布尔值>双值>整数>双值;
群集\u开始%=
“=集群=“>eol>
“id=“>+(字符-下线)>下线>
频谱开始%eol;
群集%=
“name=“>+(字符-下线)>下线>
下线>
群集启动%eol;
BOOST_SPIRIT_DEBUG_节点((集群)(集群启动)(引用字符串)(频谱启动))
//关于_错误(集群,报告_错误(_1,_2,_3,_4));
//on_错误(集群_开始,报告_错误(_1,_2,_3,_4));
//on_错误(频谱_开始,报告_错误(_1,_2,_3,_4));
//on_错误(引用的_字符串,报告_错误(_1,_2,_3,_4));
//成功时(群集开始,量化群集(_1,_2,_3,_4))??
}
qi::规则引用的字符串;
qi::规则集群启动;
qi::规则谱\u开始;
qi:规则簇;
};
}
int main()
{
使用名称空间模型;
cluster_parser g;//我们的语法
字符串str;
//std::ifstream输入(“c:/test/Mo_tai.clustering”);
std::istringstream输入(“name=GreedyClustering\u 0.99\n”
“\n”
“=群集=\n”
“id=9c8c5830-5841-4f77-b819-64180509615b\n”
“SPEC\t\35; file=w:\\test\\mou Tai\u iTRAQ\u f4.mgf\id=index=219\35title=mou Tai\u iTRAQ\u f4.1254.1254.2文件:\“mou Tai\u iTRAQ\u f4.raw\”,NativeID:“controllerType=0 controllerNumber=1 scan=1254\”\ttrue\t\t300.1374\t2\t\t0.0\n”
“=群集=\n”
“id=f8f384a1-3d5f-4af1-9581-4d03a5aa3342\n”
“SPEC\t\35; file=w:\\test\\mou Tai\u iTRAQ\u f9.mgf\id=index=560\35title=mou Tai\u iTRAQ\u f9.1666.1666.3文件:\“mou Tai\u iTRAQ\u f9.raw\”,NativeID:“controllerType=0 controllerNumber=1 scan=1666\”\ttrue\t\t300.14413\t3\t\t0.0\n”
“SPEC\t\35; file=w:\\test\\mou Tai\u iTRAQ\u f9.mgf\id=index=520\35title=mou Tai\u iTRAQ\u f9.1621.1621.3文件:\“mou Tai\u iTRAQ\u f9.raw\”,NativeID:“controllerType=0 controllerNumber=1 scan=1621\”\ttrue\t\t300.14197\t3\t\t\t0.0\n”
“=群集=\n”
“id=b84b79e1-44bc-44c0-a9af-5391ca02582d\n”
“SPEC\t\35; file=w:\\test\\mou-Tai\u-iTRAQ\u f2.mgf\35id=index=7171\35title=mou-Tai\u-iTRAQ\u f2.12729.12729.2文件:\“mou-Tai\u-iTRAQ\u f2.raw\”,NativeID:“controllerType=0 controllerNumber=1 scan=12729\”\ttrue\t\t300.15695\t2\t\t0.0”);
input.unsetf(std::ios::skipws);
boost::spirit::istream_iterator begin(input);
boost::spirit::istream_iterator end;
聚类结果;
bool r=短语解析(开始、结束、g、qi::空白、聚类结果);
if(r&&begin==end)
{
你可以使用流式迭代器,
或者对内存映射文件进行操作。
在处理端,可以从语义动作(semantic action)内部把处理任务推送到队列上。
注意:您可能会遇到一个假定的错误,该错误无法正确清除回溯缓冲区;您可能希望检查此错误,并采取以下回答中所述的预防措施:使用
#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/io.hpp>
#include <boost/optional.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
namespace model
{
namespace qi = boost::spirit::qi;
namespace px = boost::phoenix;
// One spectrum record of a cluster.
//
// The numeric members carry default initializers so that a default-constructed
// spectrum never holds indeterminate values. The type stays an aggregate
// (C++14 and later), so brace initialization keeps working.
struct spectrum {
    std::string comment;              // free-text part of the record
    std::string file;                 // value of the File:"..." field
    std::string nativeId;             // value of the NativeID:"..." field
    double precursorMz = 0.0;
    int precursorCharge = 0;
    double precursorIntensity = 0.0;
};
// One cluster: an identifier plus the spectra that belong to it.
struct cluster {
std::string id; // cluster identifier
std::vector<spectrum> spectra; // member spectra, in the order they were added
};
}
// Adapt the model structs as Fusion sequences (members listed in declaration order)
// so Spirit can use them as rule attributes. These macros are invoked at global
// scope, outside namespace model.
BOOST_FUSION_ADAPT_STRUCT(model::spectrum, comment, file, nativeId, precursorMz, precursorCharge, precursorIntensity)
BOOST_FUSION_ADAPT_STRUCT(model::cluster, id, spectra)
namespace model
{
// Qi grammar for the clustering text format.
//
// Instead of building one big result object, the grammar calls a user-supplied
// handler from a semantic action every time a complete cluster has been matched,
// so clusters can be processed while parsing is still in progress.
//
// Iterator: the input iterator type the grammar is instantiated for.
template <typename Iterator>
struct cluster_parser : qi::grammar<Iterator>
{
// handler: callback receiving the clustering name and the cluster just parsed.
// It is copied into the lazy Phoenix function submit_.
cluster_parser(std::function<void(std::string const&, model::cluster const&)> handler)
: cluster_parser::base_type(start),
submit_(handler)
{
using namespace qi;
// Throughout: `>` is Qi's expectation operator (a mismatch after the first
// operand throws qi::expectation_failure) and `a % b` matches a list of `a`
// separated by `b`.
// Double-quoted text; lexeme[] disables skipping so blanks inside the quotes are kept.
quoted_string %= lexeme['"' > +(char_ - '"') > '"'];
// One "SPEC" line.
// NOTE(review): the sequence also exposes a bool_ attribute, while the adapted
// spectrum struct lists six members; confirm how the attributes line up.
spectrum_start %=
lit("SPEC") >
"#" > +(char_ - "File:") >
"File:" > quoted_string > lit(",") >
"NativeID:" > quoted_string >
bool_ > double_ > int_ > double_;
// "=Cluster=" header line, an "id=" line, then one or more SPEC lines separated by eol.
cluster_start %=
"=Cluster=" > eol >
"id=" > +(char_ - eol) > eol >
spectrum_start % eol;
// "name=" line (stored in the rule-local name_), two line ends, then an
// eol-separated list of clusters. The semantic action on cluster_start hands
// each matched cluster, together with the name, to submit_.
clusters %=
"name=" > qi::as_string[ +(char_ - eol) ][ name_ = _1 ] > eol > eol >
cluster_start [ submit_(name_, _1) ] % eol;
// The start rule itself has no skipper; skip(blank)[] installs the blank
// skipper (spaces/tabs, not newlines) for everything below it.
start = skip(blank) [clusters];
// Names the rules and enables their debug output when BOOST_SPIRIT_DEBUG is defined.
BOOST_SPIRIT_DEBUG_NODES((start)(clusters)(cluster_start)(quoted_string)(spectrum_start))
}
private:
// Placeholder for the first rule-local of `clusters` (the std::string in qi::locals).
qi::_a_type name_;
// Lazy wrapper around the handler, usable inside semantic actions.
px::function<std::function<void(std::string const&, model::cluster const&)> > submit_;
qi::rule<Iterator, std::string(), qi::blank_type> quoted_string;
qi::rule<Iterator, cluster(), qi::blank_type> cluster_start;
qi::rule<Iterator, spectrum(), qi::blank_type> spectrum_start;
qi::rule<Iterator, qi::locals<std::string>, qi::blank_type> clusters;
qi::rule<Iterator> start;
};
}
int main()
{
using namespace model;
cluster_parser<boost::spirit::istream_iterator> g([&](auto const&...){std::cout << "handled\n";}); // Our grammar
std::string str;
//std::ifstream input("c:/test/Mo_tai.clustering");
std::istringstream input(R"(name=GreedyClustering_0.99
=Cluster=
id=9c8c5830-5841-4f77-b819-64180509615b
SPEC #file=w:\test\Mo_Tai_iTRAQ_f4.mgf#id=index=219#title=Mo_Tai_iTRAQ_f4.1254.1254.2 File:"Mo_Tai_iTRAQ_f4.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1254" true 300.1374 2 0.0
=Cluster=
id=f8f384a1-3d5f-4af1-9581-4d03a5aa3342
SPEC #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=560#title=Mo_Tai_iTRAQ_f9.1666.1666.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1666" true 300.14413 3 0.0
SPEC #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=520#title=Mo_Tai_iTRAQ_f9.1621.1621.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1621" true 300.14197 3 0.0
=Cluster=
id=b84b79e1-44bc-44c0-a9af-5391ca02582d
SPEC #file=w:\test\Mo_Tai_iTRAQ_f2.mgf#id=index=7171#title=Mo_Tai_iTRAQ_f2.12729.12729.2 File:"Mo_Tai_iTRAQ_f2.raw", NativeID:"controllerType=0 controllerNumber=1 scan=12729" true 300.15695 2 0.0)");
input.unsetf(std::ios::skipws);
boost::spirit::istream_iterator begin(input);
boost::spirit::istream_iterator end;
bool r = phrase_parse(begin, end, g, qi::blank);
if (r && begin == end) {
std::cout << "Parsing succeeded\n";
}
else {
std::cout << "Parsing failed\n";
}
if (begin!=end) {
std::cout << "Unparsed remaining input: '" << std::string(begin, end) << "\n";
}
return (r && begin==end)? 0 : 1;
}
#include <boost/asio.hpp>
#include <boost/thread.hpp>
namespace ba = boost::asio;
// Minimal thread pool built on a Boost.Asio io_service.
//
// Worker threads run the io_service; submit() posts one job per cluster.
// The destructor releases the work guard and joins the workers, so it returns
// only after the io_service has run out of work.
struct Processing {
    // Starts one worker per hardware thread, and always at least one:
    // hardware_concurrency() returns 0 when the count cannot be determined,
    // which would otherwise leave the pool without any worker.
    Processing() {
        unsigned const detected = boost::thread::hardware_concurrency();
        unsigned const workers = detected ? detected : 1u;
        for (unsigned i = 0; i < workers; ++i)
            _threads.create_thread([this] { _svc.run(); });
    }
    ~Processing() {
        _work.reset();       // let run() return once the queue is drained
        _threads.join_all();
    }
    // Queues processing of one cluster.
    // name and cluster are copied into the job because the caller's objects
    // may be gone by the time a worker runs it.
    void submit(std::string const& name, model::cluster const& cluster) {
        _svc.post([this, name, cluster] { do_processing(name, cluster); });
    }
  private:
    // Placeholder workload: logs the cluster and sleeps to simulate work.
    void do_processing(std::string const& name, model::cluster const& cluster) {
        std::cout << "Thread " << boost::this_thread::get_id() << ": " << name << " cluster of " << cluster.spectra.size() << " spectra\n";
        boost::this_thread::sleep_for(boost::chrono::milliseconds(950));
    }
    ba::io_service _svc;
    // Keeps run() from returning while the queue is momentarily empty.
    boost::optional<ba::io_service::work> _work = ba::io_service::work(_svc);
    boost::thread_group _threads;
};
附加内容:线程工作者
下面是一个版本,它在线程池上调度各个集群(cluster),以便进行异步处理。
请注意,submit 方法向 io_service 投递(post)一个 lambda。该 lambda 按值捕获参数,因为参数的生存期必须延续到处理完成为止。
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/io.hpp>
namespace model
{
namespace qi = boost::spirit::qi;
namespace px = boost::phoenix;
// One spectrum record of a cluster.
//
// The numeric members carry default initializers so that a default-constructed
// spectrum never holds indeterminate values. The type stays an aggregate
// (C++14 and later), so brace initialization keeps working.
struct spectrum {
    std::string comment;              // free-text part of the record
    std::string file;                 // value of the File:"..." field
    std::string nativeId;             // value of the NativeID:"..." field
    double precursorMz = 0.0;
    int precursorCharge = 0;
    double precursorIntensity = 0.0;
};
// One cluster: an identifier plus the spectra that belong to it.
struct cluster {
std::string id; // cluster identifier
std::vector<spectrum> spectra; // member spectra, in the order they were added
};
}
// Adapt the model structs as Fusion sequences (members listed in declaration order)
// so Spirit can use them as rule attributes. These macros are invoked at global
// scope, outside namespace model.
BOOST_FUSION_ADAPT_STRUCT(model::spectrum, comment, file, nativeId, precursorMz, precursorCharge, precursorIntensity)
BOOST_FUSION_ADAPT_STRUCT(model::cluster, id, spectra)
namespace model
{
// Qi grammar for the clustering text format.
//
// Instead of building one big result object, the grammar calls a user-supplied
// handler from a semantic action every time a complete cluster has been matched,
// so clusters can be processed while parsing is still in progress.
//
// Iterator: the input iterator type the grammar is instantiated for.
template <typename Iterator>
struct cluster_parser : qi::grammar<Iterator>
{
// handler: callback receiving the clustering name and the cluster just parsed.
// It is copied into the lazy Phoenix function submit_.
cluster_parser(std::function<void(std::string const&, model::cluster const&)> handler)
: cluster_parser::base_type(start),
submit_(handler)
{
using namespace qi;
// Throughout: `>` is Qi's expectation operator (a mismatch after the first
// operand throws qi::expectation_failure) and `a % b` matches a list of `a`
// separated by `b`.
// Double-quoted text; lexeme[] disables skipping so blanks inside the quotes are kept.
quoted_string %= lexeme['"' > +(char_ - '"') > '"'];
// One "SPEC" line.
// NOTE(review): the sequence also exposes a bool_ attribute, while the adapted
// spectrum struct lists six members; confirm how the attributes line up.
spectrum_start %=
lit("SPEC") >
"#" > +(char_ - "File:") >
"File:" > quoted_string > lit(",") >
"NativeID:" > quoted_string >
bool_ > double_ > int_ > double_;
// "=Cluster=" header line, an "id=" line, then one or more SPEC lines separated by eol.
cluster_start %=
"=Cluster=" > eol >
"id=" > +(char_ - eol) > eol >
spectrum_start % eol;
// "name=" line (stored in the rule-local name_), two line ends, then an
// eol-separated list of clusters. The semantic action on cluster_start hands
// each matched cluster, together with the name, to submit_.
clusters %=
"name=" > qi::as_string[ +(char_ - eol) ][ name_ = _1 ] > eol > eol >
cluster_start [ submit_(name_, _1) ] % eol;
// The start rule itself has no skipper; skip(blank)[] installs the blank
// skipper (spaces/tabs, not newlines) for everything below it.
start = skip(blank) [clusters];
// Names the rules and enables their debug output when BOOST_SPIRIT_DEBUG is defined.
BOOST_SPIRIT_DEBUG_NODES((start)(clusters)(cluster_start)(quoted_string)(spectrum_start))
}
private:
// Placeholder for the first rule-local of `clusters` (the std::string in qi::locals).
qi::_a_type name_;
// Lazy wrapper around the handler, usable inside semantic actions.
px::function<std::function<void(std::string const&, model::cluster const&)> > submit_;
qi::rule<Iterator, std::string(), qi::blank_type> quoted_string;
qi::rule<Iterator, cluster(), qi::blank_type> cluster_start;
qi::rule<Iterator, spectrum(), qi::blank_type> spectrum_start;
qi::rule<Iterator, qi::locals<std::string>, qi::blank_type> clusters;
qi::rule<Iterator> start;
};
}
int main()
{
using namespace model;
cluster_parser<boost::spirit::istream_iterator> g([&](auto const&...){std::cout << "handled\n";}); // Our grammar
std::string str;
//std::ifstream input("c:/test/Mo_tai.clustering");
std::istringstream input(R"(name=GreedyClustering_0.99
=Cluster=
id=9c8c5830-5841-4f77-b819-64180509615b
SPEC #file=w:\test\Mo_Tai_iTRAQ_f4.mgf#id=index=219#title=Mo_Tai_iTRAQ_f4.1254.1254.2 File:"Mo_Tai_iTRAQ_f4.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1254" true 300.1374 2 0.0
=Cluster=
id=f8f384a1-3d5f-4af1-9581-4d03a5aa3342
SPEC #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=560#title=Mo_Tai_iTRAQ_f9.1666.1666.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1666" true 300.14413 3 0.0
SPEC #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=520#title=Mo_Tai_iTRAQ_f9.1621.1621.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1621" true 300.14197 3 0.0
=Cluster=
id=b84b79e1-44bc-44c0-a9af-5391ca02582d
SPEC #file=w:\test\Mo_Tai_iTRAQ_f2.mgf#id=index=7171#title=Mo_Tai_iTRAQ_f2.12729.12729.2 File:"Mo_Tai_iTRAQ_f2.raw", NativeID:"controllerType=0 controllerNumber=1 scan=12729" true 300.15695 2 0.0)");
input.unsetf(std::ios::skipws);
boost::spirit::istream_iterator begin(input);
boost::spirit::istream_iterator end;
bool r = phrase_parse(begin, end, g, qi::blank);
if (r && begin == end) {
std::cout << "Parsing succeeded\n";
}
else {
std::cout << "Parsing failed\n";
}
if (begin!=end) {
std::cout << "Unparsed remaining input: '" << std::string(begin, end) << "\n";
}
return (r && begin==end)? 0 : 1;
}
#include <boost/asio.hpp>
#include <boost/thread.hpp>
namespace ba = boost::asio;
// Minimal thread pool built on a Boost.Asio io_service.
//
// Worker threads run the io_service; submit() posts one job per cluster.
// The destructor releases the work guard and joins the workers, so it returns
// only after the io_service has run out of work.
struct Processing {
    // Starts one worker per hardware thread, and always at least one:
    // hardware_concurrency() returns 0 when the count cannot be determined,
    // which would otherwise leave the pool without any worker.
    Processing() {
        unsigned const detected = boost::thread::hardware_concurrency();
        unsigned const workers = detected ? detected : 1u;
        for (unsigned i = 0; i < workers; ++i)
            _threads.create_thread([this] { _svc.run(); });
    }
    ~Processing() {
        _work.reset();       // let run() return once the queue is drained
        _threads.join_all();
    }
    // Queues processing of one cluster.
    // name and cluster are copied into the job because the caller's objects
    // may be gone by the time a worker runs it.
    void submit(std::string const& name, model::cluster const& cluster) {
        _svc.post([this, name, cluster] { do_processing(name, cluster); });
    }
  private:
    // Placeholder workload: logs the cluster and sleeps to simulate work.
    void do_processing(std::string const& name, model::cluster const& cluster) {
        std::cout << "Thread " << boost::this_thread::get_id() << ": " << name << " cluster of " << cluster.spectra.size() << " spectra\n";
        boost::this_thread::sleep_for(boost::chrono::milliseconds(950));
    }
    ba::io_service _svc;
    // Keeps run() from returning while the queue is momentarily empty.
    boost::optional<ba::io_service::work> _work = ba::io_service::work(_svc);
    boost::thread_group _threads;
};
多亏了快速的支持,Coliru 已经恢复,我根据发现的问题编辑了答案,并添加了链接。 —— 非常令人印象深刻的答案!谢谢!
// Thread pool that performs the per-cluster work.
Processing processing;
// Forwards whatever arguments the grammar passes on to Processing::submit.
// `processing` is captured by reference, so it must outlive every use of the handler.
auto handler = [&processing](auto&... args) { processing.submit(args...); };
cluster_parser<boost::spirit::istream_iterator> g(handler); // Our grammar
Thread 7f0144a5b700: GreedyClustering_0.99 cluster of 1 spectra
Thread 7f014425a700: GreedyClustering_0.99 cluster of 2 spectra
Parsing succeeded
Thread 7f0143a59700: GreedyClustering_0.99 cluster of 1 spectra