C++ Boost Spirit Qi 解析速度慢_C++_Parsing_Csv_Boost Spirit_Boost Spirit Qi

C++ Boost Spirit Qi 解析速度慢

c++ parsing csv

C++ 缓慢地提高灵气,c++,parsing,csv,boost-spirit,boost-spirit-qi,C++,Parsing,Csv,Boost Spirit,Boost Spirit Qi,我尝试用Boost Spirit QI解析TPCH文件。我的实施灵感来自Spirit QI（）的员工示例。数据为csv格式，标记用“|”字符分隔它可以工作，但速度非常慢（对于1GB，20秒）以下是我的行项目文件qi语法： struct lineitem { int l_orderkey; int l_partkey; int l_suppkey; int l_linenumber; std::string l_quantity; std::

我尝试用 Boost Spirit Qi 解析 TPCH 文件。我的实现参考了 Spirit Qi 的 employee 示例。数据为 csv 格式，各字段用“|”字符分隔。

它可以工作，但速度非常慢（对于1GB，20秒）

以下是我为 lineitem 文件编写的 qi 语法：

// One record of the TPCH lineitem file, filled in by the Spirit Qi grammar.
// The first four columns are parsed as integers; every remaining column is
// stored as the raw text found between two '|' separators.
// The member order must stay in sync with the Fusion adaptation of this struct.
struct lineitem {
    int l_orderkey;
    int l_partkey;
    int l_suppkey;
    int l_linenumber;
    std::string l_quantity;
    std::string l_extendedprice;
    std::string l_discount;
    std::string l_tax;
    std::string l_returnflag;
    std::string l_linestatus;
    std::string l_shipdate;
    std::string l_commitdate;
    // NOTE(review): presumably meant "receiptdate"; the spelling is kept
    // because the Fusion adaptation refers to the member by this name.
    std::string l_recepitdate;
    std::string l_shipinstruct;
    std::string l_shipmode;
    std::string l_comment;
};

// Exposes lineitem as a Boost.Fusion sequence so that a Spirit Qi sequence
// parser can assign its sub-attributes to the members one by one.
// The entries are listed in the same order as the members of the struct and
// as the fields of the grammar (four ints followed by twelve strings).
BOOST_FUSION_ADAPT_STRUCT( lineitem,
    (int, l_orderkey)
    (int, l_partkey)
    (int, l_suppkey)
    (int, l_linenumber)
    (std::string, l_quantity)
    (std::string, l_extendedprice)
    (std::string, l_discount)
    (std::string, l_tax)
    (std::string, l_returnflag)
    (std::string, l_linestatus)
    (std::string, l_shipdate)
    (std::string, l_commitdate)
    (std::string, l_recepitdate)
    (std::string, l_shipinstruct)
    (std::string, l_shipmode)
    (std::string, l_comment)) 

// Destination for the parsed records.
// NOTE(review): the vector is heap-allocated and nothing visible here frees
// it; whoever ends up holding `lineitems` has to delete it (or this should
// become a value / smart pointer once all users of the name are known).
vector<lineitem>* lineitems=new vector<lineitem>();

// Parses zero or more records of the form
//     int|int|int|int|text|text|...|text|
// (4 integer columns followed by 12 text columns, each terminated by '|').
//
// Every text column is wrapped in lexeme[]: under phrase_parse the `space`
// skipper otherwise runs before *each* character, which both silently removes
// blanks inside a field (e.g. in l_comment) and costs a skipper call per
// character. Inside lexeme[] the skipper runs only once, in front of the
// field, so leading whitespace is still skipped but interior/trailing
// whitespace is kept. `~char_('|')` is the single-parser equivalent of
// `char_ - '|'`, and the separators are uniformly char literals.
//
// The `space` skipper between fields also consumes the newline that follows
// the trailing '|' of each record.
//
// The Kleene star always succeeds, so the return value carries no error
// information: phrase_parse advances state->dataPointer, and input was left
// unparsed iff it differs from state->dataEndPointer afterwards.
phrase_parse(state->dataPointer,
    state->dataEndPointer,
    (*(int_ >> '|' >>
    int_ >> '|' >>
    int_ >> '|' >>
    int_ >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|' >>
    lexeme[+~char_('|')] >> '|'
    ) ), space, *lineitems
);

struct行项目{
int l_orderkey；
int l_零件键；
int l_suppkey；
国际线号；
std：：字符串l_数量；
std：：字符串l_扩展价格；
std：：字符串l_折扣；
std：：字符串l_税；
std：：字符串l_returnflag；
std：：字符串l_linestatus；
std：：字符串l_shipdate；
std：：字符串l_commitdate；
std：：字符串l_recepitdate；
std：：字符串l_；
std：：字符串l_shipmode；
std：：字符串l_注释；
};
增强融合适应结构（行项，
（int，l_orderkey）
（内部，左/右部件键）
（int，l_辅助键）
（int，l_线号）
（标准：：字符串，l_数量）
（标准：：字符串，l_扩展价格）
（标准：：字符串，l_折扣）
（标准：：字符串，l_税）
（标准：：字符串，l_返回标志）
（标准：：字符串，l_linestatus）
（标准：：字符串，l_发货日期）
（标准：：字符串，l_提交日期）
（标准：：字符串，l_recepitdate）
（标准：：字符串，l_）
（标准：：字符串，l_shipmode）
（标准：：字符串，l_注释））
vector*lineitems=新向量（）；
短语解析（状态->数据指针，
状态->数据端点，
（*（int_>>“|”>>
int_>>“|”>>
int_>>“|”>>
int_>>“|”>>
+（char|-'|'）>>“|”>>
+（char|-'|'）>>“|”>>
+（char|-'|'）>>“|”>>
+（char|-'|'）>>“|”>>
+（char|-'''|'）>>>'|'>>
+（char|-'''|'）>>>'|'>>
+（char|-'''|'）>>>'|'>>
+（char|-'''|'）>>>'|'>>
+（char|-'''|'）>>>'|'>>
+（char|-'''|'）>>>'|'>>
+（char|-'''|'）>>>'|'>>
+（char|-''|'）>>>'|'
)），空格，*行项目
);

问题似乎在于字符解析。它比其他转换慢得多。

有没有更好的方法将可变长度标记解析为字符串？

编译时是否使用了 -O2？

Boost 库中有很多冗余代码，在使用优化标志时会被编译器消除。

另一种可能的解决方案是使用 repeat 解析器指令：

我找到了解决问题的办法。如本文所述，性能瓶颈是 Spirit Qi 的字符串处理。所有其他数据类型似乎都非常快。

我通过自己处理数据、而不是使用 Spirit Qi 自带的属性处理来避免这个问题。

我的解决方案使用一个helper类，它为csv文件的每个字段提供函数。函数将值存储到结构中。字符串存储在char[]s中。给解析器一个换行符，它调用一个将结构添加到结果向量的函数。 Boost解析器调用这个函数，而不是单独将值存储到向量中

以下是我为 TPCH 基准的 region.tbl 文件编写的代码：

// One row of the TPCH region.tbl file, stored without heap allocations.
// The text columns are fixed-size buffers. A value shorter than the buffer is
// NUL-terminated (regionStorage zero-fills the record before writing it); a
// value that fills the buffer completely has no terminator.
struct region{
    int r_regionkey;
    char r_name[25];
    char r_comment[152];
};

// Receives the pieces of a region record from parser semantic actions, one
// value or one character at a time, and appends the finished record to the
// caller-owned vector.
//
// Expected call order per record:
//   storer_regionkey, storer_name*, resetPos, storer_comment*, endOfLine.
class regionStorage{
public:
// `regions` is the caller-owned output vector; it must outlive this object.
// The working record starts zero-filled so that text fields are terminated.
regionStorage(std::vector<region>* regions) :regions(regions), currentregion(), pos(0) {}

// Stores the integer key of the record being built.
void storer_regionkey(int const&i){
    currentregion.r_regionkey = i;
}

// Appends one character to r_name. Characters beyond the capacity of the
// buffer are dropped instead of overwriting the neighbouring members.
void storer_name(char const&i){
    if (pos < static_cast<int>(sizeof(currentregion.r_name))) {
        currentregion.r_name[pos] = i;
        pos++;
    }
}

// Appends one character to r_comment, with the same overflow protection as
// storer_name.
void storer_comment(char const&i){
    if (pos < static_cast<int>(sizeof(currentregion.r_comment))) {
        currentregion.r_comment[pos] = i;
        pos++;
    }
}

// Rewinds the write position; call between two text fields.
void resetPos() {
    pos = 0;
}

// Finishes the current record: copies it into the output vector and clears
// the working record, so a shorter field in the next record does not keep
// the tail of this one.
void endOfLine() {
    pos = 0;
    regions->push_back(currentregion);
    currentregion = region();
}

private:
std::vector<region>* regions;  // not owned
region currentregion;          // record currently being assembled
int pos;                       // next write index within the active text field
};


// Parses records of the form "key|name|comment|" and prints how many were
// read. Values are not collected through Spirit attributes; each parser
// component reports to a regionStorage helper through a semantic action:
//   - the integer key           -> storer_regionkey
//   - each character of name    -> storer_name, the closing '|' -> resetPos
//   - each character of comment -> storer_comment, the closing '|' -> endOfLine
//
// The key is matched by a single int_: the former `+(int_ - '|')` accepted
// several consecutive integers as one key (the last one winning), and the
// difference was redundant because int_ never matches '|'.
// `~char_('|')` replaces `char_ - '|'` as the single-parser equivalent.
//
// lexeme[] disables the `space` skipper inside a record, so blanks in the
// text fields are preserved; the skipper still consumes whitespace (including
// the newline) between records.
//
// NOTE(review): the start iterator is `dataPointer` while the end iterator is
// `state->dataEndPointer`; neither is declared in this function -- confirm
// that both refer to the same buffer.
void parseRegion(){

    vector<region> regions;
    regionStorage regionstorageObject(&regions);
    phrase_parse(dataPointer, /*< start iterator >*/
     state->dataEndPointer, /*< end iterator >*/
     (*(lexeme[
     int_[boost::bind(&regionStorage::storer_regionkey, &regionstorageObject, _1)] >> '|' >>
     +(~char_('|'))[boost::bind(&regionStorage::storer_name, &regionstorageObject, _1)] >> char_('|')[boost::bind(&regionStorage::resetPos, &regionstorageObject)] >>
     +(~char_('|'))[boost::bind(&regionStorage::storer_comment, &regionstorageObject, _1)] >> char_('|')[boost::bind(&regionStorage::endOfLine, &regionstorageObject)]
    ])), space);

   cout << regions.size() << endl;
}

struct区域{
国际区域密钥；
字符r_名称[25]；
char r_评论[152]；
};
类区域存储{
公众：
区域存储（矢量*区域）：区域（区域），位置（0）{}
无效存储区密钥（int const&i）{
currentregion.r\u regionkey=i；
}
无效存储者名称（字符常量和i）{
currentregion.r_name[pos]=i；
pos++；
}
无效仓库注释（char const&i）{
currentregion.r_comment[pos]=i；
pos++；
}
void resetPos（）{
pos=0；
}
void endOfLine（）{
pos=0；
区域->推回（当前区域）；
}
私人：
矢量*区域；
区域当前区域；
int pos；
};
void parseRegion（）{
矢量区域；
区域存储区域存储对象（®ions）；
短语解析（数据指针，/**/
state->dataEndPointer，/**/
（*（词素）[
+（int_[boost:：bind（®ionStorage:：storer_regionkey，®ionstorageObject，_1）]-“|”）>>“|”>>
+（char|[boost:：bind（®ionStorage:：storer_name，®ionstorageObject，_1）]-“|”）>>char|（“|”）[boost:：bind（®ionStorage:：resetPos，®ionstorageObject）]>>
+（char|[boost:：bind（®ionStorage:：storer_comment，®ionstorageObject，_1）]-“|”）>>char|（“|”）[boost:：bind（®ionStorage:：endOfLine，®ionstorageObject]
]))，空间）；
cout << regions.size() << endl;
}

问题主要来自把单个 char 元素逐个追加到 std::string 容器中。按照该语法，对每个 std::string 属性，遇到第一个字符时开始分配内存，遇到分隔符时停止。因此一开始只保留了 sizeof(char)+1 个字节（含结尾的 '\0'）。之后 std::string 的分配器必须按其容量加倍算法不断扩容！这意味着即使是很短的字符串也要非常频繁地重新分配内存：字符串被复制到容量加倍后的新内存块，而之前的内存块则以 1、2、4、6、12、24……个字符的间隔被释放。难怪它很慢：频繁的 malloc 调用会带来严重问题；堆碎片越多，空闲内存块链表就越长，而这些内存块大小不一（且越来越小），导致应用程序在整个生命周期内分配内存时需要更长的扫描时间。简而言之（tl;dr）：数据会变得支离破碎，并广泛分散在内存中。
证明？每当迭代器遇到有效字符时，char_ 解析器就会调用以下代码：
/boost/spirit/home/qi/char/char_parser.hpp
if (first != last && this->derived().test(*first, context))
{
    spirit::traits::assign_to(*first, attr_);
    ++first;
    return true;
}
return false;

if (first != last && this->derived().test(*first, context))
{
    spirit::traits::assign_to(*first, attr_);
    ++first;
    return true;
}
if (traits::is_container<Attribute>::value == true)
    attr_.shrink_to_fit();
return false;

/boost/spirit/home/qi/detail/assign_to.hpp
// T is not a container and not a string
// Overload selected when both tag arguments are mpl::false_. It does not
// assign `val` to `attr`; it forwards to traits::push_back(attr, val), i.e.
// one call per parsed value.
// NOTE(review): according to the surrounding discussion this per-character
// push_back into a std::string attribute is the suspected hot spot -- not
// measured here.
template <typename T_>
static void call(T_ const& val, Attribute& attr, mpl::false_, mpl::false_)
{
    traits::push_back(attr, val);
}

/boost/spirit/home/qi/char/char_parser.hpp
if (first != last && this->derived().test(*first, context))
{
    spirit::traits::assign_to(*first, attr_);
    ++first;
    return true;
}
return false;

if (first != last && this->derived().test(*first, context))
{
    spirit::traits::assign_to(*first, attr_);
    ++first;
    return true;
}
if (traits::is_container<Attribute>::value == true)
    attr_.shrink_to_fit();
return false;

if (first != last && this->derived().test(*first, context))
{
    spirit::traits::assign_to(*first, attr_);
    ++first;
    return true;
}
if (traits::is_container<Attribute>::value == true)
    attr_.shrink_to_fit();
return false;

我还没有测试过它，但我认为它可以将字符串属性上的字符解析器的速度提高10倍以上