C++ 尽管令牌不完整,但Boost Spirit表示解析成功
我有一个非常简单的路径结构,正试图用 Boost Spirit.Lex 解析它。(标签:c++, boost, boost-spirit, boost-spirit-lex)我们有以下语法:
token := [a-z]+
path := (token : path) | (token)
我们这里只讨论冒号分隔的小写ASCII字符串
我有三个例子“xyz”,“abc:xyz”,“abc:xyz:”
前两项应视为有效。第三个带有尾随冒号,不应视为有效。不幸的是,我使用的解析器认为这三个都是有效的。语法不应该允许空的标记,但很明显,Spirit 正是这样做的。我遗漏了什么,才能让第三个被拒绝?
另外,如果您阅读下面的代码,在注释中还有另一个版本的解析器,它要求所有路径都以分号结尾。当我激活这些行时,我可以获得适当的行为(即拒绝“abc:xyz:;”),但这并不是我真正想要的
有人有什么想法吗
谢谢
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <iostream>
#include <string>
using namespace boost::spirit;
using boost::phoenix::val;
/// Lexer definition for colon-delimited paths.
///
/// Registers three tokens, in this order: `identifier` (pattern "[a-z]+"),
/// `separator` (pattern ":") and the single character ';'.
template<typename Lexer>
struct PathTokens : boost::spirit::lex::lexer<Lexer>
{
    PathTokens()
        : identifier("[a-z]+")
        , separator(":")
    {
        // Hand the token definitions to the lexer.
        this->self.add(identifier)(separator)(';');
    }

    /// Token definitions whose attribute type is std::string.
    boost::spirit::lex::token_def<std::string> identifier, separator;
};
// Qi grammar over a token stream; `path` is the start rule.
//
// path  := (token separator path) | token
// token := identifier
//
// Nothing in these rules requires the end of the input, so `path` can match
// a leading portion of the token stream and leave the rest unconsumed.
template <typename Iterator>
struct PathGrammar
: boost::spirit::qi::grammar<Iterator>
{
// tok supplies the `identifier` and `separator` token definitions used below.
template <typename TokenDef>
PathGrammar(TokenDef const& tok)
: PathGrammar::base_type(path)
{
using boost::spirit::_val;
// Each alternative carries a semantic action that writes _1 and a newline
// to std::cerr.
path
=
(token >> tok.separator >> path)[std::cerr << _1 << "\n"]
|
// Disabled variant of the second alternative that requires a trailing ';':
//(token >> ';')[std::cerr << _1 << "\n"]
(token)[std::cerr << _1 << "\n"]
;
// Assign the identifier token's attribute to the rule's std::string value.
token
= (tok.identifier) [_val=_1]
;
}
// Start rule; declared without an attribute signature.
boost::spirit::qi::rule<Iterator> path;
// Rule for a single identifier; its attribute is std::string.
boost::spirit::qi::rule<Iterator, std::string()> token;
};
/// Feeds a fixed list of sample paths through tokenize_and_parse and, for
/// each one, prints the returned flag and whether the input iterator ended
/// up equal to the end of the string.
int main()
{
    typedef std::string::iterator BaseIteratorType;
    typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::string> > TokenType;
    typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
    typedef PathTokens<LexerType>::iterator_type TokensIterator;
    typedef std::vector<std::string> Tests;

    // Inputs for the variant of the grammar without a terminating ';'.
    // For the ';'-terminated variant, use "abc;", "abc:xyz;" and "abc:xyz:;"
    // instead.
    static const char* const samples[] = { "abc", "abc:xyz", "abc:xyz:" };
    Tests paths(samples, samples + sizeof(samples) / sizeof(samples[0]));

    for (Tests::size_type i = 0; i < paths.size(); ++i)
    {
        // Work on a mutable copy: the lexer is driven by non-const iterators.
        std::string input = paths[i];
        std::cerr << "*****" << input << "*****\n";

        PathTokens<LexerType> tokens;
        PathGrammar<TokensIterator> grammar(tokens);

        BaseIteratorType cursor = input.begin();
        BaseIteratorType finish = input.end();
        bool matched = boost::spirit::lex::tokenize_and_parse(cursor, finish, tokens, grammar);

        std::cerr << matched << " " << (cursor == finish) << "\n";
    }
}
#包括
#包括
#包括
#包括
#包括
#包括
使用名称空间boost::spirit;
使用boost::phoenix::val;
模板
结构PathTokens:boost::spirit::lex::lexer
{
PathTokens()
{
标识符=“[a-z]+”;
分隔符=“:”;
此->self.add
(标识符)
(分离器)
(';')
;
}
boost::spirit::lex::令牌定义标识符,分隔符;
};
模板
结构路径语法
:boost::spirit::qi::语法
{
模板
路径语法(TokenDef const&tok)
:路径语法::基本类型(路径)
{
使用boost::spirit::\u val;
路径
=
(令牌>>tok.separator>>路径)[std::cerr';')[std::cerr
问题在于调用 tokenize_and_parse 之后 first 和 last 的含义。first==last 只检查字符串是否已被完全标记化(tokenized),您无法据此推断任何与语法有关的结论。如果像下面这样把解析单独分离出来,您将获得预期的结果:
// Build the lexer and the grammar that is constructed from its token definitions.
PathTokens<LexerType> tokens;
PathGrammar<TokensIterator> grammar(tokens);
BaseIteratorType first = str.begin();
BaseIteratorType last = str.end();
// Obtain token iterators over [first, last) so that parsing is a separate step.
LexerType::iterator_type lexfirst = tokens.begin(first,last);
LexerType::iterator_type lexlast = tokens.end();
bool r = parse(lexfirst, lexlast, grammar);
// Print the parse result and whether the token iterators compare equal afterwards.
std::cerr << r << " " << (lexfirst==lexlast) << "\n";
PathTokens令牌;
路径语法(标记);
BaseIteratorType first=str.begin();
BaseIteratorType last=str.end();
LexerType::迭代器类型lexfirst=tokens.begin(first,last);
LexerType::iterator_type lexlast=tokens.end();
boolr=parse(lexfirst、lexlast、grammar);
除了 llonesmiz 已经说过的内容之外,这里还有一个我有时会用到的技巧,即使用 qi::eoi:
// Whichever alternative matches, it must be followed by eoi for `path` to match.
path = (
(token >> tok.separator >> path) [std::cerr << _1 << "\n"]
| token [std::cerr << _1 << "\n"]
) >> eoi;
这就是我最终得出的结论。它使用了@sehe和@llonesmiz的建议。请注意转换为std::wstring以及语法定义中使用的动作,这在最初的文章中没有出现
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/bind.hpp>
#include <iostream>
#include <string>
//
// This example uses boost spirit to parse a simple
// colon-delimited grammar.
//
// The grammar we want to recognize is:
// identifier := [a-z]+
// separator = :
// path= (identifier separator path) | identifier
//
// From the boost spirit perspective this example shows
// a few things I found hard to come by when building my
// first parser.
// 1. How to flag an incomplete token at the end of input
// as an error. (use of boost::spirit::eoi)
// 2. How to bind an action on an instance of an object
// that is taken as input to the parser.
// 3. Use of std::wstring.
// 4. Use of the lexer iterator.
//
// This using directive will cause issues with boost::bind
// when referencing placeholders such as _1.
// using namespace boost::spirit;
//! Lexer for the path grammar. Registers the `identifier` ([a-z]+) and
//! `separator` (:) tokens, in that order.
template<typename Lexer>
struct Tokens : boost::spirit::lex::lexer<Lexer>
{
    Tokens()
        : identifier(L"[a-z]+")
        , separator(L":")
    {
        // Hand both token definitions to the lexer.
        this->self.add(identifier)(separator);
    }

    //! Wide-character token definitions whose attribute type is std::wstring.
    boost::spirit::lex::token_def<std::wstring, wchar_t> identifier, separator;
};
//! Callback object: echo() writes element 0 of the given fusion vector,
//! followed by a newline, to std::wcerr.
struct Echo
{
    void echo(boost::fusion::vector<std::wstring> const& t) const
    {
        std::wstring const& text = boost::fusion::at_c<0>(t);
        std::wcerr << text << "\n";
    }
};
//! Qi grammar for colon-delimited paths over a token stream.
//!
//!   path  := ((token separator path) | token) eoi
//!   token := identifier
//!
//! Both alternatives of `path` invoke Echo::echo on the supplied Echo object
//! as their semantic action.
template <typename Iterator>
struct Grammar : boost::spirit::qi::grammar<Iterator>
{
    //! @param tok  Provides the `identifier` and `separator` token definitions.
    //! @param e    Callback object. Both semantic actions store its address
    //!             rather than a copy, so `e` must outlive this grammar.
    template <typename TokenDef>
    Grammar(TokenDef const& tok, Echo const& e)
    : Grammar::base_type(path)
    {
        using boost::spirit::_val;

        // Both actions bind to the same Echo instance (&e). Previously the
        // first alternative bound a copy of `e` while the second bound its
        // address.
        path
            =
            ((token >> tok.separator >> path)[boost::bind(&Echo::echo, &e, ::_1)]
            |
            (token)[boost::bind(&Echo::echo, &e, ::_1)]
            ) >> boost::spirit::eoi; // Look for end of input.

        // Assign the identifier token's attribute to the rule's std::wstring value.
        token
            = (tok.identifier) [_val=boost::spirit::qi::_1]
            ;
    }

    //! Start rule; declared without an attribute signature.
    boost::spirit::qi::rule<Iterator> path;
    //! Rule for a single identifier; its attribute is std::wstring.
    boost::spirit::qi::rule<Iterator, std::wstring()> token;
};
int main()
{
// A set of typedefs to make things a little clearer. This stuff is
// well described in the boost spirit documentation/examples.
typedef std::wstring::iterator BaseIteratorType;
typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
typedef Tokens<LexerType>::iterator_type TokensIterator;
typedef LexerType::iterator_type LexerIterator;
// Define some paths to parse.
typedef std::vector<std::wstring> Tests;
Tests paths;
paths.push_back(L"abc");
paths.push_back(L"abc:xyz");
paths.push_back(L"abc:xyz:");
paths.push_back(L":");
// Parse 'em.
for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
{
std::wstring str = *iter;
std::wcerr << L"*****" << str << L"*****\n";
Echo e;
Tokens<LexerType> tokens;
Grammar<TokensIterator> grammar(tokens, e);
BaseIteratorType first = str.begin();
BaseIteratorType last = str.end();
// Have the lexer consume our string.
LexerIterator lexFirst = tokens.begin(first, last);
LexerIterator lexLast = tokens.end();
// Have the parser consume the output of the lexer.
bool r = boost::spirit::qi::parse(lexFirst, lexLast, grammar);
// Print the status and whether or note all output of the lexer
// was processed.
std::wcerr << r << L" " << (lexFirst==lexLast) << L"\n";
}
}
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//
//本例使用boost spirit解析一个简单的
//冒号分隔语法。
//
//我们想要识别的语法是:
//标识符:=[a-z]+
//分隔符=:
//路径=(标识符分隔符路径)|标识符
//
//从boost spirit的角度来看,此示例显示
//有几件事我在建立我的公司时发现很难做到
//第一个解析器。
//1.如何在输入结束时标记不完整的令牌
//作为错误。(使用boost::spirit::eoi)
//2.如何在对象实例上绑定操作
//作为解析器的输入。
//3.std::wstring的使用。
//4.使用lexer迭代器。
//
//此using指令将导致boost::bind出现问题
//引用占位符(如_1)时。
//使用名称空间boost::spirit;
//!一个标记输入的类。
模板
结构令牌:boost::spirit::lex::lexer
{
代币()
{
标识符=L“[a-z]+”;
分隔符=L:“;
此->self.add
(标识符)
(分离器)
;
}
boost::spirit::lex::令牌定义标识符,分隔符;
};
//!此类提供将字符串回显到stderr的回调。
结构回声
{
无效回波(boost::fusion::vector const&t)const
{
使用名称空间boost::fusion;
std::wcerr tok.separator>>路径[boost::bind(&Echo::Echo,e,:1)]
|
(令牌)[boost::bind(&Echo::Echo,&e,:\u 1]
)>>boost::spirit::eoi;//查找输入的结尾。
代币
=(tok.identifier)[u val=boost::spirit::qi::\u 1]
;
}
提升::精神::气::规则路径;
提升::精神::气::规则令牌;
};
int main()
{
//一套打字机可以让事情变得更清楚。这个东西是
//在boost spirit文档/示例中有详细描述。
typedef std::wstring::迭代器BaseIteratorType;
typedef boost::spirit::lex::lexertl::令牌类型;
typedef boost::spirit::lex::lexertl::lexer LexerType;
typedef令牌::迭代器\类型令牌站点生成器;
typedef LexerType::迭代器\ u型lexerator;
//定义一些要解析的路径。
typedef标准::向量测试;
测试路径;
路径。推回(L“abc”);
路径。推回(L“abc:xyz”);
路径。推回(L“abc:xyz:”);
路径。推回(L):;
//解析它们。
for(Tests::iterator iter=path.begin();iter!=path.end();++iter)
{
std::wstring str=*iter;
std::wcerr
我运行了您的代码,lexer 的迭代器确实不相等。因此,至少这个问题是可以检测到的。但是,有什么理由让“r”不为 false 吗?如果我只给解析器“:”,它返回 false,这是应该的。
在文档中,您可以看到解析函数“如果所涉及的解析器组件均未失败,则返回 true,否则返回 false”。我的理解是,只要语法能够匹配您的“起始规则”(在您的示例中为 path),它就返回 true,而与字符串被解析了多少无关。这就是为什么您需要检查 first==last,以确保整个文本都已被解析。
这是有道理的。自从写了最初的帖子以来,我一直在尝试使用运算符“>”而不是运算符“>>”。
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/bind.hpp>
#include <iostream>
#include <string>
//
// This example uses boost spirit to parse a simple
// colon-delimited grammar.
//
// The grammar we want to recognize is:
// identifier := [a-z]+
// separator = :
// path= (identifier separator path) | identifier
//
// From the boost spirit perspective this example shows
// a few things I found hard to come by when building my
// first parser.
// 1. How to flag an incomplete token at the end of input
// as an error. (use of boost::spirit::eoi)
// 2. How to bind an action on an instance of an object
// that is taken as input to the parser.
// 3. Use of std::wstring.
// 4. Use of the lexer iterator.
//
// This using directive will cause issues with boost::bind
// when referencing placeholders such as _1.
// using namespace boost::spirit;
//! Lexer for the path grammar. Registers the `identifier` ([a-z]+) and
//! `separator` (:) tokens, in that order.
template<typename Lexer>
struct Tokens : boost::spirit::lex::lexer<Lexer>
{
    Tokens()
        : identifier(L"[a-z]+")
        , separator(L":")
    {
        // Hand both token definitions to the lexer.
        this->self.add(identifier)(separator);
    }

    //! Wide-character token definitions whose attribute type is std::wstring.
    boost::spirit::lex::token_def<std::wstring, wchar_t> identifier, separator;
};
//! Callback object: echo() writes element 0 of the given fusion vector,
//! followed by a newline, to std::wcerr.
struct Echo
{
    void echo(boost::fusion::vector<std::wstring> const& t) const
    {
        std::wstring const& text = boost::fusion::at_c<0>(t);
        std::wcerr << text << "\n";
    }
};
//! Qi grammar for colon-delimited paths over a token stream.
//!
//!   path  := ((token separator path) | token) eoi
//!   token := identifier
//!
//! Both alternatives of `path` invoke Echo::echo on the supplied Echo object
//! as their semantic action.
template <typename Iterator>
struct Grammar : boost::spirit::qi::grammar<Iterator>
{
    //! @param tok  Provides the `identifier` and `separator` token definitions.
    //! @param e    Callback object. Both semantic actions store its address
    //!             rather than a copy, so `e` must outlive this grammar.
    template <typename TokenDef>
    Grammar(TokenDef const& tok, Echo const& e)
    : Grammar::base_type(path)
    {
        using boost::spirit::_val;

        // Both actions bind to the same Echo instance (&e). Previously the
        // first alternative bound a copy of `e` while the second bound its
        // address.
        path
            =
            ((token >> tok.separator >> path)[boost::bind(&Echo::echo, &e, ::_1)]
            |
            (token)[boost::bind(&Echo::echo, &e, ::_1)]
            ) >> boost::spirit::eoi; // Look for end of input.

        // Assign the identifier token's attribute to the rule's std::wstring value.
        token
            = (tok.identifier) [_val=boost::spirit::qi::_1]
            ;
    }

    //! Start rule; declared without an attribute signature.
    boost::spirit::qi::rule<Iterator> path;
    //! Rule for a single identifier; its attribute is std::wstring.
    boost::spirit::qi::rule<Iterator, std::wstring()> token;
};
int main()
{
// A set of typedefs to make things a little clearer. This stuff is
// well described in the boost spirit documentation/examples.
typedef std::wstring::iterator BaseIteratorType;
typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
typedef Tokens<LexerType>::iterator_type TokensIterator;
typedef LexerType::iterator_type LexerIterator;
// Define some paths to parse.
typedef std::vector<std::wstring> Tests;
Tests paths;
paths.push_back(L"abc");
paths.push_back(L"abc:xyz");
paths.push_back(L"abc:xyz:");
paths.push_back(L":");
// Parse 'em.
for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
{
std::wstring str = *iter;
std::wcerr << L"*****" << str << L"*****\n";
Echo e;
Tokens<LexerType> tokens;
Grammar<TokensIterator> grammar(tokens, e);
BaseIteratorType first = str.begin();
BaseIteratorType last = str.end();
// Have the lexer consume our string.
LexerIterator lexFirst = tokens.begin(first, last);
LexerIterator lexLast = tokens.end();
// Have the parser consume the output of the lexer.
bool r = boost::spirit::qi::parse(lexFirst, lexLast, grammar);
// Print the status and whether or note all output of the lexer
// was processed.
std::wcerr << r << L" " << (lexFirst==lexLast) << L"\n";
}
}