C++ 尽管令牌不完整，但Boost Spirit表示解析成功_C++_Boost_Boost Spirit_Boost Spirit Lex

C++ 尽管令牌不完整，但Boost Spirit表示解析成功

c++ boost

C++ 尽管令牌不完整，但Boost Spirit表示解析成功,c++,boost,boost-spirit,boost-spirit-lex,C++,Boost,Boost Spirit,Boost Spirit Lex,我有一个非常简单的路径构造，我正试图用boost spirit.lex解析它我们有以下语法： token := [a-z]+ path := (token : path) | (token) 我们这里只讨论冒号分隔的小写ASCII字符串我有三个例子“xyz”，“abc:xyz”，“abc:xyz:” 前两项应视为有效。第三个带有尾随冒号的字符不应视为有效。不幸的是，我使用的解析器认为这三个都是有效的。语法不应该允许一个空的标记，但很明显，spirit正在这样做。我错过了什么让第三个被拒绝

我有一个非常简单的路径构造，我正试图用boost spirit.lex解析它

我们有以下语法：

token := [a-z]+
path := (token : path) | (token)

我们这里只讨论冒号分隔的小写ASCII字符串

我有三个例子“xyz”，“abc:xyz”，“abc:xyz:”

前两项应视为有效。第三个带有尾随冒号的字符串不应视为有效。不幸的是，我使用的解析器认为这三个都是有效的。语法不应该允许空的标记，但很明显，spirit 正在这样做。我遗漏了什么，才能让第三个被拒绝？

另外，如果您阅读下面的代码，注释中还有另一个版本的解析器，它要求所有路径都以分号结尾。当我启用这些行时，可以得到正确的行为（即拒绝 "abc:xyz:;"），但这并不是我真正想要的。

有人有什么想法吗

谢谢

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <string>

using namespace boost::spirit;
using boost::phoenix::val;

/// Lexer definition for the path language.
///
/// Registers three token kinds, in this order: a lower-case identifier
/// ("[a-z]+"), the ':' separator, and a literal ';' character token.
template<typename Lexer>
struct PathTokens : boost::spirit::lex::lexer<Lexer>
{
      // Token definitions exposed to the grammar; each carries a std::string.
      boost::spirit::lex::token_def<std::string> identifier;
      boost::spirit::lex::token_def<std::string> separator;

      PathTokens()
         : identifier("[a-z]+")
         , separator(":")
      {
         // Attach the token definitions (plus the bare ';') to the lexer.
         this->self.add(identifier)(separator)(';');
      }
};


/// Qi grammar for colon-separated paths built on the PathTokens lexer.
///
///    token := identifier
///    path  := (token separator path) | token
///
/// Every matched alternative writes the action's first attribute (_1) to
/// std::cerr, followed by a newline.
template <typename Iterator>
struct PathGrammar 
   : boost::spirit::qi::grammar<Iterator> 
{
      // Start rule (no attribute) and the single-identifier rule.
      boost::spirit::qi::rule<Iterator> path;
      boost::spirit::qi::rule<Iterator, std::string()> token;

      /// @param tok  lexer object providing the `identifier` and `separator`
      ///             token definitions used by the rules.
      template <typename TokenDef>
      PathGrammar(TokenDef const& tok)
         : PathGrammar::base_type(path)
      {
         // The matched identifier text becomes the attribute of `token`.
         token = tok.identifier[boost::spirit::_val = boost::spirit::_1];

         // Recursive alternative first, single token second.
         // A variant that requires a trailing ';' would replace the second
         // alternative with:
         //    (token >> ';')[std::cerr << _1 << "\n"]
         path
            = (token >> tok.separator >> path)[std::cerr << boost::spirit::_1 << "\n"]
            | token[std::cerr << boost::spirit::_1 << "\n"]
            ;
      }
};


int main()
{
   typedef std::string::iterator BaseIteratorType;
   typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::string> > TokenType;
   typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
   typedef PathTokens<LexerType>::iterator_type TokensIterator;
   typedef std::vector<std::string> Tests;

   Tests paths;
   paths.push_back("abc");
   paths.push_back("abc:xyz");
   paths.push_back("abc:xyz:");
   /*
     paths.clear();
     paths.push_back("abc;");
     paths.push_back("abc:xyz;");
     paths.push_back("abc:xyz:;");
   */
   for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
   {
      std::string str = *iter;
      std::cerr << "*****" << str << "*****\n";

      PathTokens<LexerType> tokens;
      PathGrammar<TokensIterator> grammar(tokens);

      BaseIteratorType first = str.begin();
      BaseIteratorType last = str.end();

      bool r = boost::spirit::lex::tokenize_and_parse(first, last, tokens, grammar);

      std::cerr << r << " " << (first==last) << "\n";
   }
}

#包括
#包括
#包括
#包括
#包括
#包括
使用名称空间boost：：spirit；
使用boost:：phoenix:：val；
模板
结构PathTokens:boost:：spirit:：lex:：lexer
{
PathTokens（）
{
标识符=“[a-z]+”；
分隔符=“：”；
此->self.add
（标识符）
（分离器）
(';')
;
}
boost:：spirit:：lex:：令牌定义标识符，分隔符；
};
模板
结构路径语法
：boost：：spirit：：qi：：语法
{
模板
路径语法（TokenDef const&tok）
：路径语法：：基本类型（路径）
{
使用boost：：spirit:：\u val；
路径
= 
（令牌>>tok.separator>>路径）[std:：cerr'；'）[std:：cerr
问题在于调用 tokenize_and_parse 之后 first 和 last 的含义。first==last 检查的是字符串是否已被完全标记化（tokenized），您无法据此推断出任何关于语法的信息。如果像这样把解析步骤单独分离出来，您将获得预期的结果：
  // Build the lexer and a grammar that refers to its token definitions.
  PathTokens<LexerType> tokens;
  PathGrammar<TokensIterator> grammar(tokens);

  BaseIteratorType first = str.begin();
  BaseIteratorType last = str.end();

  // Ask the lexer for token iterators over [first, last) instead of calling
  // tokenize_and_parse, so the parser's position in the token stream is
  // visible to the caller.
  LexerType::iterator_type lexfirst = tokens.begin(first,last);
  LexerType::iterator_type lexlast = tokens.end();


  // Run the grammar directly on the token range.
  bool r = parse(lexfirst, lexlast, grammar);

  // Print the parse status and whether the token iterators compare equal,
  // i.e. whether the parser stopped at the end of the token stream.
  std::cerr << r << " " << (lexfirst==lexlast) << "\n";

PathTokens令牌；
路径语法（标记）；
BaseIteratorType first=str.begin（）；
BaseIteratorType last=str.end（）；
LexerType:：迭代器类型lexfirst=tokens.begin（first，last）；
LexerType:：iterator_type lexlast=tokens.end（）；
boolr=parse（lexfirst、lexlast、grammar）；
除了 llonesmiz 已经说过的内容之外，这里还有一个我有时会用到的技巧，即使用 qi::eoi 来实现：
// Group both alternatives and follow them with eoi (end of input), so the
// rule only matches when nothing is left after the alternatives.
path = (
           (token >> tok.separator >> path) [std::cerr << _1 << "\n"]
         | token                           [std::cerr << _1 << "\n"]
    ) >> eoi;

这就是我最终得出的结果。它采用了 @sehe 和 @llonesmiz 的建议。请注意其中转换为 std::wstring 以及在语法定义中使用动作（action）的部分，这些在最初的帖子中没有出现。
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/bind.hpp>

#include <iostream>
#include <string>

//
// This example uses boost spirit to parse a simple
// colon-delimited grammar.
//
// The grammar we want to recognize is:
//    identifier := [a-z]+
//    separator = :
//    path= (identifier separator path) | identifier
//
// From the boost spirit perspective this example shows
// a few things I found hard to come by when building my
// first parser.
//    1. How to flag an incomplete token at the end of input
//       as an error. (use of boost::spirit::eoi)
//    2. How to bind an action on an instance of an object
//       that is taken as input to the parser.
//    3. Use of std::wstring.
//    4. Use of the lexer iterator.
//

// This using directive will cause issues with boost::bind
// when referencing placeholders such as _1.
// using namespace boost::spirit;

//! Wide-character lexer for the path language: registers a lower-case
//! identifier token and a ':' separator token, in that order.
template<typename Lexer>
struct Tokens : boost::spirit::lex::lexer<Lexer>
{
      // Token definitions used by the grammar; each carries a std::wstring.
      boost::spirit::lex::token_def<std::wstring, wchar_t> identifier;
      boost::spirit::lex::token_def<std::wstring, wchar_t> separator;

      Tokens()
         : identifier(L"[a-z]+")
         , separator(L":")
      {
         // Attach both token definitions to the lexer.
         this->self.add(identifier)(separator);
      }
};

//! Callback holder: echo() writes a parsed string to the wide error stream.
struct Echo
{
      //! Writes element 0 of @p t to std::wcerr, followed by a newline.
      void echo(boost::fusion::vector<std::wstring> const& t) const
      {
         std::wstring const& text = boost::fusion::at_c<0>(t);
         std::wcerr << text << "\n";
      }
};


//! Grammar for colon-separated paths over the token stream.
//!
//!    token := identifier
//!    path  := ((token separator path) | token) end-of-input
//!
//! Each matched alternative calls Echo::echo on the Echo instance passed to
//! the constructor. The semantic actions keep a pointer to that instance, so
//! it must outlive the grammar.
template <typename Iterator>
struct Grammar : boost::spirit::qi::grammar<Iterator> 
{
      //! @param tok  lexer object providing `identifier` and `separator`.
      //! @param e    callback object; bound by address in both actions so that
      //!             they act on the caller's instance rather than on a copy.
      template <typename TokenDef>
      Grammar(TokenDef const& tok, Echo const& e)
         : Grammar::base_type(path)
      {
         using boost::spirit::_val;
         path
            = 
            ((token >> tok.separator >> path)[boost::bind(&Echo::echo, &e, ::_1)]
             |
             (token)[boost::bind(&Echo::echo, &e, ::_1)]
             ) >> boost::spirit::eoi; // Look for end of input.

          // The matched identifier text becomes the attribute of `token`.
          token 
             = (tok.identifier) [_val=boost::spirit::qi::_1]
          ;

      }
      boost::spirit::qi::rule<Iterator> path;
      boost::spirit::qi::rule<Iterator, std::wstring()> token;
};


int main()
{
   // A set of typedefs to make things a little clearer. This stuff is
   // well described in the boost spirit documentation/examples.
   typedef std::wstring::iterator BaseIteratorType;
   typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
   typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
   typedef Tokens<LexerType>::iterator_type TokensIterator;
   typedef LexerType::iterator_type LexerIterator;

   // Define some paths to parse.
   typedef std::vector<std::wstring> Tests;
   Tests paths;
   paths.push_back(L"abc");
   paths.push_back(L"abc:xyz");
   paths.push_back(L"abc:xyz:");
   paths.push_back(L":");

   // Parse 'em.
   for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
   {
      std::wstring str = *iter;
      std::wcerr << L"*****" << str << L"*****\n";

      Echo e;
      Tokens<LexerType> tokens;
      Grammar<TokensIterator> grammar(tokens, e);

      BaseIteratorType first = str.begin();
      BaseIteratorType last = str.end();

      // Have the lexer consume our string.
      LexerIterator lexFirst = tokens.begin(first, last);
      LexerIterator lexLast = tokens.end();

      // Have the parser consume the output of the lexer.
      bool r = boost::spirit::qi::parse(lexFirst, lexLast, grammar);

      // Print the status and whether or note all output of the lexer 
      // was processed.
      std::wcerr << r << L" " << (lexFirst==lexLast) << L"\n";
   }
}

#包括
#包括
#包括
#包括
#包括
#包括
#包括
//
//本例使用boost spirit解析一个简单的
//冒号分隔语法。
//
//我们想要识别的语法是：
//标识符：=[a-z]+
//分隔符=：
//路径=（标识符分隔符路径）|标识符
//
//从boost spirit的角度来看，此示例显示
//有几件事我在建立我的公司时发现很难做到
//第一个解析器。
//1.如何在输入结束时标记不完整的令牌
//作为错误。（使用boost:：spirit:：eoi）
//2.如何在对象实例上绑定操作
//作为解析器的输入。
//3.std:：wstring的使用。
//4.使用lexer迭代器。
//
//此using指令将导致boost:：bind出现问题
//引用占位符（如_1）时。
//使用名称空间boost：：spirit；
//！一个标记输入的类。
模板
结构令牌：boost:：spirit:：lex:：lexer
{
代币（）
{
标识符=L“[a-z]+”；
分隔符=L:“；
此->self.add
（标识符）
（分离器）
;
}
boost:：spirit:：lex:：令牌定义标识符，分隔符；
};
//！此类提供将字符串回显到stderr的回调。
结构回声
{
无效回波（boost:：fusion:：vector const&t）const
{
使用名称空间boost:：fusion；
std:：wcerr tok.separator>>路径[boost:：bind（&Echo:：Echo，e，：1）]
|
（令牌）[boost:：bind（&Echo:：Echo，&e，：\u 1]
)>>boost:：spirit:：eoi；//查找输入的结尾。
代币
=（tok.identifier）[u val=boost:：spirit:：qi:：\u 1]
;
}
提升：：精神：：气：：规则路径；
提升：：精神：：气：：规则令牌；
};
int main（）
{
//一套打字机可以让事情变得更清楚。这个东西是
//在boost spirit文档/示例中有详细描述。
typedef std:：wstring:：迭代器BaseIteratorType；
typedef boost:：spirit:：lex:：lexertl:：令牌类型；
typedef boost:：spirit:：lex:：lexertl:：lexer LexerType；
typedef令牌：：迭代器\类型令牌站点生成器；
typedef LexerType:：迭代器\ u型lexerator；
//定义一些要解析的路径。
typedef标准：：向量测试；
测试路径；
路径。推回（L“abc”）；
路径。推回（L“abc:xyz”）；
路径。推回（L“abc:xyz:”）；
路径。推回（L）：；
//解析它们。
for（Tests:：iterator iter=path.begin（）；iter！=path.end（）；++iter）
{
std:：wstring str=*iter；
std:：wcerr
我插入了您的代码，lexer 的迭代器并不相等。因此，至少这个问题是可以检测到的。但是，有什么理由使得“r”不为 false 吗？如果我只给解析器“:”，它返回 false，这也是它应该返回的结果。
在文档中，您可以看到解析函数“如果所涉及的解析器组件均未失败，则返回 true，否则返回 false”。我的理解是，只要语法能够匹配您的“起始规则”（在您的示例中是 path），它就返回 true，而与字符串被解析了多少无关。这就是为什么您需要检查 first==last，以确保整个文本都已被解析。
这说得通。自从编写原始帖子以来，我一直在尝试使用运算符“>”而不是运算符“>>”
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/bind.hpp>

#include <iostream>
#include <string>

//
// This example uses boost spirit to parse a simple
// colon-delimited grammar.
//
// The grammar we want to recognize is:
//    identifier := [a-z]+
//    separator = :
//    path= (identifier separator path) | identifier
//
// From the boost spirit perspective this example shows
// a few things I found hard to come by when building my
// first parser.
//    1. How to flag an incomplete token at the end of input
//       as an error. (use of boost::spirit::eoi)
//    2. How to bind an action on an instance of an object
//       that is taken as input to the parser.
//    3. Use of std::wstring.
//    4. Use of the lexer iterator.
//

// This using directive will cause issues with boost::bind
// when referencing placeholders such as _1.
// using namespace boost::spirit;

//! Wide-character lexer for the path language: registers a lower-case
//! identifier token and a ':' separator token, in that order.
template<typename Lexer>
struct Tokens : boost::spirit::lex::lexer<Lexer>
{
      // Token definitions used by the grammar; each carries a std::wstring.
      boost::spirit::lex::token_def<std::wstring, wchar_t> identifier;
      boost::spirit::lex::token_def<std::wstring, wchar_t> separator;

      Tokens()
         : identifier(L"[a-z]+")
         , separator(L":")
      {
         // Attach both token definitions to the lexer.
         this->self.add(identifier)(separator);
      }
};

//! Callback holder: echo() writes a parsed string to the wide error stream.
struct Echo
{
      //! Writes element 0 of @p t to std::wcerr, followed by a newline.
      void echo(boost::fusion::vector<std::wstring> const& t) const
      {
         std::wstring const& text = boost::fusion::at_c<0>(t);
         std::wcerr << text << "\n";
      }
};


//! Grammar for colon-separated paths over the token stream.
//!
//!    token := identifier
//!    path  := ((token separator path) | token) end-of-input
//!
//! Each matched alternative calls Echo::echo on the Echo instance passed to
//! the constructor. The semantic actions keep a pointer to that instance, so
//! it must outlive the grammar.
template <typename Iterator>
struct Grammar : boost::spirit::qi::grammar<Iterator> 
{
      //! @param tok  lexer object providing `identifier` and `separator`.
      //! @param e    callback object; bound by address in both actions so that
      //!             they act on the caller's instance rather than on a copy.
      template <typename TokenDef>
      Grammar(TokenDef const& tok, Echo const& e)
         : Grammar::base_type(path)
      {
         using boost::spirit::_val;
         path
            = 
            ((token >> tok.separator >> path)[boost::bind(&Echo::echo, &e, ::_1)]
             |
             (token)[boost::bind(&Echo::echo, &e, ::_1)]
             ) >> boost::spirit::eoi; // Look for end of input.

          // The matched identifier text becomes the attribute of `token`.
          token 
             = (tok.identifier) [_val=boost::spirit::qi::_1]
          ;

      }
      boost::spirit::qi::rule<Iterator> path;
      boost::spirit::qi::rule<Iterator, std::wstring()> token;
};


int main()
{
   // A set of typedefs to make things a little clearer. This stuff is
   // well described in the boost spirit documentation/examples.
   typedef std::wstring::iterator BaseIteratorType;
   typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
   typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
   typedef Tokens<LexerType>::iterator_type TokensIterator;
   typedef LexerType::iterator_type LexerIterator;

   // Define some paths to parse.
   typedef std::vector<std::wstring> Tests;
   Tests paths;
   paths.push_back(L"abc");
   paths.push_back(L"abc:xyz");
   paths.push_back(L"abc:xyz:");
   paths.push_back(L":");

   // Parse 'em.
   for ( Tests::iterator iter = paths.begin(); iter != paths.end(); ++iter )
   {
      std::wstring str = *iter;
      std::wcerr << L"*****" << str << L"*****\n";

      Echo e;
      Tokens<LexerType> tokens;
      Grammar<TokensIterator> grammar(tokens, e);

      BaseIteratorType first = str.begin();
      BaseIteratorType last = str.end();

      // Have the lexer consume our string.
      LexerIterator lexFirst = tokens.begin(first, last);
      LexerIterator lexLast = tokens.end();

      // Have the parser consume the output of the lexer.
      bool r = boost::spirit::qi::parse(lexFirst, lexLast, grammar);

      // Print the status and whether or note all output of the lexer 
      // was processed.
      std::wcerr << r << L" " << (lexFirst==lexLast) << L"\n";
   }
}