
I want to tokenize my own extension of SQL syntax. This involves recognizing an escaped double quote inside a double-quoted string. E.g. in MySQL these two string tokens are equivalent: """" (the second double quote acts as an escape character) and '"'. I have tried different things, but I am stuck on how to replace the value of a token. How do I get rid of the escape character in a token with spirit::lex?

#include <boost/spirit/include/lex_lexertl.hpp>
namespace lex = boost::spirit::lex;

template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
    sql_tokens()
    {
        string_quote_double = "\\\""; // '"'

        this->self("INITIAL")
            = string_quote_double [ lex::_state = "STRING_DOUBLE" ] // how to also ignore + ctx.more()?
            | ...
            ;

        this->self("STRING_DOUBLE")
            = lex::token_def<>("[^\\\"]*")  // action: ignore + ctx.more()
            | lex::token_def<>("\\\"\\\"")  // how to set the token value to '"' ?
            | lex::token_def<>("\\\"")      [ lex::_state = "INITIAL" ]
            ;
    }

    lex::token_def<> string_quote_double, ...;
};

So how can I set the value of the token once "" has been found?

Apart from that I have the following question: I can write a functor for a semantic action that calls ctx.more() and ignores the token at the same time (thereby combining the "low level" tokens into a "high level" string token). But how can this be combined elegantly with lex::_state = ".."?
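For that last point, one conceivable shape would be a plain semantic-action functor that does all three things through the context object instead of mixing phoenix placeholders (an untested sketch; the functor name is made up):

    // Untested sketch of a lexer semantic action functor: accumulate the match,
    // suppress the token and switch the lexer state in one place.
    struct begin_string_double
    {
        template <typename It, typename Pass, typename Id, typename Ctx>
        void operator()(It& /*start*/, It& /*end*/, Pass& pass, Id& /*id*/, Ctx& ctx) const
        {
            ctx.more();                           // append the next match to this one
            ctx.set_state_name("STRING_DOUBLE");  // same effect as lex::_state = "..."
            pass = lex::pass_flags::pass_ignore;  // do not emit a token for this piece
        }
    };

Whether more() and pass_ignore interact the way I hope here is exactly what I have not been able to verify.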

Answers


EDITED in response to the comments, see "UPDATE" below.


I'd suggest not trying to solve this in the lexer. Instead, let the lexer yield the raw string:

template <typename Lexer>
struct mylexer_t : lex::lexer<Lexer>
{
    mylexer_t()
    {
        string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

        this->self("INITIAL")
            = string_quote_double
            | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
            ;
    }

    lex::token_def<std::string> string_quote_double;
};

Note that exposing a token attribute like this requires a modified token typedef:

typedef lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> > token_type; 
typedef lex::lexertl::actor_lexer<token_type> lexer_type; 

Then post-process it in the parser:

template <typename Iterator> struct mygrammar_t 
    : public qi::grammar<Iterator, std::vector<std::string>()> 
{ 
    typedef mygrammar_t<Iterator> This; 

    template <typename TokenDef> 
     mygrammar_t(TokenDef const& tok) : mygrammar_t::base_type(start) 
    { 
     using namespace qi; 

     string_quote_double %= tok.string_quote_double [ undoublequote ]; 
     start = *string_quote_double; 

     BOOST_SPIRIT_DEBUG_NODES((start)(string_quote_double)); 
    } 

    private: 
    qi::rule<Iterator, std::vector<std::string>()> start; 
    qi::rule<Iterator, std::string()> string_quote_double; 
}; 

As you can see, undoublequote can be any Phoenix actor that satisfies the criteria for an ordinary Spirit semantic action. A brain-dead example implementation would be:

static bool undoublequote(std::string& val) 
{ 
    auto outidx = 0; 
    for(auto in = val.begin(); in!=val.end(); ++in) { 
     switch(*in) { 
      case '"': 
       if (++in == val.end()) { // eat the escape 
        // end of input reached 
        val.resize(outidx); // resize to effective chars 
        return true; 
       } 
       // fall through 
      default: 
       val[outidx++] = *in; // append the character 
     } 
    } 

    return false; // not ended with double quote as expected 
} 
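To make the effect concrete, here is a quick check of the helper (an illustrative snippet, assuming the function above is in scope):

    #include <cassert>
    #include <string>

    int main()
    {
        // The lexer hands the raw match to the action, surrounding quotes included.
        std::string raw = "\"bla\"\"blo\"";
        bool ok = undoublequote(raw);
        assert(ok);                   // ended on the closing quote as expected
        assert(raw == "bla\"blo");    // quotes stripped, "" collapsed to "
    }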

But I suggest you write a "proper" de-escaper (I'm pretty sure MySQL allows \t, \r, \u001e and maybe even more archaic things as well).
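A rough sketch of what such a de-escaper could look like (illustration only, not a complete or verified MySQL implementation; the set of handled escapes here is an assumption):

    #include <string>

    // Sketch only: handles the doubled-quote escape plus a few common
    // backslash escapes; a real MySQL de-escaper would need the full set.
    static bool unescape_sql_string(std::string& val)
    {
        if (val.size() < 2 || val.front() != '"' || val.back() != '"')
            return false;                          // expect surrounding quotes

        std::string out;
        out.reserve(val.size());

        for (auto in = val.begin() + 1, last = val.end() - 1; in != last; ++in) {
            if (*in == '"' && in + 1 != last && *(in + 1) == '"') {
                out += '"';                        // "" -> "
                ++in;
            } else if (*in == '\\' && in + 1 != last) {
                switch (*++in) {                   // backslash escapes
                    case 'n': out += '\n'; break;
                    case 't': out += '\t'; break;
                    case 'r': out += '\r'; break;
                    default:  out += *in;          // unknown escape: keep the char
                }
            } else {
                out += *in;
            }
        }
        val.swap(out);
        return true;
    }

Since it keeps the same bool(std::string&) shape, it could presumably be swapped in wherever undoublequote is used above.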

I have some more complete samples in older answers here:


UPDATE

In fact, as you indicated, it is fairly easy to integrate the normalization of the attribute value into the lexer itself:

template <typename Lexer>
struct mylexer_t : lex::lexer<Lexer>
{
    struct undoublequote_lex_type {
        template <typename, typename, typename, typename> struct result { typedef void type; };

        template <typename It, typename IdType, typename pass_flag, typename Ctx>
        void operator()(It& f, It& l, pass_flag& pass, IdType& id, Ctx& ctx) const {
            std::string raw(f, l);
            if (undoublequote(raw))
                ctx.set_value(raw);
            else
                pass = lex::pass_flags::pass_fail;
        }
    } undoublequote_lex;

    mylexer_t()
    {
        string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

        const static undoublequote_lex_type undoublequote_lex;
        this->self("INITIAL")
            = string_quote_double [ undoublequote_lex ]
            | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
            ;
    }

    lex::token_def<std::string> string_quote_double;
};

This reuses the same undoublequote function shown above, but wraps it in a deferred callable object (or "polymorphic functor") undoublequote_lex_type that satisfies the criteria for a Lexer Semantic Action.


Here is a fully working proof of concept:

//#include <boost/config/warning_disable.hpp> 
//#define BOOST_SPIRIT_DEBUG_PRINT_SOME 80 
//#define BOOST_SPIRIT_DEBUG // before including Spirit 
#include <boost/spirit/include/lex_lexertl.hpp> 
#include <boost/spirit/include/qi.hpp> 
#include <fstream> 
#ifdef MEMORY_MAPPED 
# include <boost/iostreams/device/mapped_file.hpp> 
#endif 
//#include <boost/spirit/include/lex_generate_static_lexertl.hpp> 

namespace /*anon*/ 
{ 
    namespace phx=boost::phoenix; 
    namespace qi =boost::spirit::qi; 
    namespace lex=boost::spirit::lex; 

    template <typename Lexer> 
     struct mylexer_t : lex::lexer<Lexer> 
    { 
     mylexer_t() 
     { 
      string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\""; 

      this->self("INITIAL") 
       = string_quote_double 
       | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ] 
       ; 
     } 

     lex::token_def<std::string> string_quote_double; 
    }; 

    static bool undoublequote(std::string& val) 
    { 
     auto outidx = 0; 
     for(auto in = val.begin(); in!=val.end(); ++in) { 
      switch(*in) { 
       case '"': 
        if (++in == val.end()) { // eat the escape 
         // end of input reached 
         val.resize(outidx); // resize to effective chars 
         return true; 
        } 
        // fall through 
       default: 
        val[outidx++] = *in; // append the character 
      } 
     } 

     return false; // not ended with double quote as expected 
    } 

    template <typename Iterator> struct mygrammar_t 
     : public qi::grammar<Iterator, std::vector<std::string>()> 
    { 
     typedef mygrammar_t<Iterator> This; 

     template <typename TokenDef> 
      mygrammar_t(TokenDef const& tok) : mygrammar_t::base_type(start) 
     { 
      using namespace qi; 

      string_quote_double %= tok.string_quote_double [ undoublequote ]; 
      start = *string_quote_double; 

      BOOST_SPIRIT_DEBUG_NODES((start)(string_quote_double)); 
     } 

     private: 
     qi::rule<Iterator, std::vector<std::string>()> start; 
     qi::rule<Iterator, std::string()> string_quote_double; 
    }; 
} 

std::vector<std::string> do_test_parse(const std::string& v) 
{ 
    char const *first = &v[0]; 
    char const *last = first+v.size(); 

    typedef lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> > token_type; 
    typedef lex::lexertl::actor_lexer<token_type> lexer_type; 

    typedef mylexer_t<lexer_type>::iterator_type iterator_type; 
    const static mylexer_t<lexer_type> mylexer; 
    const static mygrammar_t<iterator_type> parser(mylexer); 

    auto iter = mylexer.begin(first, last); 
    auto end = mylexer.end(); 

    std::vector<std::string> data; 
    bool r = qi::parse(iter, end, parser, data); 

    r = r && (iter == end); 

    if (!r) 
     std::cerr << "parsing (" << iter->state() << ") failed at: '" << std::string(first, last) << "'\n"; 

    return data; 
} 

int main(int argc, const char *argv[]) 
{ 
    for (auto&& s : do_test_parse("\"bla\"\"blo\"")) 
     std::cout << s << std::endl; 
} 
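For the "bla""blo" input in main, this prints bla"blo: the doubled quote inside the literal comes out as a single ", with the surrounding quotes removed.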

Indeed, I assumed this wouldn't be possible. Still, I was hoping for a simpler solution, i.e. one that does not involve a grammar. Is that really needed? Maybe just to make testing/debugging easier? – coproc 2013-05-13 18:52:37


@coproc Of course you can :/ it's just not what I suggested. I've added a wrapper 'undoublequote_lex' functor that shows you how to do this in pure lex (see **UPDATE**). **[A similarly adapted sample program](http://ideone.com/BGXH9W)** still prints 'bla"blo' as expected – sehe 2013-05-14 06:50:38


I'm still chewing on the token type. What is the 'char' in the 'mpl::vector' of 'AttributeTypes' for? Wouldn't 'std::string' alone suffice? In fact I don't understand why a single token type definition can have several attribute types, and how they can be used – coproc 2013-05-14 07:47:38


I suggest solving this and similar tasks in the lexer, rather than having the lexer return something intermediate and then parsing it with additional code. Double quotes may not be the only complication inside strings; there may be other escapes as well, and it is better to describe the string parsing process clearly in one place and let the lexer do all the work.

Here is a solution to the problem from the question's topic using only the lexer:

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>

using namespace boost::spirit;
namespace px = boost::phoenix;

template <typename Lexer>
struct sql_tokens : public lex::lexer<Lexer>
{
    sql_tokens()
    {
        string = '"';

        this->self +=
            lex::token_def<>('"')
            [
                lex::_state = "STRING",
                lex::_pass = lex::pass_flags::pass_ignore,
                px::ref(curString) = std::string()
            ];

        std::string& (std::string::*append)(std::string::iterator,
                                            std::string::iterator)
            { &std::string::append<std::string::iterator> };

        this->self("STRING") =
            lex::token_def<>("[^\"]*")
            [
                lex::_pass = lex::pass_flags::pass_ignore,
                // bind through px::ref so the member itself is appended to
                px::bind(append, px::ref(curString), lex::_start, lex::_end)
            ] |
            lex::token_def<>("\\\"\\\"")
            [
                lex::_pass = lex::pass_flags::pass_ignore,
                px::ref(curString) += px::val("\"")
            ] |
            string
            [
                lex::_val = px::ref(curString),
                lex::_state = "INITIAL"
            ];

        this->self("WS") = lex::token_def<>("[ \\t\\n]+");
    }

    std::string curString;
    lex::token_def<std::string> string;
};
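The code above only shows the lexer. For completeness, here is one way it might be driven; an untested sketch, not part of the answer, in which the token typedef, the input literal and the callback are illustrative choices:

    #include <boost/spirit/include/lex_lexertl.hpp>
    #include <boost/spirit/include/lex_tokenize_and_parse.hpp>
    #include <boost/mpl/vector.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        typedef lex::lexertl::token<char const*, boost::mpl::vector<std::string> > token_type;
        typedef lex::lexertl::actor_lexer<token_type> lexer_type;

        sql_tokens<lexer_type> tokens;

        std::string const input = "\"bla\"\"blo\"";
        char const* first = input.c_str();
        char const* last  = first + input.size();

        std::vector<std::string> strings;
        bool ok = lex::tokenize(first, last, tokens,
            [&strings](token_type const& t) {
                // Only the closing-quote token survives pass_ignore; its value
                // carries the accumulated string.
                strings.push_back(boost::get<std::string>(t.value()));
                return true;
            });

        if (ok)
            for (auto const& s : strings)
                std::cout << s << "\n";   // expected: bla"blo
    }

Because every partial match in the STRING state is marked pass_ignore, only the closing-quote token reaches the callback, and its value is the string accumulated in curString.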