2015-03-08 55 views
4

有沒有辦法讓 Flex(詞法分析器)匹配 Unicode?

/* Definitions the question would like to write: a set of ASCII operator
   characters, plus Unicode symbol/punctuation property classes with a few
   characters excluded.
   NOTE(review): presumably stock flex rejects this as written (\p{...}
   classes, bare definition names in the alternation) -- confirm against
   the flex manual. */
ascSymbol  !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|: 
uniSymbol  \p{Symbol}|\p{Other_Symbol}|\p{Punctuation} 
symbol  ascSymbol|uniSymbol{-}[^|_"',;] 

我通過「Flex(lexer) support for unicode」這個問題找到了 http://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html ,但我希望能夠以某種自動化的方式來匹配 Unicode 字符。

例如,我使用 cmake,它被配置爲從 *.l 和 *.y 文件生成詞法分析器/解析器。我最好想要一個不需要安裝 GHC 或其他 Haskell 編譯器的解決方案。

此外,也歡迎推薦其他能與 Bison 集成並且支持 Unicode 的詞法分析器……

+0

我認爲除了根據所需字符的 UTF-8 編碼列表編譯出(很長的)正則表達式之外,沒有別的辦法可以做到這一點。手動操作會很麻煩,但例如用 Python 寫一個腳本來生成並不太難。不過,結果會與生成掃描器時所用的 Unicode 數據庫綁定,因此每次 UCD 更改時都需要重新生成掃描器。 – rici 2015-03-08 18:53:21

+0

嗯,這開始看起來像一個痛苦的任務。我不太想從 http://www.unicode.org/Public/UCD/latest/ucd/ 中的所有東西生成一個大型文件——雖然我還沒有試過,但這聽起來非常低效。我先看看能否用 [Ragel](http://www.colm.net/files/ragel/ragel-guide-6.9.pdf) 實現我想做的事。 – zcourts 2015-03-09 22:22:21

回答

0

事實證明,除非 Flex 源碼本身加入支持,否則要在 Flex 中獲得 Unicode 支持會非常痛苦。那裏似乎有一些實驗性的實現,但我找不到任何包含它的正式發佈版本。

Ragel 的文檔非常有啓發性,而且 Ragel 內置了對 Unicode 的支持。之後我又找到了這篇文章(this article),它給出了一個讓 Ragel 和 C++ 良好配合的例子。這看起來是更好的選擇,所以我決定採用它。

希望這能爲其他人節省解決這個問題所花費的時間。

EDIT

上面所說的「內置支持」也許有些誇張。獲得 Unicode 支持確實變得容易了,但並不是開箱即用。我使用 cmake 從 UCD 7 的派生文件生成狀態機。在 CMakeLists.txt 中我這樣做:

# Ruby is required to generate the Unicode Ragel machine.
find_package(Ruby REQUIRED)
message(STATUS "Found Ruby ${RUBY_VERSION}")

# Generated machine definition. Set gen_unicode to a true value to force
# regeneration even when the file already exists.
set(UNICODE_MACHINE_PATH "${PROJECT_SOURCE_DIR}/src/unicode.rl")
if(NOT EXISTS "${UNICODE_MACHINE_PATH}" OR gen_unicode)
  message(STATUS "Attempting to generate unicode state machine")
  execute_process(
    COMMAND "${RUBY_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/unicode2ragel.rb"
    OUTPUT_FILE "${UNICODE_MACHINE_PATH}"
    RESULT_VARIABLE RAGEL_UNICODE_GEN_RES
  )

  # RESULT_VARIABLE may hold an error string rather than a number, so pass the
  # variable name (not an unquoted expansion) to if().
  if(RAGEL_UNICODE_GEN_RES EQUAL 0)
    message(STATUS "Generated Ragel Unicode state machine")
  else()
    # OUTPUT_FILE is created even when the command fails; remove the partial
    # file so the EXISTS check above retries on the next configure run.
    file(REMOVE "${UNICODE_MACHINE_PATH}")
    message(SEND_ERROR
      "Unable to generate unicode state machine (result: ${RAGEL_UNICODE_GEN_RES})")
  endif()
endif()

然後是 unicode2ragel.rb(在 Ragel 附帶的腳本基礎上略作修改,以適配 UCD 7):

#!/usr/bin/env ruby 
# 
# This script uses the unicode spec to generate a Ragel state machine
# that recognizes unicode characters by General_Category. It generates
# one character class per category (uUppercaseLetter, uLowercaseLetter, ...).
# Currently supported encodings are UTF-8 [default] and UCS-4.
# 
# Usage: unicode2ragel.rb [options] 
# -e, --encoding [ucs4 | utf8]  Data encoding 
# -h, --help      Show this message 
# 
# This script was originally written as part of the Ferret search 
# engine library. 
# 
# Author: Rakan El-Khalil <[email protected]> 

require 'optparse' 
require 'open-uri' 

# Encodings accepted by the --encoding option.
ENCODINGS = [:utf8, :ucs4]

# Ragel alphtype named in the generated header for each encoding.
ALPHTYPES = {
    :utf8 => "unsigned char",
    :ucs4 => "unsigned int"
}

# Unicode 7.0.0 table listing codepoint ranges by General_Category.
CHART_URL = "http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"

###
# Layout widths for the generated listing, and the default encoding

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8

###
# Option parsing

cli_opts = OptionParser.new do |opts|
    # The "[...]" form declares the argument optional, so OptionParser passes
    # nil when -e is given without a value; keep the default in that case.
    opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
        @encoding = o.downcase.to_sym unless o.nil?
    end
    opts.on("-h", "--help", "Show this message") do
        puts opts
        exit
    end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
    # Report on stderr and exit non-zero: stdout is the generated machine, and
    # callers that capture it need a failing status to detect the error.
    warn "Invalid encoding: #{@encoding}"
    warn cli_opts.to_s
    exit 1
end

##
# Reads the document at url and, for every data line tagged with the given
# property, yields the codepoint range and the trailing description.
# The document is fetched once per url; later calls reuse the cached lines.
#
# url      - location of the Unicode data file.
# property - category abbreviation as it appears after the ';' (e.g. "Lu").
#
# Yields a Range of Integer codepoints and the description String
# (the text following the '#' on the line).

def each_alpha(url, property)
    @ucd_lines ||= {}
    @ucd_lines[url] ||= begin
        # Kernel#open stopped opening URLs in Ruby 3.0; URI.open is the
        # open-uri entry point on newer Rubies. Fall back for older ones.
        opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
        opener.call(url) { |file| file.readlines }
    end

    @ucd_lines[url].each do |line|
        next if line =~ /^#/
        next if line !~ /; #{property} #/

        range, description = line.split(/;/)
        range = range.strip
        # Non-bang forms: the bang variants return nil when nothing changes.
        description = description.gsub(/.*#/, '').strip

        if range =~ /\.\./
            start, stop = range.split '..'
        else
            start = stop = range
        end

        yield(start.hex..stop.hex, description)
    end
end

###
# Renders n as upper-case hex, left-padded with one "0" when needed so the
# result always has an even number of digits (whole bytes).

def to_hex(n)
    digits = format("%0X", n)
    digits.length.odd? ? "0#{digits}" : digits
end

###
# UCS-4 needs no transformation: the codepoints are emitted as hex literals.
# Returns a one-element array holding "0xAA" for a single codepoint or
# "0xAA..0xBB" for a span.

def to_ucs4(range)
    first = range.begin
    last = range.end
    literal = "0x" + to_hex(first)
    literal = literal + "..0x" + to_hex(last) unless first == last
    [literal]
end

##
# UTF-8 byte layouts by codepoint magnitude:
# 0x00  - 0x7f  -> 0zzzzzzz[7]
# 0x80  - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

# Largest codepoint encodable in 1, 2, 3 and 4 bytes respectively.
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

# Encodes codepoint n as UTF-8 and returns the bytes as one hex string
# (e.g. two bytes -> four hex digits). Values above 0x10ffff yield "00".
def to_utf8_enc(n)
    bytes =
        if n <= 0x7f
            [n]
        elsif n <= 0x7ff
            [0xc0 | (n >> 6),
             0x80 | (n & 0x3f)]
        elsif n <= 0xffff
            [0xe0 | (n >> 12),
             0x80 | ((n >> 6) & 0x3f),
             0x80 | (n & 0x3f)]
        elsif n <= 0x10ffff
            [0xf0 | (n >> 18),
             0x80 | ((n >> 12) & 0x3f),
             0x80 | ((n >> 6) & 0x3f),
             0x80 | (n & 0x3f)]
        else
            [0]
        end

    # Concatenate the bytes, most significant first, into one integer.
    packed = bytes.inject(0) { |acc, byte| (acc << 8) | byte }
    to_hex(packed)
end

# Decodes a hex string holding the bytes of one UTF-8 sequence (as produced by
# to_utf8_enc) back into its codepoint.
#
# n - String of hex digits, e.g. "E282AC".
#
# Returns the Integer codepoint. A lone byte above 0x7f (lead or continuation)
# falls into the two-byte branch and yields its low six bits; count_codepoints
# relies on that when measuring per-byte ranges.
def from_utf8_enc(n)
    n = n.hex
    r = 0
    if n <= 0x7f
        r = n
    elsif n <= 0xdfff
        y = (n >> 8) & 0x1f
        z = n & 0x3f
        r = y << 6 | z
    elsif n <= 0xefffff
        x = (n >> 16) & 0x0f
        y = (n >> 8) & 0x3f
        z = n & 0x3f
        # The lead byte's 4 payload bits sit above 6 + 6 continuation bits.
        r = x << 12 | y << 6 | z
    elsif n <= 0xf7ffffff
        w = (n >> 24) & 0x07
        x = (n >> 16) & 0x3f
        y = (n >> 8) & 0x3f
        z = n & 0x3f
        r = w << 18 | x << 12 | y << 6 | z
    end
    r
end

###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# The upstream note says the [5.1] unicode standard has no ranges that
# straddle utf8 boundaries; the split is kept for completeness.
#
# range - Range of Integer codepoints.
#
# Returns an Array of Ranges, each lying within a single UTF-8 length class.
# Codepoints above the last boundary are dropped.

def utf8_ranges(range)
    ranges = []
    UTF8_BOUNDARIES.each do |max|
        next if range.begin > max
        return ranges << range if range.end <= max

        # Parentheses are required: `<<` binds tighter than `..`.
        ranges << (range.begin..max)
        range = (max + 1)..range.end
    end
    ranges
end

# Turns a pair of equal-length hex byte strings (start..stop) into a list of
# Ragel byte-sequence patterns that together cover the span. Each pattern is
# a space-terminated sequence of "0xNN" bytes or "0xNN..0xMM" byte ranges.
def build_range(start, stop)
    byte_count = start.size / 2
    return [""] if byte_count < 1

    remaining = byte_count - 1
    lead_lo = start[0, 2]
    lead_hi = stop[0, 2]

    # Same leading byte: emit it literally and recurse on the tails.
    if lead_lo == lead_hi
        tails = build_range(start[2..-1], stop[2..-1])
        return tails.map { |tail| "0x#{lead_lo} " + tail }
    end

    # Different leading bytes and nothing after them: a plain byte range.
    return ["0x#{lead_lo}..0x#{lead_hi} "] if remaining.zero?

    # Different leading bytes with trailing bytes, e.g. 0x123456..0x56789A:
    #   0x123456 .. 0x12FFFF   (finish the first lead byte)
    #   0x130000 .. 0x55FFFF   (whole lead bytes in between)
    #   0x560000 .. 0x56789A   (start of the last lead byte)
    pieces = [build_range(start, lead_lo + "FF" * remaining)]

    # The middle piece exists only when the lead bytes are not adjacent.
    unless lead_lo.hex + 1 == lead_hi.hex
        upper = lead_hi == "FF" ? "FF" : to_hex(lead_hi.hex - 1)
        pieces << "0x#{to_hex(lead_lo.hex + 1)}..0x#{upper} " + "0x00..0xFF " * remaining
    end

    # With a final lead byte of FF the middle piece already runs up to FF,
    # so no separate last piece is generated.
    pieces << build_range(lead_hi + "00" * remaining, stop) unless lead_hi == "FF"
    pieces.flatten!
end

# Converts a codepoint range into the list of Ragel byte patterns matching
# its UTF-8 encodings. Returns an Array of Strings; an empty Array (never nil)
# when the range yields no encodable sub-ranges.
def to_utf8(range)
    patterns = utf8_ranges(range).map do |r|
        build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
    end
    # Non-bang flatten: flatten! returns nil when there is nothing to flatten.
    patterns.flatten
end

##
# Counts how many codepoints one generated pattern covers: the product of
# the sizes of every "0xAA..0xBB" element in it. Plain "0xAA" elements
# contribute a factor of one. In utf8 mode the bounds are decoded with
# from_utf8_enc before subtracting; otherwise they are read as plain hex.

def count_codepoints(code)
    total = 1
    code.split(' ').each do |piece|
        bounds = /0x(.+)\.\.0x(.+)/.match(piece)
        next if bounds.nil?

        low = bounds[1]
        high = bounds[2]
        width =
            if @encoding == :utf8
                from_utf8_enc(high) - from_utf8_enc(low) + 1
            else
                high.hex - low.hex + 1
            end
        total *= width
    end
    total
end

# Three-way consistency check for one chart entry: the count advertised in
# the description ("[N]", or 1 when absent), the size of the parsed range,
# and the number of codepoints covered by the generated patterns must agree.
def is_valid?(range, desc, codes)
    advertised = desc =~ /\[(\d+)\]/ ? $1.to_i : 1
    span = range.end - range.begin + 1

    encoded = 0
    codes.each { |code| encoded += count_codepoints(code) }

    encoded == advertised && encoded == span
end

##
# Writes one Ragel machine definition to stdout: "name =" followed by an
# alternation of byte patterns for every chart range carrying the given
# property, each annotated with its (possibly truncated) description.
# Raises if the patterns for a range fail the is_valid? check.

def generate_machine(name, property)
    separator = " "
    puts " #{name} = "
    each_alpha(CHART_URL, property) do |range, desc|
        codes = case @encoding
                when :ucs4 then to_ucs4(range)
                else to_utf8(range)
                end

        unless is_valid?(range, desc, codes)
            raise "Invalid encoding of range #{range}: #{codes.inspect}"
        end

        # Pattern column is at least RANGE_WIDTH wide; the description gets
        # whatever is left of TOTAL_WIDTH.
        column = [codes.map { |pattern| pattern.size }.max, RANGE_WIDTH].max
        room = TOTAL_WIDTH - column - 11
        desc = desc[0..room - 4] + "..." if desc.size > room

        codes.each_with_index do |pattern, position|
            # Only the first pattern of a range carries the description.
            label = position.zero? ? desc : ""
            puts "  #{separator} #{pattern.ljust(column)} ##{label}"
            separator = "|"
        end
    end
    puts "  ;"
    puts ""
end

# Emit the file header. The text inside the heredoc is written verbatim to
# the generated Ragel file.
puts <<EOF
# The following Ragel file was autogenerated from: #{CHART_URL}
#
# It defines one machine per Unicode General_Category value
# (uUppercaseLetter, uLowercaseLetter, ..., uFinalPunctuation).
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
    machine WChar;
EOF

# Machine name => General_Category abbreviation, in emission order.
[
    [:uUppercaseLetter, "Lu"],
    [:uLowercaseLetter, "Ll"],
    [:uTitlecaseLetter, "Lt"],
    [:uModifierLetter, "Lm"],
    [:uOtherLetter, "Lo"],
    [:uNonspacingMark, "Mn"],
    [:uEnclosingMark, "Me"],
    [:uSpacingMark, "Mc"],
    [:uDecimalNumber, "Nd"],
    [:uLetterNumber, "Nl"],
    [:uOtherNumber, "No"],
    [:uSpaceSeparator, "Zs"],
    [:uLineSeparator, "Zl"],
    [:uParagraphSeparator, "Zp"],
    [:uFormat, "Cf"],
    [:uPrivateUse, "Co"],
    [:uSurrogate, "Cs"],
    [:uDashPunctuation, "Pd"],
    [:uOpenPunctuation, "Ps"],
    [:uClosePunctuation, "Pe"],
    [:uConnectorPunctuation, "Pc"],
    [:uOtherPunctuation, "Po"],
    [:uMathSymbol, "Sm"],
    [:uCurrencySymbol, "Sc"],
    [:uModifierSymbol, "Sk"],
    [:uOtherSymbol, "So"],
    [:uInitialPunctuation, "Pi"],
    [:uFinalPunctuation, "Pf"]
].each do |machine_name, category|
    generate_machine(machine_name, category)
end

# Close the Ragel machine section.
puts <<EOF
}%%
EOF

然後,在你的 Ragel 狀態機文件中可以包含 unicode.rl,從而使用各個 Unicode 類別的定義,例如 uUppercaseLetter 等等……

0

「此外,也歡迎推薦其他能與 Bison 集成並且支持 Unicode 的詞法分析器……」

RE/flex 項目(RE/flex project)提供了一個面向 C++、與 Flex 兼容的詞法分析器生成器,它支持 Unicode,並且可以與 Bison 配合使用。

它可以接受你的示例(稍作修改以修正語法):

/* RE/flex definitions: ASCII operator characters, plus Unicode symbols and
   punctuation minus a few reserved characters. */
%option unicode
ascSymbol  [!#$%&⋆+./<=>?@\\^\-~:]
uniSymbol  [\p{Symbol}\p{Other_Symbol}\p{Punctuation}]{-}[\^|_"',;]
symbol  {ascSymbol}|{uniSymbol}
相關問題