Ruby Flex(lexer)-匹配unicode
有没有一种方法可以让flex按照Ruby Flex(lexer)-匹配unicode,ruby,unicode,cmake,flex-lexer,Ruby,Unicode,Cmake,Flex Lexer,有没有一种方法可以让flex按照 ascSymbol !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|: uniSymbol \p{Symbol}|\p{Other_Symbol}|\p{Punctuation} symbol ascSymbol|uniSymbol{-}[^|_"',;] ascSymbol!|#|$|%|&|⋆|+|.|/||?|@|\|^|-|~|: uniSymbol\p{Symbol}|\p{O
ascSymbol !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|:
uniSymbol \p{Symbol}|\p{Other_Symbol}|\p{Punctuation}
symbol ascSymbol|uniSymbol{-}[^|_"',;]
ascSymbol !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|:
uniSymbol \p{Symbol}|\p{Other_Symbol}|\p{Punctuation}
symbol ascSymbol|uniSymbol{-}[^|_"',;]
我找到了via,但我希望能够以自动化的方式访问某些内容
例如,我正在使用cmake,它被配置为在构建时从*.l和*.y文件生成lexer/parser。理想情况下,我想要一个不需要安装GHC或其他Haskell编译器的解决方案
另外,对于另一个与Bison集成并支持unicode的lexer,我也持开放态度,因为在Flex中获得unicode支持将是一件痛苦的事情,除非Flex源代码本身添加了它。那里似乎有一些unicode的实验材料,但我找不到它
是有洞察力的,并且是为Unicode构建支持的。我从中发现了一个例子,说明如何使Ragel和C++发挥好。似乎是更好的选择,所以就这样。
希望这能为其他人节省解决这个问题所需的时间 编辑 上面提到的“内置支持”可能有些夸张。获得unicode支持比较容易,但这不仅仅是开箱即用的东西。 使用cmake,我从派生的UCD7文件生成一个状态机。 在CMakeLists.txt中,我执行以下操作:#Ruby is required to generate a unicode Ragel machine
# Ruby is needed to run unicode2ragel.rb, which prints the Ragel machine
# definitions to stdout.
FIND_PACKAGE(Ruby REQUIRED)
MESSAGE("Found Ruby ${RUBY_VERSION}")
SET(UNICODE_MACHINE_PATH "${PROJECT_SOURCE_DIR}/src/unicode.rl")
# Regenerate only when the machine file is missing or the caller explicitly
# asks for it by setting gen_unicode.
if(NOT EXISTS "${UNICODE_MACHINE_PATH}" OR gen_unicode)
  MESSAGE("Attempting to generate unicode state machine")
  EXECUTE_PROCESS(COMMAND "${RUBY_EXECUTABLE}" "${PROJECT_SOURCE_DIR}/unicode2ragel.rb"
                  OUTPUT_FILE "${UNICODE_MACHINE_PATH}"
                  RESULT_VARIABLE RAGEL_UNICODE_GEN_RES)
  # Compare the variable by name: RESULT_VARIABLE may hold an error string
  # instead of a number, and expanding it with ${} would corrupt the if().
  if(RAGEL_UNICODE_GEN_RES EQUAL 0)
    MESSAGE("Generated Ragel Unicode state machine")
  else()
    # OUTPUT_FILE is created even when the command fails; remove it so the
    # next configure run retries instead of trusting a broken file.
    file(REMOVE "${UNICODE_MACHINE_PATH}")
    MESSAGE(SEND_ERROR "Unable to generate unicode state machine: ${RAGEL_UNICODE_GEN_RES}")
  endif()
endif()
然后在unicode2ragel.rb中(该脚本随Ragel一同发布,这里针对UCD 7做了少量修改):
!/usr/bin/env ruby
#
#此脚本使用unicode规范生成Ragel状态机
#识别unicode字母数字字符的。它生成5个字符
#角色类:uupper、ulower、ualpha、udigit和ualnum。
#目前支持的编码是UTF-8[默认值]和UCS-4。
#
#用法:unicode2ragel.rb[选项]
#-e,--编码[ucs4 | utf8]数据编码
#-h,--帮助显示此消息
#
#这个脚本最初是作为雪貂搜索的一部分编写的
#引擎库。
#
#作者:Rakan El Khalil
需要“optpass”
需要“打开uri”
编码=[:utf8,:ucs4]
ALPHTYPES={:utf8=>“unsigned char”,:ucs4=>“unsigned int”}
图表\u URL=”http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"#"http://www.unicode.org/Public/7.0.0/ucd/DerivedCoreProperties.txt"
###
#显示变量&默认选项
总宽度=80
范围\宽度=23
@编码=:utf8
###
#选项解析
cli_opts=OptionParser.new do| opts|
选择(“-e”,”--encoding[ucs4 | utf8],“数据编码”)do | o|
@编码=o.downcase.to_sym
结束
选项。在(“-h”,“--help”,“显示此消息”)上执行
放置选项
出口
结束
结束
cli_opts.parse(ARGV)
除非ENCODINGS.member?@encoding
放置“无效编码:#{@encoding}”
放置cli_选项
出口
结束
##
#在url下载文档并生成每个alpha行的十六进制
#范围和说明。
定义每个字母(url、属性)
打开(url)do |文件|
file.u每行do |行|
下一个if行=~/^#/;
下一个if行!~/#{property}#/;
范围,描述=行。拆分(/;/)
靶场,脱衣舞!
description.gsub!(/.*!
如果范围=~/\.\/
开始、停止=range.split'..'
否则开始=停止=范围
结束
屈服开始.hex..停止.hex,说明
结束
结束
结束
###
#以最小宽度格式化为十六进制
def至_十六进制(n)
r=“%0X”%n
r=“0#{r}”除非(r.length%2)。零?
R
结束
###
#UCS4只是unicode码点的直接十六进制转换。
def至_ucs4(范围)
rangestr=“0x”+到_十六进制(range.begin)
rangestr 0zzzzz[7]
#0x80-0x7ff->110yyyyy[5]10zzzzzz[6]
#0x800-0xffff->1110xxxx[4]10yyyyyy[6]10zzzzzz[6]
#0x010000-0x10ffff->11110www[3]10xxxxxx[6]10yyyyyy[6]10zzzzzz[6]
UTF8_边界=[0x7f,0x7ff,0xffff,0x10ffff]
def至_utf8_enc(n)
r=0
如果n 6)
z=0x80 |(n&0x3f)
r=y(12)
y=0x80 |(n>>6)和0x3f
z=0x80 | n&0x3f
r=x>12)和0x3f
y=0x80 |(n>>6)和0x3f
z=0x80 | n&0x3f
r=w 16)和0x3f
y=(n>>8)和0x3f
z=n&0x3f
r=w
“还可以接受与Bison集成的另一款lexer的建议
并支持unicode……”
提供了一个兼容C++的词法分析器生成器,支持Unicode,并可与Bison配合使用。
它接受您的示例(稍作修改,以修复语法):
%option unicode
ascSymbol [!#$%&⋆+./<=>?@\\^\-~:]
uniSymbol [\p{Symbol}\p{Other_Symbol}\p{Punctuation}]{-}[\^|_"',;]
symbol {ascSymbol}|{uniSymbol}
我认为除了把所需的(很长的)UTF-8编码序列列表编译成一个正则表达式之外,没有别的办法。手工操作会很麻烦,但用Python编写脚本来生成也不会太难。不过这样一来,生成扫描程序时结果就与Unicode数据库绑定了,因此每次UCD更改时都需要重新生成扫描程序。嗯,这看起来是一项痛苦的任务。我不太想根据其中的所有内容为flex生成一个巨大的文件——不试一下无法确定,但这听起来效率非常低。我先看看能否用它完成我想要的任务。
#!/usr/bin/env ruby
#
# This script uses the Unicode Character Database to generate Ragel state
# machines that recognize Unicode characters by general category. It emits
# one machine per category (uUppercaseLetter, uLowercaseLetter, ...).
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
# -e, --encoding [ucs4 | utf8] Data encoding
# -h, --help Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <rakan@well.com>
require 'optparse'
require 'open-uri'
# Encodings the generator can emit, and the Ragel alphtype matching each.
ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
# Source data file. Previously used alternative:
# "http://www.unicode.org/Public/7.0.0/ucd/DerivedCoreProperties.txt"
CHART_URL = "http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"
###
# Display vars & default option
TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
###
# Option parsing
cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is optional, so +o+ is nil for a bare "-e"; keep the
    # default in that case instead of calling a method on nil.
    @encoding = o.downcase.to_sym if o
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
cli_opts.parse(ARGV)
unless ENCODINGS.member?(@encoding)
  # stdout carries the generated machine (it is typically redirected into a
  # file), so diagnostics go to stderr and the exit status signals failure.
  warn "Invalid encoding: #{@encoding}"
  warn cli_opts.to_s
  exit 1
end
##
# Reads the document at +url+ and yields, for every data line tagged with
# +property+, the code point range (a Range of Integers) and the trailing
# description text.
#
# Lines starting with "#" are skipped, as are lines that do not contain
# "; <property> #". A line with a single code point yields a one-element
# range.
#
# URI.open is used because Kernel#open stopped handling URLs in Ruby 3.0.
def each_alpha( url, property )
  tagged = /; #{Regexp.escape(property)} #/
  URI.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/
      next if line !~ tagged
      range, description = line.split(';', 2)
      range = range.strip
      # Keep only the text after the last "#" on the line.
      description = description.sub(/.*#/, '').strip
      start, stop = range.include?('..') ? range.split('..') : [range, range]
      yield((start.hex..stop.hex), description)
    end
  end
end
###
# Renders +n+ as upper-case hexadecimal, prefixing a single "0" when
# needed so the result always has an even number of digits.
def to_hex( n )
  digits = format("%0X", n)
  digits.length.odd? ? "0#{digits}" : digits
end
###
# UCS-4 needs no transformation: each code point is written as its own hex
# value. Returns a one-element array holding either "0xAAAA" (single code
# point) or "0xAAAA..0xBBBB" (proper range).
def to_ucs4( range )
  first, last = range.begin, range.end
  return [ "0x#{to_hex(first)}" ] if first == last
  [ "0x#{to_hex(first)}..0x#{to_hex(last)}" ]
end
##
# UTF-8 bit layout per code point interval:
#   0x00     - 0x7f     -> 0zzzzzzz[7]
#   0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
#   0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
#   0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
# Encodes code point +n+ as UTF-8 and returns the encoded bytes as one
# upper-case hex string (via to_hex). Values above 0x10ffff encode as "00".
def to_utf8_enc( n )
  # Continuation byte carrying the six bits of +n+ that start at +shift+.
  tail = ->(shift) { 0x80 | ((n >> shift) & 0x3f) }
  bytes =
    if n <= 0x7f
      [n]
    elsif n <= 0x7ff
      [0xc0 | (n >> 6), tail.call(0)]
    elsif n <= 0xffff
      [0xe0 | (n >> 12), tail.call(6), tail.call(0)]
    elsif n <= 0x10ffff
      [0xf0 | (n >> 18), tail.call(12), tail.call(6), tail.call(0)]
    else
      [0]
    end
  # Pack the bytes big-endian into a single integer before formatting.
  to_hex(bytes.inject { |packed, byte| packed << 8 | byte })
end
##
# Inverse of the UTF-8 encoding step: takes the hex string +n+ of a UTF-8
# byte sequence (e.g. "E282AC") and returns the code point it encodes.
# The sequence length is inferred from the magnitude of the packed value.
# Values larger than 0xf7ffffff decode to 0.
def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # Two 6-bit continuation payloads follow the lead bits, so the lead
    # nibble sits 12 bits up (it was mistakenly shifted by 10).
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end
###
# Splits +range+ at the UTF8_BOUNDARIES so that every returned sub-range
# encodes to UTF-8 sequences of a single length.
# Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# Any part of +range+ above the last boundary is dropped; a range lying
# entirely above it yields [].
def utf8_ranges( range )
  ranges = []
  UTF8_BOUNDARIES.each do |max|
    next if range.begin > max
    return ranges << range if range.end <= max
    # Parentheses are required: `ranges << a .. b` parses as
    # `(ranges << a) .. b` and raises instead of appending a Range.
    ranges << (range.begin..max)
    range = (max + 1)..range.end
  end
  ranges
end
##
# Expands the span between two equal-length hex byte strings (+start+ and
# +stop+, two hex digits per byte) into a flat array of Ragel expressions,
# one "0xNN " or "0xNN..0xMM " term per byte position.
def build_range( start, stop )
  byte_count = start.size / 2
  return [""] if byte_count < 1
  remaining = byte_count - 1
  lead_lo = start[0, 2]
  lead_hi = stop[0, 2]

  # Same leading byte: emit it once and recurse on the remaining bytes.
  if lead_lo == lead_hi
    tails = build_range(start[2..-1], stop[2..-1])
    return tails.map { |tail| "0x#{lead_lo} " + tail }
  end

  # Different leading bytes on the last byte: a plain byte range.
  return ["0x#{lead_lo}..0x#{lead_hi} "] if remaining.zero?

  # Different leading bytes with trailing bytes left. For example
  # 0x123456..0x56789A is covered by three pieces:
  #   0x123456 .. 0x12FFFF
  #   0x130000 .. 0x55FFFF
  #   0x560000 .. 0x56789A
  pieces = build_range(start, lead_lo + "FF" * remaining)

  # The middle piece exists only when the leading bytes are not adjacent.
  if lead_lo.hex + 1 != lead_hi.hex
    upper = lead_hi == "FF" ? "FF" : to_hex(lead_hi.hex - 1)
    pieces << "0x#{to_hex(lead_lo.hex + 1)}..0x#{upper} " + "0x00..0xFF " * remaining
  end

  # With a leading byte of FF the middle piece already reaches the top, so
  # the final piece is skipped.
  pieces.concat(build_range(lead_hi + "00" * remaining, stop)) unless lead_hi == "FF"
  pieces
end
##
# Converts a code point +range+ into a flat array of Ragel byte-sequence
# expressions: the range is split per UTF-8 length class, both ends of each
# piece are encoded, and build_range expands the span between them.
# Always returns an Array (empty when utf8_ranges yields nothing);
# flat_map avoids flatten!, which returns nil when there is nothing to
# flatten.
def to_utf8( range )
  utf8_ranges( range ).flat_map do |r|
    build_range(to_utf8_enc(r.begin), to_utf8_enc(r.end))
  end
end
##
# Counts how many code points one generated expression +code+ matches.
# +code+ is a space-separated list of terms. Every "0xAA..0xBB" term
# multiplies the running count by the size of its span; single-value terms
# leave the count unchanged. In utf8 mode the span ends are decoded with
# from_utf8_enc first; otherwise they are read as plain hex.
def count_codepoints( code )
  code.split(' ').reduce(1) do |product, term|
    bounds = term.match(/0x(.+)\.\.0x(.+)/)
    next product unless bounds
    low, high = bounds[1], bounds[2]
    span =
      if @encoding == :utf8
        from_utf8_enc(high) - from_utf8_enc(low) + 1
      else
        high.hex - low.hex + 1
      end
    product * span
  end
end
##
# Cross-checks three code point counts for one data line: the count
# advertised in +desc+ as "[N]" (1 when absent), the size of the parsed
# +range+, and the total matched by the generated +codes+. Returns true
# only when all three agree.
def is_valid?( range, desc, codes )
  advertised = desc[/\[(\d+)\]/, 1]
  spec_count = advertised ? advertised.to_i : 1
  range_count = range.end - range.begin + 1
  encoded_count = codes.map { |code| count_codepoints(code) }.inject(0, :+)
  encoded_count == spec_count && encoded_count == range_count
end
##
# Prints one Ragel machine definition named +name+ to stdout. Every data
# line of CHART_URL tagged with +property+ contributes one or more
# alternatives, each followed by a "#" comment holding the line's
# description (shown on the first alternative only).
# Raises RuntimeError when the generated codes fail the is_valid? check.
def generate_machine( name, property )
  lead = " "
  puts " #{name} = "
  each_alpha( CHART_URL, property ) do |range, desc|
    codes = @encoding == :ucs4 ? to_ucs4(range) : to_utf8(range)
    unless is_valid?(range, desc, codes)
      raise "Invalid encoding of range #{range}: #{codes.inspect}"
    end

    # The code column is at least RANGE_WIDTH wide; when it has to grow,
    # the description gives up the same number of characters.
    widest = codes.map(&:size).max
    column = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
    room = TOTAL_WIDTH - RANGE_WIDTH - 11
    room -= column - RANGE_WIDTH if column > RANGE_WIDTH
    desc = desc[0..room - 4] + "..." if desc.size > room

    codes.each_with_index do |encoded, idx|
      label = idx.zero? ? desc : ""
      puts " #{lead} #{encoded.ljust(column)} ##{label}"
      # Every alternative after the very first is introduced by "|".
      lead = "|"
    end
  end
  puts " ;"
  puts ""
end
# Emit the Ragel file: header, one machine per general category, footer.
# NOTE(review): the header text still says "It defines ualpha, udigit,
# ualnum." although the machines generated below have different names;
# the emitted text is left unchanged here.
puts <<EOF
# The following Ragel file was autogenerated from: #{CHART_URL}
#
# It defines ualpha, udigit, ualnum.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.
%%{
machine WChar;
EOF
# Machine name and the general-category abbreviation it is built from,
# listed in output order.
machines = [
  [:uUppercaseLetter,      "Lu"],
  [:uLowercaseLetter,      "Ll"],
  [:uTitlecaseLetter,      "Lt"],
  [:uModifierLetter,       "Lm"],
  [:uOtherLetter,          "Lo"],
  [:uNonspacingMark,       "Mn"],
  [:uEnclosingMark,        "Me"],
  [:uSpacingMark,          "Mc"],
  [:uDecimalNumber,        "Nd"],
  [:uLetterNumber,         "Nl"],
  [:uOtherNumber,          "No"],
  [:uSpaceSeparator,       "Zs"],
  [:uLineSeparator,        "Zl"],
  [:uParagraphSeparator,   "Zp"],
  [:uFormat,               "Cf"],
  [:uPrivateUse,           "Co"],
  [:uSurrogate,            "Cs"],
  [:uDashPunctuation,      "Pd"],
  [:uOpenPunctuation,      "Ps"],
  [:uClosePunctuation,     "Pe"],
  [:uConnectorPunctuation, "Pc"],
  [:uOtherPunctuation,     "Po"],
  [:uMathSymbol,           "Sm"],
  [:uCurrencySymbol,       "Sc"],
  [:uModifierSymbol,       "Sk"],
  [:uOtherSymbol,          "So"],
  [:uInitialPunctuation,   "Pi"],
  [:uFinalPunctuation,     "Pf"]
]
machines.each do |machine_name, category|
  generate_machine( machine_name, category )
end
puts <<EOF
}%%
EOF
/* Enable Unicode matching; the \p{...} classes below rely on it. */
%option unicode
/* Bracket list of the individual operator characters. Note that the star
   operator character in the list is not ASCII despite the name. */
ascSymbol [!#$%&⋆+./<=>?@\\^\-~:]
/* Unicode Symbol, Other_Symbol and Punctuation classes, with the characters
   of the second bracket list ( ^ | _ " ' , ; ) removed via {-}. */
uniSymbol [\p{Symbol}\p{Other_Symbol}\p{Punctuation}]{-}[\^|_"',;]
/* A symbol is either of the two classes above. */
symbol {ascSymbol}|{uniSymbol}