Search code examples
rubyunicodecmakeflex-lexer

Flex (lexer) - matching unicode


Is there a way to get flex to match unicode along the lines of

ascSymbol     !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|:
uniSymbol     \p{Symbol}|\p{Other_Symbol}|\p{Punctuation}
symbol        ascSymbol|uniSymbol{-}[^|_"',;]

I found http://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html via Flex(lexer) support for unicode but I'd want to be able to something in an automated way.

For example, I'm using cmake and it is configured to generate the lexer/parser on build from the *.l and *.y files. I'd ideally want a work around that didn't require GHC or another Haskell compiler to be installed.


Solution

  • As it turns out getting unicode support in Flex would be a pain unless the Flex source itself adds it. There seems to be some experimental stuff in there for unicode but never made it into a release that I can find.

    The Ragel doc is insightful, and comes with built in support for Unicode. I've since found this article which gives an example of how to make Ragel and C++ play nice. Seems to be the better option so going with that.

    Hopefully this saves someone else the time it took to figure this out.

    EDIT

    "Built in support" as stated above is perhaps an exaggeration. It has been easier to get unicode support but it's not just out of the box kind of thing. Using cmake I'm generating a state machine from the derived UCD 7 file. In CMakeLists.txt I do:

    #Ruby is required to generate a unicode Ragel machine
    FIND_PACKAGE(Ruby REQUIRED)
    MESSAGE("Found Ruby ${RUBY_VERSION}")
    SET(UNICODE_MACHINE_PATH "${PROJECT_SOURCE_DIR}/src/unicode.rl")
    if(NOT EXISTS ${UNICODE_MACHINE_PATH} OR gen_unicode)
    
    MESSAGE("Attempting to generate unicode state machine")
    EXECUTE_PROCESS(COMMAND ${RUBY_EXECUTABLE}  ${PROJECT_SOURCE_DIR}/unicode2ragel.rb
                    OUTPUT_FILE ${UNICODE_MACHINE_PATH}
                    RESULT_VARIABLE RAGEL_UNICODE_GEN_RES)
    
      if(${RAGEL_UNICODE_GEN_RES} EQUAL 0)
        MESSAGE("Generaged Ragel Unicode state machine")
      else()
        MESSAGE(SEND_ERROR "Unable to generate unicode state machine")
      endif()
    endif()
    

    Then in unicode2ragel.rb (ships with Ragel and modified slightly for UCD 7)

    #!/usr/bin/env ruby
    #
    # This script uses the unicode spec to generate a Ragel state machine
    # that recognizes unicode alphanumeric characters.  It generates 5
    # character classes: uupper, ulower, ualpha, udigit, and ualnum.
    # Currently supported encodings are UTF-8 [default] and UCS-4.
    #
    # Usage: unicode2ragel.rb [options]
    #    -e, --encoding [ucs4 | utf8]     Data encoding
    #    -h, --help                       Show this message
    #
    # This script was originally written as part of the Ferret search
    # engine library.
    #
    # Author: Rakan El-Khalil <[email protected]>
    
    require 'optparse'
    require 'open-uri'
    
    ENCODINGS = [ :utf8, :ucs4 ]
    ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
    CHART_URL = "http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"#"http://www.unicode.org/Public/7.0.0/ucd/DerivedCoreProperties.txt"
    
    ###
    # Display vars & default option
    
    TOTAL_WIDTH = 80
    RANGE_WIDTH = 23
    @encoding = :utf8
    
    ###
    # Option parsing
    
    cli_opts = OptionParser.new do |opts|
      opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
        @encoding = o.downcase.to_sym
      end
      opts.on("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
    
    cli_opts.parse(ARGV)
    unless ENCODINGS.member? @encoding
      puts "Invalid encoding: #{@encoding}"
      puts cli_opts
      exit
    end
    
    ##
    # Downloads the document at url and yields every alpha line's hex
    # range and description.
    
    def each_alpha( url, property )
      open( url ) do |file|
        file.each_line do |line|
          next if line =~ /^#/;
          next if line !~ /; #{property} #/;
    
          range, description = line.split(/;/)
          range.strip!
          description.gsub!(/.*#/, '').strip!
    
          if range =~ /\.\./
               start, stop = range.split '..'
          else start = stop = range
          end
    
          yield start.hex .. stop.hex, description
        end
      end
    end
    
    ###
    # Formats to hex at minimum width
    
    def to_hex( n )
      r = "%0X" % n
      r = "0#{r}" unless (r.length % 2).zero?
      r
    end
    
    ###
    # UCS4 is just a straight hex conversion of the unicode codepoint.
    
    def to_ucs4( range )
      rangestr  =   "0x" + to_hex(range.begin)
      rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
      [ rangestr ]
    end
    
    ##
    # 0x00     - 0x7f     -> 0zzzzzzz[7]
    # 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
    # 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
    # 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
    
    UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
    
    def to_utf8_enc( n )
      r = 0
      if n <= 0x7f
        r = n
      elsif n <= 0x7ff
        y = 0xc0 | (n >> 6)
        z = 0x80 | (n & 0x3f)
        r = y << 8 | z
      elsif n <= 0xffff
        x = 0xe0 | (n >> 12)
        y = 0x80 | (n >>  6) & 0x3f
        z = 0x80 |  n        & 0x3f
        r = x << 16 | y << 8 | z
      elsif n <= 0x10ffff
        w = 0xf0 | (n >> 18)
        x = 0x80 | (n >> 12) & 0x3f
        y = 0x80 | (n >>  6) & 0x3f
        z = 0x80 |  n        & 0x3f
        r = w << 24 | x << 16 | y << 8 | z
      end
    
      to_hex(r)
    end
    
    def from_utf8_enc( n )
      n = n.hex
      r = 0
      if n <= 0x7f
        r = n
      elsif n <= 0xdfff
        y = (n >> 8) & 0x1f
        z =  n       & 0x3f
        r = y << 6 | z
      elsif n <= 0xefffff
        x = (n >> 16) & 0x0f
        y = (n >>  8) & 0x3f
        z =  n        & 0x3f
        r = x << 10 | y << 6 | z
      elsif n <= 0xf7ffffff
        w = (n >> 24) & 0x07
        x = (n >> 16) & 0x3f
        y = (n >>  8) & 0x3f
        z =  n        & 0x3f
        r = w << 18 | x << 12 | y << 6 | z
      end
      r
    end
    
    ###
    # Given a range, splits it up into ranges that can be continuously
    # encoded into utf8.  Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
    # This is not strictly needed since the current [5.1] unicode standard
    # doesn't have ranges that straddle utf8 boundaries.  This is included
    # for completeness as there is no telling if that will ever change.
    
    def utf8_ranges( range )
      ranges = []
      UTF8_BOUNDARIES.each do |max|
        if range.begin <= max
          return ranges << range if range.end <= max
    
          ranges << range.begin .. max
          range = (max + 1) .. range.end
        end
      end
      ranges
    end
    
    def build_range( start, stop )
      size = start.size/2
      left = size - 1
      return [""] if size < 1
    
      a = start[0..1]
      b = stop[0..1]
    
      ###
      # Shared prefix
    
      if a == b
        return build_range(start[2..-1], stop[2..-1]).map do |elt|
          "0x#{a} " + elt
        end
      end
    
      ###
      # Unshared prefix, end of run
    
      return ["0x#{a}..0x#{b} "] if left.zero?
    
      ###
      # Unshared prefix, not end of run
      # Range can be 0x123456..0x56789A
      # Which is equivalent to:
      #     0x123456 .. 0x12FFFF
      #     0x130000 .. 0x55FFFF
      #     0x560000 .. 0x56789A
    
      ret = []
      ret << build_range(start, a + "FF" * left)
    
      ###
      # Only generate middle range if need be.
    
      if a.hex+1 != b.hex
        max = to_hex(b.hex - 1)
        max = "FF" if b == "FF"
        ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
      end
    
      ###
      # Don't generate last range if it is covered by first range
    
      ret << build_range(b + "00" * left, stop) unless b == "FF"
      ret.flatten!
    end
    
    def to_utf8( range )
      utf8_ranges( range ).map do |r|
        build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
      end.flatten!
    end
    
    ##
    # Perform a 3-way comparison of the number of codepoints advertised by
    # the unicode spec for the given range, the originally parsed range,
    # and the resulting utf8 encoded range.
    
    def count_codepoints( code )
      code.split(' ').inject(1) do |acc, elt|
        if elt =~ /0x(.+)\.\.0x(.+)/
          if @encoding == :utf8
            acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
          else
            acc * ($2.hex - $1.hex + 1)
          end
        else
          acc
        end
      end
    end
    
    def is_valid?( range, desc, codes )
      spec_count  = 1
      spec_count  = $1.to_i if desc =~ /\[(\d+)\]/
      range_count = range.end - range.begin + 1
    
      sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
      sum == spec_count and sum == range_count
    end
    
    ##
    # Generate the state maching to stdout
    
    def generate_machine( name, property )
      pipe = " "
      puts "    #{name} = "
      each_alpha( CHART_URL, property ) do |range, desc|
    
        codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
    
        raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
          is_valid? range, desc, codes
    
        range_width = codes.map { |a| a.size }.max
        range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
    
        desc_width  = TOTAL_WIDTH - RANGE_WIDTH - 11
        desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
    
        if desc.size > desc_width
          desc = desc[0..desc_width - 4] + "..."
        end
    
        codes.each_with_index do |r, idx|
          desc = "" unless idx.zero?
          code = "%-#{range_width}s" % r
          puts "      #{pipe} #{code} ##{desc}"
          pipe = "|"
        end
      end
      puts "      ;"
      puts ""
    end
    
    puts <<EOF
    # The following Ragel file was autogenerated from: #{CHART_URL}
    #
    # It defines ualpha, udigit, ualnum.
    #
    # To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
    # and that your input is in #{@encoding}.
    
    %%{
        machine WChar;
    EOF
    generate_machine( :uUppercaseLetter, "Lu" )
    generate_machine( :uLowercaseLetter, "Ll" )
    generate_machine( :uTitlecaseLetter, "Lt" )
    generate_machine( :uModifierLetter, "Lm" )
    generate_machine( :uOtherLetter, "Lo" )
    generate_machine( :uNonspacingMark, "Mn" )
    generate_machine( :uEnclosingMark, "Me" )
    generate_machine( :uSpacingMark, "Mc" )
    generate_machine( :uDecimalNumber, "Nd" )
    generate_machine( :uLetterNumber, "Nl" )
    generate_machine( :uOtherNumber, "No" )
    generate_machine( :uSpaceSeparator, "Zs" )
    generate_machine( :uLineSeparator, "Zl" )
    generate_machine( :uParagraphSeparator, "Zp" )
    generate_machine( :uFormat, "Cf" )
    generate_machine( :uPrivateUse, "Co" )
    generate_machine( :uSurrogate, "Cs" )
    generate_machine( :uDashPunctuation, "Pd" )
    generate_machine( :uOpenPunctuation, "Ps" )
    generate_machine( :uClosePunctuation, "Pe" )
    generate_machine( :uConnectorPunctuation, "Pc" )
    generate_machine( :uOtherPunctuation, "Po" )
    generate_machine( :uMathSymbol, "Sm" )
    generate_machine( :uCurrencySymbol, "Sc" )
    generate_machine( :uModifierSymbol, "Sk" )
    generate_machine( :uOtherSymbol, "So" )
    generate_machine( :uInitialPunctuation, "Pi" )
    generate_machine( :uFinalPunctuation, "Pf" )
    puts <<EOF
    }%%
    EOF
    

    Then in your ragel machine files you can include unicode.rl and get access to each of the unicode groups defined e.g uUppercaseLetter and so on...