parsing compiler-construction antlr grammar language-design

Capturing a group with a generic SQL dialect

I would like to define an expression of the grammar:

=expr + #native(...)

expr can be any valid expression in our grammar, and this currently works. However, #native(...) is a valid SQL statement in a target (potentially multiple) dialects, meaning we do not control it -- it could be in SQL Server, in Oracle, in MySQL, in Postgres, etc. We basically just want to "capture" what has been entered inside that construct. However, we run into an issue I believe which makes it ambiguous:

How do we know when the ) character terminates the native expression? We can parse a string such as '...' or "..." to ignore any characters inside that, but then it may also be ambiguous what the escape character is -- sometimes it may be a double of the qute-char, such as "He said ""hello"".", or it could be a backslash, such as "He said \"Hello\".". In other words, it's difficult in the general sense to know when a value is quoted or potentially commented/query-hinted, ec.

Other than doing the expression as a string literal, such as =expr + #native("...") with out own defined escape character, is there any other relatively-elegant way to handle this potential ambiguity?

Idea: perhaps for each target dialect we store the Quote, Comment, and Escape chars for acceptable SQL dialects? For example, a map such as:

{
    "postgres": {
        "comments": [
            "--"
        ],
        "blocks": [
           {"start": "/*","end": "*/"}
        ],
        "quotes": [
           {"value": "'","escape": "''"},
           {"value": "\"","escape": "\"\""}
        ]
    },
    "mysql": {
        "comments": [
            "#"
        ],
        "blocks": [
           {"start": "/*","end": "*/"}
        ],
        "quotes": [
            {"value": "'","escape": "\\"},
            {"value": "\"","escape": "\\"},
            {"value": "`",escape": "``"}
        ]
    }
}

And change the native to the dialect, such as =expr + #postgres(...). For example:

=1 + #postgres(SELECT (ARRAY[(2),'-3' /*)(*/, (((3)))])[1])
--> 3

Does that sound like a valid and good approach? Finally, if the lexing itself is dependent on characters, how would I 'extract' that #native(...) component? Would I need to add a pre-processor to the lexer or something? (I'm currently using ANTLR for the lexing/parsing).

Solution

You could do this in the lexer by adding some custom code and using the more advanced features:

The gist of it is this:

create some domain models that represent your SQL dialects containing the data about what escape/block chars are
when you encounter a #name( in the lexer, enter into a "native" mode
inside the "native" mode, check if you need to enter other modes (like a "block" mode or "quote" mode) or if you need to return to the default mode (needing to pop out of the "native" mode)

A small Java demo:

// DynamicLexer.g4
lexer grammar DynamicLexer;

@members {
  private java.util.Map<String, SqlDialect> dialects;
  private String dialect = null;
  private Block block = null;
  private Quote quote = null;

  public DynamicLexer(java.util.Map<String, SqlDialect> dialects, CharStream input) {
    this(input);
    this.dialects = dialects;
  }

  private void setDialect(String token) {
    this.dialect = token.replaceAll("[#(]", "");
  }

  private SqlDialect getDialect() {
    SqlDialect sqlDialect = this.dialects.get(this.dialect);
    if (sqlDialect == null) {
      throw new RuntimeException("Unknown dialect: '" + this.dialect + "'");
    }
    return sqlDialect;
  }

  private boolean blockStartAhead() {
    SqlDialect sqlDialect = this.getDialect();
    for (Block b : sqlDialect.blocks) {
      if (this.ahead(b.start)) {
        this.consume(b.start);
        this.block = b;
        return true;
      }
    }
    return false;
  }

  private boolean blockEndAhead() {
    if (this.ahead(this.block.end)) {
      this.consume(this.block.end);
      return true;
    }
    return false;
  }

  private boolean quoteStartAhead() {
    SqlDialect sqlDialect = this.getDialect();
    for (Quote q : sqlDialect.quotes) {
      if (this.ahead(q.start)) {
        this.consume(q.start);
        this.quote = q;
        return true;
      }
    }
    return false;
  }

  private boolean quoteEndAhead() {
    if (this.ahead(this.quote.start)) {
      this.consume(this.quote.start);
      return true;
    }
    return false;
  }

  private boolean quoteEscapeAhead(boolean consume) {
    if (this.ahead(this.quote.escape)) {
      if (consume) {
        this.consume(this.quote.escape);
      }
      return true;
    }
    return false;
  }

  private boolean ahead(String text) {
    for (int i = 1; i <= text.length(); i++) {
      if (this._input.LA(i) != text.charAt(i - 1)) {
        return false;
      }
    }
    return true;
  }

  private void consume(String text) {
    for (int i = 1; i < text.length(); i++) {
      this._input.consume();
    }
  }
}

SPACE  : [ \t\r\n] -> skip;
EQUAL  : '=';
ADD    : '+';
INT    : [0-9]+;
NATIVE : '#' [a-zA-Z]+ '(' {setDialect(getText());} -> pushMode(NATIVE_MODE);

mode NATIVE_MODE;

BLOCK_START : {blockStartAhead()}? . -> pushMode(BLOCK_MODE);
QUOTE_START : {quoteStartAhead()}? . -> pushMode(QUOTE_MODE);
LPAR        : ')' -> popMode;
RPAR        : '(' -> pushMode(NATIVE_MODE);
NATIVE_ATOM : [a-zA-Z0-9]+ | ~[a-zA-Z0-9];

mode BLOCK_MODE;

BLOCK_END  : {blockEndAhead()}? . -> popMode;
BLOCK_ATOM : . ;

mode QUOTE_MODE;

ESCAPE     : {quoteEscapeAhead(true)}? .;
QUOTE_END  : {!quoteEscapeAhead(false) && quoteEndAhead()}? . -> popMode;
QUOTE_ATOM : .;

The lexer above can be used by the parser:

// DynamicParser.g4
parser grammar DynamicParser;

options {
  tokenVocab=DynamicLexer;
}

parse
 : EQUAL expr EOF
 ;

expr
 : expr ADD expr
 | native
 | INT
 ;

native
 : NATIVE native_atom* LPAR
 ;

native_atom
 : NATIVE_ATOM
 | LPAR
 | RPAR
 | native_block
 | native_quote
 ;

native_block
 : BLOCK_START BLOCK_ATOM* BLOCK_END
 ;

native_quote
 : QUOTE_START ( ESCAPE | QUOTE_ATOM )* QUOTE_END
 ;

After generating the lexer and parser classes, test it with the following class:

public class Main {

    public static void main(String[] args) {

        Map<String, SqlDialect> dialects = new HashMap<>(){{
            put("postgres", new SqlDialect("--",
                    new Block[]{ new Block("/*", "*/") },
                    new Quote[]{ new Quote("'", "''"), new Quote("\"", "\"\"") }));
            put("mysql", new SqlDialect("#",
                    new Block[]{ new Block("/*", "*/") },
                    new Quote[]{ new Quote("'", "\\'"), new Quote("\"", "\\\""), new Quote("`", "```") }));
        }};

        String source = "=1 + #postgres(SELECT (ARRAY[(2),'-3' /*)(*/, (((3)))])[1])";
        DynamicLexer lexer = new DynamicLexer(dialects, CharStreams.fromString(source));
        CommonTokenStream stream = new CommonTokenStream(lexer);
        stream.fill();

        for (Token t : stream.getTokens()) {
            System.out.printf("%-20s '%s'%n",
                    DynamicLexer.VOCABULARY.getSymbolicName(t.getType()),
                    t.getText().replace("\n", "\\n"));
        }

        lexer = new DynamicLexer(dialects, CharStreams.fromString(source));
        DynamicParser parser = new DynamicParser(new CommonTokenStream(lexer));
        ParseTree root = parser.parse();

        System.out.println(root.toStringTree(parser));
    }
}

class SqlDialect {

    public final String commentStart;
    public final List<Block> blocks;
    public final List<Quote> quotes;

    public SqlDialect(String commentStart, Block[] blocks, Quote[] quotes) {
        this.commentStart = commentStart;
        this.blocks = Arrays.asList(blocks);
        this.quotes = Arrays.asList(quotes);
    }
}

class Block {
    public final String start;
    public final String end;

    public Block(String start, String end) {
        this.start = start;
        this.end = end;
    }
}

class Quote {
    public final String start;
    public final String escape;

    public Quote(String start, String escape) {
        this.start = start;
        this.escape = escape;
    }
}

After running the Main class, you will see the following being printed to your console:

EQUAL                '='
INT                  '1'
ADD                  '+'
NATIVE               '#postgres('
NATIVE_ATOM          'SELECT'
NATIVE_ATOM          ' '
RPAR                 '('
NATIVE_ATOM          'ARRAY'
NATIVE_ATOM          '['
RPAR                 '('
NATIVE_ATOM          '2'
LPAR                 ')'
NATIVE_ATOM          ','
QUOTE_START          '''
QUOTE_ATOM           '-'
QUOTE_ATOM           '3'
QUOTE_END            '''
NATIVE_ATOM          ' '
BLOCK_START          '/*'
BLOCK_ATOM           ')'
BLOCK_ATOM           '('
BLOCK_END            '*/'
NATIVE_ATOM          ','
NATIVE_ATOM          ' '
RPAR                 '('
RPAR                 '('
RPAR                 '('
NATIVE_ATOM          '3'
LPAR                 ')'
LPAR                 ')'
LPAR                 ')'
NATIVE_ATOM          ']'
LPAR                 ')'
NATIVE_ATOM          '['
NATIVE_ATOM          '1'
NATIVE_ATOM          ']'
LPAR                 ')'
EOF                  '<EOF>'

 (parse =
   (expr
     (expr 1)
     +
     (expr
       (native #postgres(
         (native_atom SELECT)
         (native_atom  )
         (native_atom ()
         (native_atom ARRAY)
         (native_atom [)
         (native_atom ()
         (native_atom 2)
         (native_atom ))
         (native_atom ,)
         (native_atom
           (native_quote ' - 3 '))
         (native_atom  )
         (native_atom
           (native_block /* ) ( */))
         (native_atom ,)
         (native_atom  )
         (native_atom ()
         (native_atom ()
         (native_atom ()
         (native_atom 3)
         (native_atom ))
         (native_atom ))
         (native_atom ))
         (native_atom ])
         (native_atom ))
         (native_atom [)
         (native_atom 1)
         (native_atom ]) ))))
   <EOF>)

(I manually indented the parse tree, it will be a single line when you run the Java code)