Search code examples
pythonparsingtreetokenantlr4

antlr4 python 3 print or dump tokens from plsql grammar


I am using antlr4 in Python, to read the following grammar :

https://github.com/antlr/grammars-v4/tree/master/plsql

The file grants.sql contains just "begin select 'bob' from dual; end;"

Here is some simple code that prints a Lisp-like tree:

from antlr4 import *
from PlSqlLexer import PlSqlLexer
from PlSqlParser import PlSqlParser
from PlSqlParserListener import PlSqlParserListener

# Build the lexer -> token stream -> parser pipeline over the SQL file.
char_stream = FileStream('grants.sql')
token_stream = CommonTokenStream(PlSqlLexer(char_stream))
parser = PlSqlParser(token_stream)

# Parse starting at the `sql_script` rule and print the Lisp-style tree.
tree = parser.sql_script()
lisp_tree = tree.toStringTree(recog=parser)
print("Tree " + lisp_tree)

Output is thus:

Tree (sql_script (unit_statement (anonymous_block BEGIN (seq_of_statements (statement (sql_statement (data_manipulation_language_statements (select_statement (subquery (subquery_basic_elements (query_block SELECT (selected_element (select_list_elements (expressions (expression (logical_expression (multiset_expression (relational_expression (compound_expression (concatenation (model_expression (unary_expression (atom (constant (quoted_string 'bob')))))))))))))) (from_clause FROM (table_ref_list (table_ref (table_ref_aux (table_ref_aux_internal (dml_table_expression_clause (tableview_name (identifier (id_expression (regular_id DUAL))))))))))))))))) ;) END ;)) )

I'd like to have Python code that lists the above not as a Lisp-like expression but as an indented list of all the rules and tokens, i.e.:

  1. .sql_script
    1. ..unit_statement
    2. ...anonymous_block
    3. ....BEGIN

etc etc

Can someone supply Python code that does this, or give me some hints? It would be gratefully appreciated.


Solution

  • Here's a start:

    from antlr4 import *
    from antlr4.tree.Tree import TerminalNodeImpl
    from PlSqlLexer import PlSqlLexer
    from PlSqlParser import PlSqlParser
    
    # Generate the lexer and parser like this:
    #
    #   java -jar antlr-4.7.1-complete.jar -Dlanguage=Python3 *.g4
    #
    def main():
        """Parse a hard-coded SELECT statement and print its parse tree."""
        source = InputStream("SELECT * FROM TABLE_NAME")
        tokens = CommonTokenStream(PlSqlLexer(source))
        parser = PlSqlParser(tokens)
        # Start at the `sql_script` rule, then hand the resulting tree to
        # the printer together with the parser's rule-name table.
        traverse(parser.sql_script(), parser.ruleNames)
    
    def traverse(tree, rule_names, indent = 0):
        """Print the parse tree rooted at `tree` as an indented outline.

        Terminal nodes are printed as ``TOKEN='<text>'``; every other node
        is printed as its rule name, followed by its children one level
        deeper. Any node whose text is exactly ``<EOF>`` is skipped.

        tree       -- the parse-tree node to print
        rule_names -- sequence of rule names indexed by `getRuleIndex()`
                      (e.g. `parser.ruleNames`)
        indent     -- current nesting depth; two spaces are emitted per level
        """
        if tree.getText() == "<EOF>":
            return

        prefix = "  " * indent
        if isinstance(tree, TerminalNodeImpl):
            print("{0}TOKEN='{1}'".format(prefix, tree.getText()))
            return

        print("{0}{1}".format(prefix, rule_names[tree.getRuleIndex()]))
        # A rule context that matched no tokens can have `children` set to
        # None rather than an empty list, so fall back to an empty tuple
        # instead of failing with a TypeError.
        for child in tree.children or ():
            traverse(child, rule_names, indent + 1)
    
    # Run the demo only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

    which prints:

    sql_script
      unit_statement
        data_manipulation_language_statements
          select_statement
            subquery
              subquery_basic_elements
                query_block
                  TOKEN='SELECT'
                  TOKEN='*'
                  from_clause
                    TOKEN='FROM'
                    table_ref_list
                      table_ref
                        table_ref_aux
                          table_ref_aux_internal
                            dml_table_expression_clause
                              tableview_name
                                identifier
                                  id_expression
                                    regular_id
                                      TOKEN='TABLE_NAME'
    

    Note that for the lexer and parser to work properly, I added the following Python classes:

    # PlSqlBaseLexer.py
    from antlr4 import *
    
    class PlSqlBaseLexer(Lexer):
        """Base class for the generated PlSqlLexer.

        Supplies the `IsNewlineAtPos` helper defined here for the generated
        lexer to use.
        """

        def IsNewlineAtPos(self, pos):
            """Return True if the lookahead at offset `pos` is a newline or EOF.

            pos -- offset passed straight to the input stream's `LA()`
            """
            la = self._input.LA(pos)
            # The Python runtime's LA() yields an integer code point (-1 at
            # end of input), so compare against ord('\n'). The plain string
            # comparison is kept for streams that return a character.
            return la == -1 or la == ord('\n') or la == '\n'
    

    and:

    # PlSqlBaseParser.py
    from antlr4 import *
    
    class PlSqlBaseParser(Parser):
        """Base class for the generated PlSqlParser.

        Holds two version flags with an accessor and a mutator for each.
        The method names are kept exactly as they are because code outside
        this class calls them by name.
        """

        # Class-level defaults; a setter call stores the new value on the
        # instance, leaving these defaults untouched.
        _isVersion12 = True
        _isVersion10 = False

        # --- version 10 flag ---

        def isVersion10(self):
            """Return the current value of the version-10 flag."""
            return self._isVersion10

        def setVersion10(self, value):
            """Store `value` as the version-10 flag."""
            self._isVersion10 = value

        # --- version 12 flag ---

        def isVersion12(self):
            """Return the current value of the version-12 flag."""
            return self._isVersion12

        def setVersion12(self, value):
            """Store `value` as the version-12 flag."""
            self._isVersion12 = value
    

    which I placed in the same folder as the generated Python classes. I also needed to add the import statement `from PlSqlBaseLexer import PlSqlBaseLexer` to the generated PlSqlLexer.py file, and change the import statement in PlSqlParser.py from `from ./PlSqlBaseParser import PlSqlBaseParser` to `from PlSqlBaseParser import PlSqlBaseParser`.

    Note that running the demo is rather slow. Unless you have a hard requirement to do this in Python, I recommend going with the (much!) faster Java or C# target instead.