Search code examples
parsing, antlr, antlr4, hl7, hl7-v2

Variable tokens in ANTLR grammar


I am attempting to create an ANTLR grammar for an HL7-derived language. HL7 has a feature whereby all the delimiters in a message are declared by the first few bytes of the input itself. For example, MSH|^~\& specifies the various delimiters in this order: field separator |, component separator ^, repetition separator ~, escape character \, and subcomponent separator &.

Can an ANTLR grammar be produced that does not hardcode these tokens?


Solution

  • As hinted by Kaby76 in the comments: yes, it is possible with some predicate voodoo:

    lexer grammar HL7Lexer;
    
    @members {
      // Delimiters taken from the text of the most recent MSH token.
      // They are plain fields so the rule predicates can compare against them.
      private char fieldSeparator;
      private char componentSeparator;
      private char repetitionSeparator;
      private char escapeSeparator;
      private char subcomponentSeparator;
      // Stays false until setEncodingChars has run at least once.
      private boolean separatorsInitialised = false;
    
      /**
       * Records the five delimiter characters found at offsets 3..7 of the
       * given text, in the order field, component, repetition, escape,
       * subcomponent, and marks the separators as initialised.
       *
       * @param chars text of an MSH token ("MSH" followed by five characters)
       */
      private void setEncodingChars(String chars) {
        // The delimiters start right after the three-letter "MSH" prefix.
        final int first = 3;
    
        fieldSeparator = chars.charAt(first);
        componentSeparator = chars.charAt(first + 1);
        repetitionSeparator = chars.charAt(first + 2);
        escapeSeparator = chars.charAt(first + 3);
        subcomponentSeparator = chars.charAt(first + 4);
        separatorsInitialised = true;
      }
    
      /**
       * Tells whether the next input character is one of the five recorded
       * delimiters. Always answers true while no delimiters have been
       * recorded yet, which keeps OTHER from matching before a header.
       *
       * @return true if uninitialised or if the next character is a delimiter
       */
      private boolean isEncodingCharAhead() {
        if (separatorsInitialised) {
          final char next = (char) _input.LA(1);
          final char[] delimiters = {
            fieldSeparator, componentSeparator, repetitionSeparator,
            escapeSeparator, subcomponentSeparator
          };
    
          for (char delimiter : delimiters) {
            if (next == delimiter) {
              return true;
            }
          }
          return false;
        }
    
        return true;
      }
    }
    
    // Header token: the literal segment ID "MSH" followed by the five
    // delimiter characters the message declares for itself. The action
    // hands the matched text to setEncodingChars so later rules can use them.
    //
    // The leading predicate lets this rule fire only for the first header, or
    // when the previous character is a segment terminator (CR or LF) or the
    // MLLP start-of-block byte (0x0B). Without that guard, field data that
    // merely begins with "MSH" would win over OTHER by longest match and
    // silently overwrite the delimiters in the middle of a message.
    MSH
     : {!this.separatorsInitialised
          || this._input.LA(-1) == '\r'
          || this._input.LA(-1) == '\n'
          || this._input.LA(-1) == 0x0B}?
       'MSH' . . . . . {this.setEncodingChars(getText());}
     ;
    
    // One character, accepted only when it equals the recorded field separator.
    FIELD_SEP
     : {fieldSeparator == _input.LA(1)}? .
     ;
    
    // One character, accepted only when it equals the recorded component separator.
    COMPONENT_SEP
     : {componentSeparator == _input.LA(1)}? .
     ;
    
    // One character, accepted only when it equals the recorded repetition separator.
    REPETITION_SEP
     : {repetitionSeparator == _input.LA(1)}? .
     ;
    
    // One character, accepted only when it equals the recorded escape character.
    ESCAPE_SEP
     : {escapeSeparator == _input.LA(1)}? .
     ;
    
    // One character, accepted only when it equals the recorded subcomponent separator.
    SUBCOMPONENT_SEP
     : {subcomponentSeparator == _input.LA(1)}? .
     ;
    
    // A run of one or more characters; each character is admitted only while
    // isEncodingCharAhead() answers false for it, so the run stops at a delimiter.
    OTHER
     : (
         {!isEncodingCharAhead()}? .
       )+
     ;
    

    When testing this lexer grammar with the input MSH|^~\&|ADT1|GOOD HEALTH HOSPITAL|GHH LAB, INC.|GOOD HEALTH HOSPITAL|198808181126|SECURITY|ADT^A01^ADT_A01|MSG00001|P|2.8||:

    // Sample message whose header declares the delimiters | ^ ~ \ &
    String message = "MSH|^~\\&|ADT1|GOOD HEALTH HOSPITAL|GHH LAB, INC.|GOOD HEALTH HOSPITAL|198808181126|SECURITY|ADT^A01^ADT_A01|MSG00001|P|2.8||";
    
    // Wire the lexer straight into a token stream and load every token up front.
    CommonTokenStream tokens =
        new CommonTokenStream(new HL7Lexer(CharStreams.fromString(message)));
    tokens.fill();
    
    // Print one line per token: its symbolic type name, then its text with
    // newlines made visible.
    for (Token token : tokens.getTokens()) {
        String typeName = HL7Lexer.VOCABULARY.getSymbolicName(token.getType());
        String shownText = token.getText().replace("\n", "\\n");
        System.out.printf("%-20s '%s'\n", typeName, shownText);
    }
    

    the following tokens are created:

    MSH                  'MSH|^~\&'
    FIELD_SEP            '|'
    OTHER                'ADT1'
    FIELD_SEP            '|'
    OTHER                'GOOD HEALTH HOSPITAL'
    FIELD_SEP            '|'
    OTHER                'GHH LAB, INC.'
    FIELD_SEP            '|'
    OTHER                'GOOD HEALTH HOSPITAL'
    FIELD_SEP            '|'
    OTHER                '198808181126'
    FIELD_SEP            '|'
    OTHER                'SECURITY'
    FIELD_SEP            '|'
    OTHER                'ADT'
    COMPONENT_SEP        '^'
    OTHER                'A01'
    COMPONENT_SEP        '^'
    OTHER                'ADT_A01'
    FIELD_SEP            '|'
    OTHER                'MSG00001'
    FIELD_SEP            '|'
    OTHER                'P'
    FIELD_SEP            '|'
    OTHER                '2.8'
    FIELD_SEP            '|'
    FIELD_SEP            '|'
    EOF                  '<EOF>'