Search code examples
c#, xpath, xpathnavigator

Parse XPath Expressions


I am trying to create an 'AET' (Abstract Expression Tree) for XPath (as I am writing a WYSIWYG XSL editor). I have been banging my head against the wall with the XPath BNF for the past three to four hours.

I have thought of another solution. I thought I could write a class that implements IXPathNavigable, which returns an XPathNavigator of my own when CreateNavigator is called. This XPathNavigator would always succeed on any method call, and would keep track of those calls - e.g. we moved to the customers node and then the customer node. I could then use this information (hopefully) to create the 'AET' (so we would now have customers/customer in an object model).

The only question is: how on earth do I run an IXPathNavigable through an XPathExpression?

I know this is excessively lazy. But has anyone else gone through the effort and written an XPath expression parser? I haven't yet POC'd my possible solution, because I can't test it (because I can't run the XPathExpression against an IXPathNavigable), so I don't even know if my solution will work.


Solution

  • There is an ANTLR XPath grammar here. Since its license permits, I have copied the whole grammar below to avoid link rot in the future.

    grammar xpath;
    
    /*
    XPath 1.0 grammar. Should conform to the official spec at
    http://www.w3.org/TR/1999/REC-xpath-19991116. The grammar
    rules have been kept as close as possible to those in the
    spec, but some adjustments were unavoidable. These were
    mainly removing left recursion (spec seems to be based on
    LR), and to deal with the double nature of the '*' token
    (node wildcard and multiplication operator). See also
    section 3.7 in the spec. These rule changes should make
    no difference to the strings accepted by the grammar.
    Written by Jan-Willem van den Broek
    Version 1.0
    Do with this code as you will.
    */
    /*
        Ported to Antlr4 by Tom Everett <[email protected]>
    */
    
    
    // Entry rule: one complete XPath 1.0 expression.
    // EOF is required so that the whole input has to be consumed; without it
    // the parser may stop after a valid prefix and leave trailing tokens
    // unreported.
    main
      :  expr EOF
      ;
    
    // A location path is either relative or anchored at the document root.
    locationPath
      :  relativeLocationPath
      |  absoluteLocationPathNoroot
      ;

    // Root-anchored path with at least one step. Both alternatives require a
    // relativeLocationPath, so a bare '/' is never matched by this rule.
    absoluteLocationPathNoroot
      :  PATHSEP relativeLocationPath
      |  ABRPATH relativeLocationPath
      ;

    // One or more steps joined by '/' or '//'.
    relativeLocationPath
      :  step ((PATHSEP | ABRPATH) step)*
      ;

    // A full step (axis, node test, any number of predicates) or one of the
    // abbreviated steps.
    step
      :  axisSpecifier nodeTest predicate*
      |  abbreviatedStep
      ;

    // Explicit axis followed by '::', or an optional '@'. The second
    // alternative can match nothing at all.
    axisSpecifier
      :  AxisName CC
      |  AT?
      ;
    
    // Node test: a name test, a node-type test such as text(), or a
    // processing-instruction test with an optional target literal.
    //
    // The literal 'processing-instruction' used in the last alternative is an
    // implicit token of this combined grammar, and ANTLR 4 ranks implicit
    // literal tokens ahead of the named lexer rules. That keyword therefore
    // never reaches the parser as NodeType, so the Literal is optional here to
    // let the argument-less form processing-instruction() parse as well.
    nodeTest
      :  nameTest
      |  NodeType '(' ')'
      |  'processing-instruction' '(' Literal? ')'
      ;
    
    // Bracketed expression.
    predicate
      :  LBRAC expr RBRAC
      ;

    // The '.' and '..' shorthands.
    abbreviatedStep
      :  DOT
      |  DOTDOT
      ;

    // Top of the expression hierarchy; delegates to the 'or' level.
    expr
      :  orExpr
      ;

    // Atomic operands: variable, parenthesised expression, string literal,
    // number, or function call.
    primaryExpr
      :  variableReference
      |  LPAR expr RPAR
      |  Literal
      |  Number
      |  functionCall
      ;

    // Function name followed by a parenthesised, comma-separated argument
    // list that may be empty.
    functionCall
      :  functionName LPAR (expr (COMMA expr)*)? RPAR
      ;
    
    // Union of path expressions joined by '|'. The second alternative lets a
    // bare root '/' appear as the left operand of a union.
    unionExprNoRoot
      :  pathExprNoRoot (PIPE unionExprNoRoot)?
      |  PATHSEP PIPE unionExprNoRoot
      ;

    // A location path, or a filter expression optionally continued by a
    // relative location path.
    pathExprNoRoot
      :  locationPath
      |  filterExpr ((PATHSEP | ABRPATH) relativeLocationPath)?
      ;

    // A primary expression followed by any number of predicates.
    filterExpr
      :  primaryExpr predicate*
      ;
    
    // Operator levels, from loosest to tightest binding.

    orExpr
      :  andExpr ('or' andExpr)*
      ;

    andExpr
      :  equalityExpr ('and' equalityExpr)*
      ;

    equalityExpr
      :  relationalExpr (('=' | '!=') relationalExpr)*
      ;

    relationalExpr
      :  additiveExpr ((LESS | MORE_ | LE | GE) additiveExpr)*
      ;

    additiveExpr
      :  multiplicativeExpr ((PLUS | MINUS) multiplicativeExpr)*
      ;

    // The right operand recurses into this same rule, so a chain such as
    // "a div b div c" nests to the right in the parse tree.
    // NOTE(review): XPath 1.0 defines these operators as left-associative;
    // confirm that tree consumers re-associate before evaluating.
    // The second alternative covers a bare root '/' as an operand; it may only
    // be followed by 'div' or 'mod', not by '*'.
    multiplicativeExpr
      :  unaryExprNoRoot ((MUL | 'div' | 'mod') multiplicativeExpr)?
      |  PATHSEP (('div' | 'mod') multiplicativeExpr)?
      ;

    // Any number of leading unary minus signs in front of a union expression.
    unaryExprNoRoot
      :  MINUS* unionExprNoRoot
      ;
    
    // Optionally prefixed name: local, or prefix ':' local.
    qName
      :  nCName (COLON nCName)?
      ;

    functionName
      :  qName  // Does not match nodeType, as per spec.
      ;

    // '$' followed by a qualified name.
    variableReference
      :  '$' qName
      ;

    // Wildcard, prefixed wildcard, or a concrete qualified name.
    nameTest
      :  MUL
      |  nCName COLON MUL
      |  qName
      ;
    
    // Parser-level name. The lexer emits dedicated tokens for axis names and
    // for the operator keywords that the expression rules use as literals
    // ('and', 'or', 'div', 'mod'); those win over NCName for the same text.
    // XPath 1.0 (section 3.7) still allows such words as ordinary names, e.g.
    // the element name in "//div", so they are accepted here alongside NCName.
    nCName
      :  NCName
      |  AxisName
      |  'and'
      |  'or'
      |  'div'
      |  'mod'
      ;
    
    // Node-type keywords.
    NodeType
      :  'comment'
      |  'text'
      |  'processing-instruction'
      |  'node'
      ;

    // Digits with an optional fractional part, or a leading '.' followed by
    // digits.
    Number
      :  Digits ('.' Digits?)?
      |  '.' Digits
      ;

    // One or more ASCII decimal digits.
    fragment Digits
      :  [0-9]+
      ;

    // The thirteen axis keywords.
    AxisName
      :  'ancestor'
      |  'ancestor-or-self'
      |  'attribute'
      |  'child'
      |  'descendant'
      |  'descendant-or-self'
      |  'following'
      |  'following-sibling'
      |  'namespace'
      |  'parent'
      |  'preceding'
      |  'preceding-sibling'
      |  'self'
      ;
    
    
      // Named tokens for punctuation and operators, one per line.
      PATHSEP : '/' ;
      ABRPATH : '//' ;
      LPAR    : '(' ;
      RPAR    : ')' ;
      LBRAC   : '[' ;
      RBRAC   : ']' ;
      MINUS   : '-' ;
      PLUS    : '+' ;
      DOT     : '.' ;
      MUL     : '*' ;
      DOTDOT  : '..' ;
      AT      : '@' ;
      COMMA   : ',' ;
      PIPE    : '|' ;
      LESS    : '<' ;
      MORE_   : '>' ;
      LE      : '<=' ;
      GE      : '>=' ;
      COLON   : ':' ;
      CC      : '::' ;
      APOS    : '\'' ;
      QUOT    : '\"' ;
      
    // String literal in double or single quotes. There is no escape
    // mechanism: the body is any run of characters other than the delimiter.
    Literal
      :  '"' ~'"'* '"'
      |  '\'' ~'\''* '\''
      ;

    // Spaces, tabs and line breaks are dropped by the lexer.
    Whitespace
      :  [ \t\n\r]+ -> skip
      ;

    // A start character followed by any number of name characters.
    NCName
      :  NCNameStartChar NCNameChar*
      ;

    // Characters allowed at the start of an NCName.
    fragment NCNameStartChar
      :  [A-Z]
      |  '_'
      |  [a-z]
      |  '\u00C0'..'\u00D6'
      |  '\u00D8'..'\u00F6'
      |  '\u00F8'..'\u02FF'
      |  '\u0370'..'\u037D'
      |  '\u037F'..'\u1FFF'
      |  '\u200C'..'\u200D'
      |  '\u2070'..'\u218F'
      |  '\u2C00'..'\u2FEF'
      |  '\u3001'..'\uD7FF'
      |  '\uF900'..'\uFDCF'
      |  '\uFDF0'..'\uFFFD'
    // TODO: the supplementary range is left out because Java-style escapes
    // are limited to 4 hex digits and cannot express it conveniently.
    //  |  '\U010000'..'\U0EFFFF'
      ;

    // Characters allowed after the first one: every start character plus
    // '-', '.', ASCII digits and a few combining/extender ranges.
    fragment NCNameChar
      :  NCNameStartChar
      |  '-'
      |  '.'
      |  [0-9]
      |  '\u00B7'
      |  '\u0300'..'\u036F'
      |  '\u203F'..'\u2040'
      ;