Search code examples
javascriptparsingcompiler-constructionabstract-syntax-treepeg

Distinguish operator precedence when traversing AST


I have an AST generated by a parsing expression grammar from a target language that will compile to a source language by traversing its nodes. A simple source like (10 + 20) * 2 should generate the following representation, as a native ECMAScript object:

var ast = {
   "type": "Stmt",
   "body": [
      {
         "type": "Expr",
         "expression": {
            "type": "BinaryExpr",
            "operator": "*",
            "left": {
               "type": "BinaryExpr",
               "operator": "+",
               "left": {
                  "type": "Literal",
                  "value": 10
               },
               "right": {
                  "type": "Literal",
                  "value": 20
               }
            },
            "right": {
               "type": "Literal",
               "value": 2
            }
         }
      }
   ]
};

The generated object clearly defines the precedence of the operators, and evaluating this source is pretty easy, however, generating code back from it is a pretty complex task when you have to deal with the parenthesis solving.

When generating code by traversing nodes, the precedence is completely lost. I have a function called visitor, which is the entry point of the program:

function visitor(node) {
  switch (node.type) {
    case "Stmt":
      return parseStmt(node.body);
  }
}

This simple grammar can accept multiple statements...

function parseStmt(body) {
  var stmtList = Array(body.length);

  for (var i = 0, len = body.length; i < len; i++) {
    stmtList[i] = (function(stmt) {
      switch (stmt.type) {
        case "Expr":
          return parseExpr(stmt.expression);
      }
    })(body[i]);
  }

  return stmtList.join(";\n");
}

... and two types of expressions:

function parseExpr(expr) {
  switch (expr.type) {
    case "BinaryExpr":
      return parseBinaryExpr(expr);
    case "Literal":
      return parseLiteral(expr);
  }
}

Where Literal just deals with string transformation...

function parseLiteral(expr) {
  return expr.value.toString();
}

... and BinaryExpr is ambiguous when solving precedence:

function parseBinaryExpr(expr) {
  var op = {
    left: parseExpr(expr.left),
    right: parseExpr(expr.right)
  };

  switch (expr.operator) {
    case "+":
      return Codegen.OP_ADD(op.left, op.right);
    case "*":
      return Codegen.OP_MUL(op.left, op.right);
  }
}

Only two math operations are here supported, and the code generation happens here:

var Codegen = {
  OP_ADD: function(left, right) {
    return left + " + " + right;
  },
  OP_MUL: function(left, right) {
    return left + " * " + right;
  }
};

When calling visitor(ast) back, I get 10 + 20 * 2, which would eval to 10 + (20 * 2) instead of (10 + 20) * 2, and inserting parenthesis in each side of the binary expression would be a ridiculous workaround: (10 + 20) * 2 where:

function parseBinaryExpr(expr) {
  var op = {
    left: "(" + parseExpr(expr.left) + ")",
    right: "(" + parseExpr(expr.right) + ")"
  };
...

How could this ambiguity be solved in a good way?


Solution

  • Wouldn't a simple precedence table and looking up at the parent expression solve it?

    Also, there were a little bug in the switch.

    var ast = {
       "type": "Stmt",
       "body": [
          {
             "type": "Expr",
             "expression": {
                "type": "BinaryExpr",
                "operator": "*",
                "left": {
                   "type": "BinaryExpr",
                   "operator": "+",
                   "left": {
                      "type": "Literal",
                      "value": 10
                   },
                   "right": {
                      "type": "Literal",
                      "value": 20
                   }
                },
                "right": {
                   "type": "Literal",
                   "value": 2
                }
             }
          }
       ]
    };
    
    var precedence = { "*": 0, "+": 1 };
    
    var Codegen = {
      OP_ADD: function(left, right) {
        return left + " + " + right;
      },
      OP_MUL: function(left, right) {
        return left + " * " + right;
      }
    };
    
    function visitor(node) {
      switch (node.type) {
        case "Stmt":
          return parseStmt(node.body);
      }
    }
    
    function parseStmt(body) {
      var stmtList = Array(body.length);
    
      for (var i = 0, len = body.length; i < len; i++) {
        stmtList[i] = (function(stmt) {
          switch (stmt.type) {
            case "Expr":
              return parseExpr(stmt.expression, null);
          }
        })(body[i]);
      }
    
      return stmtList.join(";\n");
    }
    
    function parseExpr(expr, parent) {
      switch (expr.type) {
        case "BinaryExpr":
          return parseBinaryExpr(expr, parent);
        case "Literal":
          return parseLiteral(expr);
      }
    }
    
    function parseLiteral(expr) {
      return expr.value.toString();
    }
    
    function parseBinaryExpr(expr, parent) {
      var op = {
        left: parseExpr(expr.left, expr),
        right: parseExpr(expr.right, expr)
      };
      var ret = "";
      switch (expr.operator) {
        case "+":
          ret = Codegen.OP_ADD(op.left, op.right); 
          break;
        case "*":
          ret = Codegen.OP_MUL(op.left, op.right); 
          break;
      }
      if (parent && precedence[expr.operator] > precedence[parent.operator]) {
        ret = "(" + ret + ")";
      }
      return ret;
    }
    
    visitor(ast);
    

    Or you can always put a parenthesis if there's a nested binary expression inside another, that would do the trick too.

      if (parent) {
        ret = "(" + ret + ")";
      }
    

    Just check parent because we only pass the parent if we already are inside a binary expression.