javascript algorithm parsing abstract-syntax-tree

Convert a string with a math formula to an object tree?

I am looking for a function that converts a math string passed as an argument (with the operations +, -, /, *), that returns an object that contains pieces of the math string, in an ordered/better way, which is easier to traverse.

Characteristics of the input:

is a string containing a formula
the formula doesn't have =, so it isn't an equation
there isn't any floating number, just integers
the integers can be positive or negative
no variables like x, y, z
can include parentheses

Test cases

Example 1: Basic Math (same operation)

N	Input (string)	Output (object)
1	1	`{ values: [1], operation: null }`
2	1+1	`{ values: [1,1], operation: "+" }`
3	1+2+3	`{ values: [1,2,3], operation: "+" }`
4	3-2-1	`{ values: [3,2,1], operation: "-" }`
5	10*80	`{ values: [10,80], operation: "*" }`
6	100/10	`{ values: [100,10], operation: "/" }`

Example 2: Formula with 2 operations

➡️ + and - samples

N1

input: 1+1-1

output:

{
    values: [
      {
        values: [1, 1],
        operation: "+",
      },
      1,
    ],
    operation: "-",
};

N2

input: 3+2-1+5

output:

{
    values: [
      {
        values: [
          {
            values: [3, 2],
            operation: "+",
          },
          1,
        ],
        operation: "-",
      },
      5,
    ],
    operation: "+",
};

N3

input: 3+2-1+5+10+7

output:

{
    values: [
      {
        values: [
          {
            values: [3, 2],
            operation: "+",
          },
          1,
        ],
        operation: "-",
      },
      5,
      10,
      7
    ],
    operation: "+",
};

➡️ + and / samples

N4

input: 1+2/3

output:

{
    values: [
      1,
      {
        values: [2, 3],
        operation: "/",
      },
    ],
    operation: "+",
};

N5

input: 2/3+1

output:

{
    values: [
      {
        values: [2, 3],
        operation: "/",
      },
      1,
    ],
    operation: "+",
};

N6

input: 1/2+3/4+5/6

output:

{
    values: [
      {
        values: [1, 2],
        operation: "/",
      },
      {
        values: [3, 4],
        operation: "/",
      },
      {
        values: [5, 6],
        operation: "/",
      },
    ],
    operation: "+",
};

N7

input: 1/2/3/4/5+6+7+8/9+10/11

output:

{
    values: [
      {
        values: [1, 2, 3, 4, 5],
        operation: "/",
      },
      6,
      7,
      {
        values: [8, 9],
        operation: "/",
      },
      {
        values: [10, 11],
        operation: "/",
      },
    ],
    operation: "+",
};

➡️ / and - samples

N8

input: 1-2/3

output:

{
    values: [
      1,
      {
        values: [2, 3],
        operation: "/",
      },
    ],
    operation: "-",
};

➡️ / and * samples

N9

input: 10/2*5

output:

{
    values: [
      {
        values: [10, 2],
        operation: "/",
      },
      5,
    ],
    operation: "*",
};

Example 3: Formula with 4 operations

N1

input: 10/2*5+1-1*5/3+2*4

output:

{
    values: [
      {
        values: [
          {
            values: [
              {
                values: [
                  {
                    values: [10, 2],
                    operation: "/",
                  },
                  5,
                ],
                operation: "*",
              },
              1,
            ],
            operation: "+",
          },
          {
            values: [
              {
                values: [1, 5],
                operation: "*",
              },
              3,
            ],
            operation: "/",
          },
        ],
        operation: "-",
      },
      {
        values: [2, 4],
        operation: "*",
      },
    ],
    operation: "+",
};

Example 4: Formula with () parenthesis

N1

input: 1+2*(3+2)

output:

{
    values: [
      1,
      {
        values: [
          2,
          {
            values: [3, 2],
            operation: "+",
          },
        ],
        operation: "*",
      },
    ],
    operation: "+",
};

N2

input: (1+2*3)*2

output:

{
    values: [
      {
        values: [
          1,
          {
            values: [2, 3],
            operation: "*",
          },
        ],
        operation: "+",
      },
      2,
    ],
    operation: "*",
};

N3

input: (1/1/10)+1/30+1/50

output:

{
    values: [
      {
        values: [1, 1, 10],
        operation: "/",
      },
      {
        values: [1, 30],
        operation: "/",
      },
      {
        values: [1, 50],
        operation: "/",
      },
    ],
    operation: "+",
  };

Other Cases

N1

input: -(1+2)

output:

{
    values: [
      {
        values: [1, 2],
        operation: "+",
      },
    ],
    operation: "-",
};

...

Solution

Here's a function that seems to pass all of your test cases.

Somehow I've written a lot of expression parsers, and I eventually settled on doing it this way in pretty much all cases. This is a recursive descent parser that is just about as simple as a fully-featured expression parser can be.

The secret is the parseTokens function's minPrec parameter, which tells it to parse until it encounters an operator with lower precedence. That makes it easy for the function to call itself recursively at each precedence level.

/**
 * Parse an expression into the required tree
 * @param str {string}
 */
function parseExpression(str) {
    // break string into tokens, in reverse order because pop() is faster than shift()
    const tokens = str.match(/[.0-9Ee]+|[^\s]/g).reverse();
    const tree = parseTokens(tokens, 0);
    if (tokens.length) {
        throw new Error(`Unexpected ${tokens.pop()} after expression`);
    }
    return tree;
}

const BINARY_PRECEDENCE = {
    '+': 0,
    '-': 0,
    '*': 1,
    '/': 1,
}

const UNARY_PRECEDENCE = {
    '+': 10,
    '-': 10,
}

/**
 * Given an array of tokens in reverse order, return binary expression tree
 * 
 * @param tokens {string[]} tokens
 * @param minPrec {number} stop at operators with precedence smaller than this
 */
function parseTokens(tokens, minPrec) {
    if (!tokens.length) {
        throw new Error('Unexpected end of expression');
    }

    // get the left operand
    let leftToken = tokens.pop();
    let leftVal;
    if (leftToken === '(') {
        leftVal = parseTokens(tokens, 0);
        if (tokens.pop() !== ')') {
            throw new Error('Unmatched (');
        }
    } else if (UNARY_PRECEDENCE[leftToken] != undefined) {
        const operand = parseTokens(tokens, UNARY_PRECEDENCE[leftToken]);
        if (typeof operand === 'number' && leftToken === '-') {
            leftVal = -operand;
        } else if (typeof operand === 'number' && leftToken === '+') {
            leftVal = operand;
        } else {
            leftVal = {
                operation: leftToken,
                values: [operand]
            }
        }
    } else {
        leftVal = Number(leftToken);
        if (isNaN(leftVal)) {
            throw new Error(`invalid number ${leftToken}`);
        }
    }

    // Parse binary operators until we hit the end or a stop
    while (tokens.length) {
        // end of expression
        const opToken = tokens.pop();
        const opPrec = BINARY_PRECEDENCE[opToken];
        if (opToken === ')' || (opPrec != undefined && opPrec < minPrec)) {
            // We have to stop here.  put the token back and return
            tokens.push(opToken);
            return leftVal;
        }
        if (opPrec == undefined) {
            throw new Error(`invalid operator ${opToken}`)
        }

        // we have a binary operator.  Get the right operand
        const operand = parseTokens(tokens, opPrec + 1);
        if (typeof leftVal === 'object' && leftVal.operation === opToken) {
            // Extend the existing operation
            leftVal.values.push(operand);
        } else {
            // Need a new operation
            leftVal = {
                values: [leftVal, operand],
                operation: opToken
            }
        }
    }
    return leftVal;
}