Search code examples
javascriptparsingabstract-syntax-tree

Creating AST from custom syntax in JavaScript


I am building a small framework that reads my custom made HTML syntax and converts it into HTML code. However, I am stuck at tokenising my code and creating an AST. I understand that the algorithm requires recursive approach but I cannot figure out how to do it properly.

This is my custom code in app.txt file:

View {
    Heading {
        
    }

    Text {

    }
}

And here is my recursive parser so far:

function parse(source) {
    let tag = "";
    let children = [];

    for (let i = 0; i < source.length; i++) {
        const char = source[i];

        if (char === "{") {
            const child = parse(source.substring(i + 1, source.length));
            children.push(child);
        } else if (char === "}") {
            return {
                tag: tag,
                children: children
            };
        } else {
            tag += char;
        }
    }

    return;
}

The parses is expected to produce something like this (and should be able to go to any depth):

{
    tag: "View",
    children: [
        {
            tag: "Heading",
            children: []
        },
        {
            tag: "Text",
            children: []
        }
    ]
}

What am I doing wrong? I'll be thankful for any help.


Solution

  • Let's write your grammar more or less formally:

    tag := name '{' children '}'
    name := letter | letter name
    children := tag | tag children
    letter := [a-z]
    

    Then, let's write a parsing function for each rule in the grammar. We need two helper functions: getsym, which returns the first meaningful (non-whitespace) symbol from the input, and nextsym, which removes that symbol.

    Working example:

    function parse(text) {
        let chars = [...text]
    
        function getsym() {
            while (chars.length > 0 && /\s/.test(chars[0]))
                chars.shift()
            return chars[0] || ''
        }
    
        function nextsym() {
            return chars.shift()
        }
    
        return tag()
    
        //
    
        function tag() {
            let n = name()
    
            if (getsym() !== '{')
                throw new SyntaxError()
            nextsym()
    
            let c = children(text)
    
            if (getsym() !== '}')
                throw new SyntaxError()
            nextsym()
    
            return {name: n, children: c}
        }
    
        function name() {
            let t = letter()
            if (t)
                return t + name()
            return ''
        }
    
        function letter() {
            if (/[a-z]/i.test(getsym()))
                return nextsym()
        }
    
        function children() {
            if (getsym() === '}')
                return []
            let t = tag()
            return [t, ...children()]
        }
    
    }
    
    ///
    
    text = ` View {
        Heading {
            Content {
                One {}
                Two {}
                Three {}
            }
        }
        Text {
            More {}
        }
    }
    `
    
    console.log(parse(text))

    That being said, if you plan for a more complex grammar, a more practical option would be to use a parser generator like peg.js.