Search code examples
javascript lexer

Why does my JavaScript lexer split my floats as an identifier and a float?


I am currently trying to make my own little programming language for the first time. I am, as of now, creating the basic lexer. I am trying to allow floating point values in my code, but alas, it gets split into an identifier and a float value (see output below)

Question: How do I fix my lexer to properly handle floats?

Output: let value = 7.24 const pi = 3.14

Code:

/**
 * Converts source text into a flat array of tokens.
 *
 * Token shapes:
 *   - `keyword`, `identifier`, `string`, `char`: `{ type, value }` with a string value
 *   - `number`: `{ type, value }` with a numeric value
 *   - `assign` (`=`), `semicolon` (`;`), `dot` (`.`): `{ type }` only
 *
 * Spaces and newlines separate tokens. Any other character that no rule
 * recognizes is skipped without producing a token.
 *
 * NOTE: digits belong to the word character class and words are matched before
 * numbers, so `7.24` comes out as identifier "7" followed by number 0.24.
 *
 * @param {string} input - Source text to tokenize.
 * @returns {Array<{type: string, value?: (string|number)}>} Tokens in source order.
 * @throws {SyntaxError} On an unterminated string literal, or on a character
 *   literal that is not exactly one character between single quotes.
 */
function lexer(input) {
    const tokens = [];
    const keywords = new Set(['let', 'var', 'const', 'def', 'float', 'floater', 'double', 'int', 'integer', 'bool', 'boolean', 'string', 'char']);
    const alphaNumericRegex = /[a-zA-Z0-9]/;
    const digitRegex = /\d/;

    const length = input.length;
    let current = 0;

    while (current < length) {
        const char = input[current];

        if (char === ' ' || char === '\n') {
            current++;
            continue;
        }

        if (alphaNumericRegex.test(char)) {
            const wordStart = current;
            // The bounds check is required: RegExp#test(undefined) tests the
            // string "undefined", which matches, so reading past the end of
            // the input would otherwise loop forever.
            while (current < length && alphaNumericRegex.test(input[current])) {
                current++;
            }
            const word = input.slice(wordStart, current);

            tokens.push({ type: keywords.has(word) ? 'keyword' : 'identifier', value: word });
            continue;
        }

        // A '.' opens a number only when a digit follows it; a lone '.' is
        // left for the `dot` rule further down.
        const startsFraction = char === '.'
            && current + 1 < length
            && digitRegex.test(input[current + 1]);

        if (digitRegex.test(char) || startsFraction) {
            const numStart = current;
            let hasDecimal = false;

            while (current < length) {
                const next = input[current];
                if (next === '.' && !hasDecimal) {
                    hasDecimal = true;
                } else if (!digitRegex.test(next)) {
                    break;
                }
                current++;
            }

            tokens.push({ type: 'number', value: parseFloat(input.slice(numStart, current)) });
            continue;
        }

        if (char === '"') {
            const strStart = current + 1;
            const strEnd = input.indexOf('"', strStart);
            if (strEnd === -1) {
                throw new SyntaxError("Unterminated string literal");
            }
            tokens.push({ type: 'string', value: input.slice(strStart, strEnd) });
            current = strEnd + 1;
            continue;
        }

        if (char === "'") {
            // Expect exactly: quote, one character, quote.
            const charValue = input[current + 1];
            if (input[current + 2] !== "'") {
                throw new SyntaxError("Invalid character literal");
            }
            tokens.push({ type: 'char', value: charValue });
            current += 3;
            continue;
        }

        if (char === '=') {
            tokens.push({ type: 'assign' });
            current++;
            continue;
        }

        if (char === ';') {
            tokens.push({ type: 'semicolon' });
            current++;
            continue;
        }

        if (char === '.') {
            tokens.push({ type: 'dot' });
            current++;
            continue;
        }

        // Unrecognized character: skip it.
        current++;
    }

    return tokens;
}

// Sample program fed to the lexer, one statement per line.
const code = [
    'let value = 7.24;',
    'var count = 5;',
    'const pi = 3.14;',
    'bool isTrue = true;',
    'string message = "Hello";',
    "char initial = 'A';",
].join('\n');

// Print the token list as JSON indented by two spaces.
const sampleTokens = lexer(code);
console.log(JSON.stringify(sampleTokens, null, 2));

I'm still somewhat new to JavaScript and completely new to lexers, so I tried getting help from AI tools such as ChatGPT and AskCodi. They attempted to fix the problem, but none of the changes I made based on their recommendations made any difference.


Solution

  • All I did was change alphaNumericRegex from /[a-zA-Z0-9]/ to /[a-zA-Z0-9-.]/ in an attempt to include the dot when parsing a number and, funnily enough, it worked :D

    // Once the page has loaded, let the console wrapper element use the full
    // height of its container instead of only half of it.
    window.onload = (_event) => {
        const consoleWrapper = document.getElementsByClassName('as-console-wrapper')[0];
        return (consoleWrapper.style.maxHeight = '100%');
    };
    
    /**
     * Converts source text into a flat array of tokens.
     *
     * Token shapes:
     *   - `keyword`, `identifier`, `string`, `char`: `{ type, value }` with a string value
     *   - `number`: `{ type, value }` with a numeric value
     *   - `assign` (`=`), `semicolon` (`;`), `dot` (a lone `.`): `{ type }` only
     *
     * Spaces and newlines separate tokens. Any other character that no rule
     * recognizes is skipped without producing a token.
     *
     * NOTE: `-` and `.` are word characters, so a literal such as `-7.24` is
     * scanned as one word and then classified as a number. The same rule means
     * `a.b` and `a-b` are each a single identifier.
     *
     * @param {string} input - Source text to tokenize.
     * @returns {Array<{type: string, value?: (string|number)}>} Tokens in source order.
     * @throws {SyntaxError} On an unterminated string literal, or on a character
     *   literal that is not exactly one character between single quotes.
     */
    function lexer(input) {
        const tokens = [];
        const keywords = new Set(['let', 'var', 'const', 'def', 'float', 'floater', 'double', 'int', 'integer', 'bool', 'boolean', 'string', 'char']);
        const alphaNumericRegex = /[a-zA-Z0-9-.]/;
        // A whole word that is a decimal literal: optional sign, then digits
        // with an optional fraction ("7", "7.", "7.24") or a bare fraction (".24").
        const numberRegex = /^-?(?:\d+\.?\d*|\.\d+)$/;

        const length = input.length;
        let current = 0;

        while (current < length) {
            const char = input[current];

            if (char === ' ' || char === '\n') {
                current++;
                continue;
            }

            if (alphaNumericRegex.test(char)) {
                const wordStart = current;
                // The bounds check is required: RegExp#test(undefined) tests the
                // string "undefined", which matches, so reading past the end of
                // the input would otherwise loop forever.
                while (current < length && alphaNumericRegex.test(input[current])) {
                    current++;
                }
                const word = input.slice(wordStart, current);

                // Every digit and '.' is consumed by the loop above, so numbers
                // and the dot token have to be recognized here rather than in
                // separate branches, which could never be reached.
                if (keywords.has(word)) {
                    tokens.push({ type: 'keyword', value: word });
                } else if (numberRegex.test(word)) {
                    tokens.push({ type: 'number', value: parseFloat(word) });
                } else if (word === '.') {
                    tokens.push({ type: 'dot' });
                } else {
                    tokens.push({ type: 'identifier', value: word });
                }

                continue;
            }

            if (char === '"') {
                const strStart = current + 1;
                const strEnd = input.indexOf('"', strStart);
                if (strEnd === -1) {
                    throw new SyntaxError("Unterminated string literal");
                }
                tokens.push({ type: 'string', value: input.slice(strStart, strEnd) });
                current = strEnd + 1;
                continue;
            }

            if (char === "'") {
                // Expect exactly: quote, one character, quote.
                const charValue = input[current + 1];
                if (input[current + 2] !== "'") {
                    throw new SyntaxError("Invalid character literal");
                }
                tokens.push({ type: 'char', value: charValue });
                current += 3;
                continue;
            }

            if (char === '=') {
                tokens.push({ type: 'assign' });
                current++;
                continue;
            }

            if (char === ';') {
                tokens.push({ type: 'semicolon' });
                current++;
                continue;
            }

            // Unrecognized character: skip it.
            current++;
        }

        return tokens;
    }
    
    // Sample program passed to the lexer below, one statement per line.
    // NOTE(review): if this snippet is run with the indentation shown here, the
    // leading spaces on the continuation lines become part of the template
    // literal; the lexer skips spaces between tokens, so the token list should
    // be unaffected — confirm if the snippet is re-indented.
    const code = `let value = 7.24;
    var count = 5;
    const pi = 3.14;
    bool isTrue = true;
    string message = "Hello";
    char initial = 'A';`;
    
    // Print the resulting token list as JSON indented by two spaces.
    console.log(JSON.stringify(lexer(code), null, 2));