Lexical analyzer in Java: multi-character operators like '++' or '>=' shouldn't be tokenized as individual characters, and unlisted tokens shouldn't print anything

I am writing a lexical analyzer that tokenizes operators, conditions, and other syntax. My approach is to read the input character by character and, whenever whitespace is reached, tokenize the characters accumulated so far. For example, 'String' is tokenized as STR and ';' as SEMI. The problem is that an operator like '++' is tokenized as ADD_OP ADD_OP, but I want it to be a single token: '++' should be INC, and '>=' should be GE rather than RT ASSIGN. Here is my code:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * Whitespace-driven lexer: reads a file one character at a time, collects each run of
 * non-whitespace characters into a chunk, and prints one line per piece of that chunk.
 */
public class Lexer {

    /**
     * Zero-width split points: immediately before or immediately after any character of the
     * bracket class. Inside the class, {@code +-/} is a range, so it also covers {@code ,},
     * {@code -} and {@code .}.
     */
    private static final String BOUNDARY =
            "(?=[\\[\\](){}<>=,;+-/*%|&!])|(?<=[\\[\\](){}<>=,;+-/*%|&!])";

    /** Rule is tested against the individual (trimmed) piece. */
    private static final String PIECE = "piece";

    /** Rule is tested against the entire unsplit chunk, not the piece. */
    private static final String CHUNK = "chunk";

    /**
     * Ordered rules as {subject, regex, label}; the first rule whose regex fully matches its
     * subject wins. Note that the bracketed patterns are character classes, so {@code [++]}
     * and {@code [>=]} each match exactly one character.
     */
    private static final String[][] RULES = {
        {PIECE, "procedure", "PROC"},
        {PIECE, "int", "INT"},
        {PIECE, "[0-9]+", "INT_CONST"},
        {PIECE, "end", "END"},
        {PIECE, "String", "STR"},
        {PIECE, "string", "STR"},
        {PIECE, "[(]", "LP"},
        {PIECE, "[)]", "RP"},
        {PIECE, "\".*\"", "STR_CONST"},
        {PIECE, "if", "IF"},
        {PIECE, "for", "FOR"},
        {PIECE, "while", "WHILE"},
        {PIECE, "return", "RETURN"},
        {PIECE, "[;]", "SEMI"},
        {PIECE, "do", "DO"},
        {PIECE, "break", "BREAK"},
        {PIECE, "[a-zA-Z][a-zA-Z0-9]*", "IDENT"},
        {PIECE, "[=]", "ASSIGN"},
        {PIECE, "[<]", "LT"},
        {PIECE, "[>]", "RT"},
        {CHUNK, "[++]", "INC"},
        {PIECE, "[+]", "ADD_OP"},
        {CHUNK, "[{]", "RB"},
        {CHUNK, "[}]", "LB"},
        {CHUNK, "[*]", "MUL_OP"},
        {CHUNK, "[/]", "DIV_OP"},
        {CHUNK, "[>=]", "GE"},
    };

    /**
     * Reads {@code fileName} and prints the result of tokenizing every whitespace-delimited
     * chunk to standard output, one entry per line. I/O failures are reported on standard error.
     *
     * @param fileName path of the file to read
     */
    public static void Tokenize(String fileName) {
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(fileName));
            StringBuilder pending = new StringBuilder();
            for (int code = in.read(); code != -1; code = in.read()) {
                char c = (char) code;
                if (!Character.isWhitespace(c)) {
                    pending.append(c);
                    continue;
                }
                // Whitespace ends the current chunk (if any).
                emit(pending.toString());
                pending.setLength(0);
            }
            // Flush whatever was collected before end of file.
            emit(pending.toString());
        } catch (IOException e) {
            System.err.println("Error reading file: " + e.getMessage());
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    System.err.println("Error closing file: " + e.getMessage());
                }
            }
        }
    }

    /** Tokenizes one chunk and prints every non-blank entry; blank chunks are ignored. */
    private static void emit(String chunk) {
        if (chunk.isBlank()) {
            return;
        }
        for (String entry : tokenizeToken(chunk)) {
            if (!entry.isBlank()) {
                System.out.println(entry);
            }
        }
    }

    /**
     * Splits a chunk at {@link #BOUNDARY} and replaces each piece that matches a rule with the
     * rule's label. A piece matching no rule is left as its original text in the returned
     * array, and an error line is printed to standard output at that moment.
     *
     * @param token whitespace-free chunk to split
     * @return the pieces, with recognised ones replaced by their labels
     */
    private static String[] tokenizeToken(String token) {
        String[] pieces = token.split(BOUNDARY);
        for (int slot = 0; slot != pieces.length; slot++) {
            String label = labelFor(pieces[slot].trim(), token);
            if (label != null) {
                pieces[slot] = label;
            } else {
                System.out.println("SYSTEM ERROR: INVALID IDENTIFIER NAME");
            }
        }
        return pieces;
    }

    /** Returns the label of the first matching rule, or {@code null} when none matches. */
    private static String labelFor(String piece, String chunk) {
        for (String[] rule : RULES) {
            String subject = CHUNK.equals(rule[0]) ? chunk : piece;
            if (subject.matches(rule[1])) {
                return rule[2];
            }
        }
        return null;
    }
}

and here is the output:

PROC
IDENT
LP
INT
IDENT
RP
FOR
LP
INT
IDENT
ASSIGN
INT_CONST
SEMI
IDENT
LT
IDENT
SEMI
IDENT
ASSIGN
IDENT
ADD_OP
ADD_OP
RP
RB
IDENT
ASSIGN
IDENT
MUL_OP
LP
IDENT
DIV_OP
INT_CONST
RP
SEMI
IF
LP
IDENT
RT
ASSIGN
INT_CONST
RP
BREAK
SEMI
LB
RETURN
IDENT
SEMI
END
IDENT
INT
SYSTEM ERROR: INVALID IDENTIFIER NAME
9user
ASSIGN
INT_CONST
SEMI

Also, any syntax or operator that isn't listed should not appear in the output; e.g. 9user shouldn't be printed.

Solution

You are having issues with your regexes.

In the one you are using for split
i) It splits wherever one of the listed characters (including +, >, = and <) comes immediately before or after the position (lookahead and lookbehind), so ++ is split into + and +.
ii) In +-/ the "-" has a special meaning inside [] in a regex: it denotes a range, so +-/ matches any character from the Unicode value of + through the Unicode value of /.
In the if-else section, the patterns passed to matches are wrapped in unnecessary "[]", which means "any one of the characters inside", so "[++]" means "match a + character or a + character", i.e. a single +.

The split regex can be modified like this: "([^+><]+?=[-\\[\\](){}<>=,;+/*%|&!])|((?<=[-\\[\\](){}<>=,;/+*%|&!])(?![+=]))"

The if-else section can be modified as shown in the working example.

A minimal working example is as below

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * Whitespace-driven lexer that prints one token name per line.
 *
 * <p>Input is cut into whitespace-free chunks. Each chunk is split at punctuation boundaries,
 * keeping the two-character operators {@code ++} and {@code >=} intact, and every resulting
 * lexeme is mapped to a token name. Lexemes that have no token name are reported on
 * {@code System.err} and produce no line on {@code System.out}.
 */
public class Lexer {

    /** Characters that are split off from their neighbours. '-' comes first so it is literal. */
    private static final String PUNCT = "[-\\[\\](){}<>=,;+/*%|&!]";

    /**
     * Zero-width split points inside a chunk. Every part is a lookaround, so splitting never
     * consumes input. A run such as {@code +++} contains no split point and is therefore
     * treated as one (unrecognised) lexeme.
     */
    private static final String BOUNDARY =
            "(?!(?<=\\+)\\+)"                                 // never between the two '+' of "++"
            + "(?!(?<=>)=)"                                   // never between '>' and '=' of ">="
            + "(?:(?=" + PUNCT + ")|(?<=" + PUNCT + "))";     // just before or after punctuation

    /**
     * Reads {@code fileName} character by character and prints the token name of every
     * recognised lexeme to standard output, one per line. I/O failures (including a failure
     * to close the file) are reported on standard error.
     *
     * @param fileName path of the source file to tokenize
     */
    public static void Tokenize(String fileName) {
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            StringBuilder chunk = new StringBuilder();
            int r;
            while ((r = reader.read()) != -1) {
                char ch = (char) r;
                if (Character.isWhitespace(ch)) {
                    printTokens(chunk);
                } else {
                    chunk.append(ch);
                }
            }
            // The last chunk is not followed by whitespace when the file ends without a newline.
            printTokens(chunk);
        } catch (IOException e) {
            System.err.println("Error reading file: " + e.getMessage());
        }
    }

    /** Prints the token names for the pending chunk, then clears it; an empty chunk is a no-op. */
    private static void printTokens(StringBuilder chunk) {
        if (chunk.length() == 0) {
            return;
        }
        for (String name : tokenizeToken(chunk.toString())) {
            if (!name.isBlank()) {
                System.out.println(name);
            }
        }
        chunk.setLength(0);
    }

    /**
     * Splits a whitespace-free chunk into lexemes and replaces each one with its token name.
     *
     * @param token the chunk to split
     * @return one entry per lexeme: the token name, or an empty string for a lexeme that has
     *     no token name (so that callers filtering blank entries print nothing for it)
     */
    private static String[] tokenizeToken(String token) {
        String[] tokens = token.split(BOUNDARY);
        for (int i = 0; i < tokens.length; i++) {
            String lexeme = tokens[i].trim();
            if (lexeme.isEmpty()) {
                tokens[i] = "";
                continue;
            }
            String name = classify(lexeme);
            if (name.isEmpty()) {
                // Diagnostics go to stderr so the token stream on stdout stays clean.
                System.err.println("SYSTEM ERROR: INVALID IDENTIFIER NAME");
            }
            tokens[i] = name;
        }
        return tokens;
    }

    /**
     * Maps a single lexeme to its token name. The names, including RT for {@code >}, RB for
     * <code>{</code> and LB for <code>}</code>, are the ones used in the question's token table.
     *
     * @param lexeme non-empty lexeme produced by the split
     * @return the token name, or an empty string when the lexeme is not recognised
     */
    private static String classify(String lexeme) {
        switch (lexeme) {
            case "procedure": return "PROC";
            case "int":       return "INT";
            case "end":       return "END";
            case "String":
            case "string":    return "STR";
            case "if":        return "IF";
            case "for":       return "FOR";
            case "while":     return "WHILE";
            case "return":    return "RETURN";
            case "do":        return "DO";
            case "break":     return "BREAK";
            case "(":         return "LP";
            case ")":         return "RP";
            case "{":         return "RB";
            case "}":         return "LB";
            case ";":         return "SEMI";
            case "=":         return "ASSIGN";
            case "<":         return "LT";
            case ">":         return "RT";
            case ">=":        return "GE";
            case "++":        return "INC";
            case "+":         return "ADD_OP";
            case "*":         return "MUL_OP";
            case "/":         return "DIV_OP";
            default:          break;
        }
        if (lexeme.matches("[0-9]+")) {
            return "INT_CONST";
        }
        if (lexeme.matches("[a-zA-Z][a-zA-Z0-9]*")) {
            return "IDENT";
        }
        if (lexeme.matches("\".*\"")) {
            return "STR_CONST";
        }
        return "";
    }
}