Search code examples
c regex syntax-highlighting

Getting start and end of tokens in string with regular expressions


I am trying to implement some kind of syntax highlighting for C programs in C itself. Imagine I have stored all the source code I want to highlight into a string, and before printing it, I would like to know what color to use for each token.

I know I can use <regex.h> to test regular expressions, but I am not sure how to find the positions of matching expressions in a string.

Imagine I had the following input code:

int main(int argc, char** argv) {
    int var = my_func("Hello, world.");
    return 0;
}

And I want to render it with colors, just like it's shown in the code block above.

What I would want is to be able to test some expressions in the string, and get a pointer (or position) in the string where the first match ends. So I would start by testing int main(int [...], and the first match would be the int keyword, which ends at position 3. Then I could keep checking from there, and so on.

It's the first time I have to deal with something like this, so if anyone knows a better method, please let me know.


Solution

  • If you have the source code loaded in a char array, you might consider writing a get_token() function that returns the token type and length. Depending on the token type, you output the corresponding color and the token source using:

    #include <stdio.h>
    
    /* Token categories returned by get_token(); the enumerators are also
       used as indices into the colors[] table. */
    enum tokenType {
        END, WHITESPACE, NEWLINE, COMMENT, PREPROCESSOR,
        KEYWORD, IDENTIFIER, STRING, CHARCONST, NUMBER, OPERATOR,
        OTHER
    };
    
    /* Scanner state passed to get_token().  my_func() initializes the
       fields in declaration order as { filename, s, 0, 1, 1, 1 }.  The
       trailing "..." is a placeholder for whatever extra state a real
       implementation needs. */
    struct parse_context {
        const char *filename;
        const char *source;
        size_t source_pos;
        int line_number;
        int column_number;
        int at_bol;
        ...
    };
    
    /* String printed by my_func() whenever the token type changes, indexed
       by enum tokenType.  Each "..." is a placeholder to be replaced with
       the actual color sequence for that token type. */
    const char *colors[] = {
        [END] = ...,
        [WHITESPACE] = ...,
        [NEWLINE] = ...,
        [COMMENT] = ...,
        [PREPROCESSOR] = ...,
        [KEYWORD] = ...,
        [IDENTIFIER] = ...,
        [STRING] = ...,
        [CHARCONST] = ...,
        [NUMBER] = ...,
        [OPERATOR] = ...,
        [OTHER] = ...,
    };
    
    /* Tokenizer placeholder (body left as "...").  As used by my_func(), it
       is expected to return the type of the token that starts at `s` and to
       store that token's length in *token_len; END stops the caller's loop. */
    enum tokenType get_token(struct parse_context *pc,
                             const char *s, int *token_len) {
        ...
    }
    
    void my_func(const char *filename, const char *s) {
        struct parse_context ctx = {
            filename, s, 0, 1, 1, 1,
        };
        enum tokenType last_token = END;
    
        for (;;) {
            int len;
            enum tokenType tok_type = get_token(&ctx, s, &len);
            if (tok_type != last_token) {            
                printf("%s", colors[tok_type]);
                last_token = tok_type;
            }
            if (tok_type == END) {
                break;
            } else {
                printf("%.*s", len, s);
                s += len;
            }
        }
    }
    

    Here is a quick parser I wrote to colorize C source code my way:

    #include <ctype.h>
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    // Parse C tokens.
    
    // XXX: no support for UTF-8 identifiers
    // XXX: no support for universal-character-name in identifiers
    
    /* Token categories produced by c_get_token().  The enumerators double as
       indices into the c_colors[] table.  ERROR is returned for unterminated
       strings, character constants and block comments; OTHER for characters
       that do not start any known punctuator. */
    enum c_token_type {
        END, WHITESPACE, NEWLINE, COMMENT, PREPROCESSOR,
        KEYWORD, IDENTIFIER, STRING, CHARCONST, NUMBER, OPERATOR,
        CTYPE, FUNCALL, OTHER, ERROR,
    };
    
    /* Scanner state shared by all c_* parsing functions. */
    struct c_parse_context {
        const char *filename;     // name passed to c_colorize(); not read by the scanner itself
        const char *source;       // start of the NUL-terminated source text
        const char *p;            // current read position, advanced by c_getc()
        const char *token_start;  // where the most recent token began (set by c_get_token())
        char token_string[80];    // text of the last identifier or operator (identifiers are truncated to fit)
        int line_number;          // initialized to 1 by c_colorize(); not updated in the code shown
        int column_number;        // initialized to 1 by c_colorize(); not updated in the code shown
        int at_bol;               // non-zero while no non-whitespace token has been seen on the current line
        int in_preprocess;        // non-zero from a '#' at beginning of line until the next newline
        //...
    };
    
    /* Space-delimited list of C keywords, searched with c_find_word().
       Every word must be surrounded by spaces (including the first and the
       last one), because the lookup matches on " word " boundaries. */
    static const char c_keywords[] =
        " auto break case const continue default do else enum extern for "
        " goto if inline register restrict return sizeof static struct "
        " switch typedef union volatile while "
        /* types */
        " char double float int long unsigned short signed void "
        /* C99 and C11 keywords */
        " _Alignas _Alignof _Atomic _Generic _Noreturn _Static_assert "
        " _Thread_local "
        /* C99 and C11 types */
        " _Bool _Complex _Imaginary "
        /* C23 keywords */
        " alignas alignof constexpr false nullptr static_assert thread_local "
        " true typeof typeof_unqual "
        /* C23 types */
        " bool _BitInt _Decimal128 _Decimal32 _Decimal64 "
        ;
    
    /* Space-delimited list of type names, searched with c_find_word().
       Every word must be surrounded by spaces because the lookup matches on
       " word " boundaries.  Each name is listed exactly once. */
    static const char c_types[] =
        /* types */
        " char double float int long unsigned short signed void "
        /* C99 and C11 types */
        " _Bool _Complex _Imaginary "
        /* C23 types */
        " bool _BitInt _Decimal128 _Decimal32 _Decimal64 "
        /* common standard types */
        " FILE va_list "
        ;
    
    /* Space-delimited list of punctuators (including digraphs such as <: and
       %:%:), searched with c_find_word() by c_parse_operator().  The longest
       entry is 4 characters, which matches the lookahead limit used there. */
    static const char c_punctuators[] =
        " [ ] ( ) { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | "
        " && || ? : :: ; ... = *= /= %= += -= <<= >>= &= ^= |= , # ## "
        " <: :> <% %> %: %:%: ";
    
    /*
     * Test whether the `len` characters at `s` form one of the words in
     * `words`, a NUL-terminated list in which every word is preceded and
     * followed by a space.
     *
     * Returns 1 on an exact whole-word match, 0 otherwise.  An empty or
     * non-positive-length candidate never matches.
     */
    static int c_find_word(const char *words, const char *s, int len) {
        /* An empty candidate would make strchr() return the list terminator
           and the loop would then step past the end of the array. */
        if (len <= 0 || *s == '\0')
            return 0;

        for (const char *p = words; (p = strchr(p, *s)) != NULL; p++) {
            /* `p > words` guards the p[-1] read when the candidate starts
               with the same character as the list itself (a space). */
            if (p > words && p[-1] == ' '
            &&  !strncmp(p, s, (size_t)len) && p[len] == ' ')
                return 1;
        }
        return 0;
    }
    
    /* Return 1 if the `len` characters at `s` spell a word listed in
       c_keywords, 0 otherwise. */
    static int c_is_keyword(const char *s, int len) {
        return c_find_word(c_keywords, s, len);
    }
    
    /* Return 1 if the `len` characters at `s` name a type: either a word
       listed in c_types, or any identifier longer than two characters that
       ends in "_t" (size_t, uint8_t, ...).  Returns 0 otherwise. */
    static int c_is_type(const char *s, int len) {
        if (c_find_word(c_types, s, len))
            return 1;
        if (len <= 2)
            return 0;
        return s[len - 2] == '_' && s[len - 1] == 't';
    }
    
    /*
     * Read the next logical character from pc->p and advance past it.
     *
     *  - Returns 0 at the end of the source; the read position is left on
     *    the terminating NUL so repeated calls keep returning 0.
     *  - "\r\n" and a lone "\r" are both returned as a single '\n'.
     *  - A backslash immediately followed by an end of line ("\n", "\r" or
     *    "\r\n") is a line continuation: it is skipped and reading resumes
     *    on the next line.
     *
     * All other bytes are returned as values in the unsigned char range.
     */
    static int c_getc(struct c_parse_context *pc) {
        for (;;) {
            int c = (unsigned char)*pc->p++;
            if (c == '\0') {
                pc->p--;    // never move past the terminator
                return 0;
            }
            if (c == '\r') {  // convert end of line sequences to '\n'
                if (*pc->p == '\n')
                    pc->p += 1;
                return '\n';
            }
    #if 0   // trigraphs can be handled here.
            if (c == '?' && *pc->p == '?') {
                switch (pc->p[1]) {
                case '=':  c = '#';  pc->p += 2; break;
                case '(':  c = '[';  pc->p += 2; break;
                case '/':  c = '\\'; pc->p += 2; break;
                case ')':  c = ']';  pc->p += 2; break;
                case '\'': c = '^';  pc->p += 2; break;
                case '<':  c = '{';  pc->p += 2; break;
                case '!':  c = '|';  pc->p += 2; break;
                case '>':  c = '}';  pc->p += 2; break;
                case '-':  c = '~';  pc->p += 2; break;
                }
            }
    #endif
            if (c == '\\') {  // remove escaped newlines
                if (*pc->p == '\n') {
                    pc->p++;
                    continue;
                }
                if (*pc->p == '\r') {
                    pc->p++;
                    if (*pc->p == '\n')
                        pc->p++;
                    continue;
                }
            }
            return c;
        }
    }
    
    /* Return the next logical character (as c_getc() would) without
       consuming it: the read position is restored before returning. */
    static int c_peekc(struct c_parse_context *pc) {
        const char *const saved = pc->p;
        const int next = c_getc(pc);

        pc->p = saved;
        return next;
    }
    
    /* Return the logical character after the next one without consuming
       anything: two characters are read, then the position is restored. */
    static int c_peekc2(struct c_parse_context *pc) {
        const char *const saved = pc->p;
        int second;

        (void)c_getc(pc);       /* skip the immediate next character */
        second = c_getc(pc);
        pc->p = saved;
        return second;
    }
    
    /* Return 1 if `c` may appear inside an identifier (a letter, a digit or
       an underscore), 0 otherwise. */
    static inline int c_isalnum_(int c) {
        if (c == '_')
            return 1;
        return isalnum(c) != 0;
    }
    
    /*
     * Consume the remainder of a preprocessing number whose first character
     * `lastc` has already been read.  The pp-number grammar is deliberately
     * loose, so malformed numbers are accepted as well.
     *
     * Accepted continuation characters:
     *   - letters, digits, '_' and '.'
     *   - '+' or '-' directly after one of e, E, p, P (exponent sign)
     *   - a single quote followed by a letter, digit or '_' (C23 separator)
     *
     * Always returns NUMBER.
     */
    static int c_parse_number(struct c_parse_context *pc, int lastc) {
        for (;;) {
            int c = c_peekc(pc);

            if (c == '\0')
                break;
            if (c_isalnum_(c) || c == '.') {
                /* ordinary pp-number character */
            } else if (c == '+' || c == '-') {
                if (!memchr("eEpP", lastc, 4))
                    break;
            } else if (c == '\'') {
                /* C23 digit separators */
                if (!c_isalnum_(c_peekc2(pc)))
                    break;
            } else {
                break;
            }
            lastc = c;
            c_getc(pc);
        }
        return NUMBER;
    }
    
    /*
     * Consume a string literal or character constant whose opening
     * delimiter `sep` has already been read.
     *
     * Returns CHARCONST when `sep` is a single quote and STRING otherwise,
     * once the matching delimiter has been consumed.  Returns ERROR if a
     * newline or the end of the source is reached first; the newline itself
     * is not consumed.
     */
    static int c_parse_string(struct c_parse_context *pc, int sep) {
        for (;;) {
            int c = c_peekc(pc);

            if (c == '\0' || c == '\n')
                break;
            c_getc(pc);
            if (c == sep)
                return (sep == '\'') ? CHARCONST : STRING;
            if (c == '\\') {
                /* swallow the escaped character, whatever it is */
                if (c_getc(pc) == '\0')
                    break;
            }
        }
        // unterminated string or character constant
        return ERROR;
    }
    
    /*
     * Recognize the longest punctuator that starts with `c`, which has
     * already been read.  Up to four punctuation characters are collected
     * into pc->token_string, and the longest prefix found in c_punctuators
     * wins.
     *
     * On a match the read position is set just past the punctuator,
     * pc->token_string is NUL-terminated, and OPERATOR is returned.
     * Otherwise only `c` is consumed and OTHER is returned.
     */
    static int c_parse_operator(struct c_parse_context *pc, int c) {
        const char *after[4];   /* read position following each collected char */
        size_t best = 0;        /* length of the longest match so far */
        size_t count = 0;

        for (;;) {
            pc->token_string[count] = (char)c;
            after[count] = pc->p;
            count++;
            if (c_find_word(c_punctuators, pc->token_string, (int)count))
                best = count;
            if (count == 4)
                break;
            c = c_getc(pc);
            if (!ispunct(c))
                break;
        }
        if (best == 0) {
            pc->p = after[0];
            return OTHER;
        }
        /* rewind over any lookahead that was not part of the match */
        pc->p = after[best - 1];
        pc->token_string[best] = '\0';
        return OPERATOR;
    }
    
    /* Consume a line comment after its leading "//" has been read: every
       character up to, but not including, the next newline or the end of
       the source.  Always returns COMMENT. */
    static int c_parse_comment1(struct c_parse_context *pc) {
        for (;;) {
            int c = c_peekc(pc);

            if (c == '\0' || c == '\n')
                return COMMENT;
            c_getc(pc);
        }
    }
    
    /* Consume a block comment after its opening delimiter has been read.
       Returns COMMENT once the closing delimiter has been consumed, or
       ERROR if the source ends first. */
    static int c_parse_comment2(struct c_parse_context *pc) {
        for (;;) {
            int c = c_peekc(pc);

            if (c == '\0') {
                // unterminated comment
                return ERROR;
            }
            c_getc(pc);
            if (c != '*')
                continue;
            if (c_peekc(pc) == '/') {
                c_getc(pc);
                return COMMENT;
            }
        }
    }
    
    /*
     * Consume an identifier whose first character `c` has already been read
     * and classify it.  The spelling is copied into pc->token_string,
     * truncated if it does not fit.
     *
     * Returns CTYPE for type names, KEYWORD for keywords, FUNCALL when the
     * identifier is followed by '(' (optionally after one blank), and
     * IDENTIFIER otherwise.
     */
    static int c_parse_identifier(struct c_parse_context *pc, int c) {
        size_t n = 0;
        int next;

        pc->token_string[n++] = (char)c;
        for (;;) {
            next = c_peekc(pc);
            if (!c_isalnum_(next))
                break;
            /* keep consuming even when the copy is full */
            if (n < sizeof(pc->token_string) - 1)
                pc->token_string[n++] = (char)next;
            c_getc(pc);
        }
        pc->token_string[n] = '\0';

        if (c_is_type(pc->token_string, (int)n))
            return CTYPE;
        if (c_is_keyword(pc->token_string, (int)n))
            return KEYWORD;

        /* look through a single blank between the name and a '(' */
        if (isblank(next))
            next = c_peekc2(pc);
        return (next == '(') ? FUNCALL : IDENTIFIER;
    }
    
    /*
     * Scan one token starting at pc->p.
     *
     * On entry the current position is saved in pc->token_start; on return
     * pc->p points just past the token, so the token's source text is the
     * range [pc->token_start, pc->p).  Returns the token's category, or END
     * when the terminating NUL is reached (the position then stays on it).
     *
     * Side effects on the context: a NEWLINE sets at_bol and clears
     * in_preprocess; a '#' seen as the first non-whitespace token of a line
     * sets in_preprocess.
     */
    enum c_token_type c_get_token(struct c_parse_context *pc) {
        int c, c1, c2;
    
        pc->token_start = pc->p;
        c = c_getc(pc);
    
        if (isspace(c)) {
            if (c == '\n') {
                pc->at_bol = 1;
                pc->in_preprocess = 0;
                return NEWLINE;
            }
            // merge a run of blanks into one token; the length 4 excludes the
            // string terminator, so the 0 returned at end of input never matches
            while (memchr(" \t\f\v", c_peekc(pc), 4)) {
                c_getc(pc);
            }
            // returning here leaves at_bol untouched, so an indented '#'
            // is still recognized as a preprocessor directive
            return WHITESPACE;
        }
    
        if (pc->at_bol) {
            pc->at_bol = 0;
            if (c == '#') {
                pc->at_bol = 0;  // redundant: already cleared just above
                pc->in_preprocess = 1;
                // only the '#' itself is returned; the rest of the line is
                // scanned as regular tokens while in_preprocess stays set
                return PREPROCESSOR;
            }
        }
    
        switch (c) {
        case '\0':
            return END;
        case '/':
            if (c_peekc(pc) == '/') {
                c_getc(pc);
                return c_parse_comment1(pc);
            }
            if (c_peekc(pc) == '*') {
                c_getc(pc);
                return c_parse_comment2(pc);
            }
            break;  // a plain '/' is handled as an operator below
        case '.':
            // a '.' starts a number only when a digit follows
            if (isdigit(c_peekc(pc)))
                return c_parse_number(pc, c);
            break;
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return c_parse_number(pc, c);
        case '\'':
        case '\"':
            return c_parse_string(pc, c);
        case 'L':
        case 'U':
            // L"..." / U'...' prefixes: c_getc() consumes the opening quote
            // and its value is passed on as the delimiter
            if ((c1 = c_peekc(pc)) == '\'' || c1 == '\"')
                return c_parse_string(pc, c_getc(pc));
            break;
        case 'u':
            // u8"..." / u8'...' prefix: skip the '8', then the quote
            if ((c1 = c_peekc(pc)) == '8' && ((c2 = c_peekc2(pc)) == '\'' || c2 == '\"')) {
                c_getc(pc);
                return c_parse_string(pc, c_getc(pc));
            }
            // u"..." / u'...' prefix
            if (c1 == '\'' || c1 == '\"')
                return c_parse_string(pc, c_getc(pc));
            break;
        }
        // XXX: should handle UTF-8 and universal-character-name here
        // digits were dispatched by the switch, so this only sees letters and '_'
        if (c_isalnum_(c))
            return c_parse_identifier(pc, c);
        return c_parse_operator(pc, c);
    }
    
    // C token colorizer
    
    // ANSI terminal escape sequences selecting the foreground color:
    // codes 30-37 are the standard colors, 90-97 their bright variants,
    // and code 0 (RESET) restores the terminal's default attributes.
    #define RESET          "\033[0m"
    #define BLACK          "\033[30m"
    #define RED            "\033[31m"
    #define GREEN          "\033[32m"
    #define YELLOW         "\033[33m"
    #define BLUE           "\033[34m"
    #define MAGENTA        "\033[35m"
    #define CYAN           "\033[36m"
    #define WHITE          "\033[37m"
    #define GREY           "\033[90m"
    #define BRIGHT_RED     "\033[91m"
    #define BRIGHT_GREEN   "\033[92m"
    #define BRIGHT_YELLOW  "\033[93m"
    #define BRIGHT_BLUE    "\033[94m"
    #define BRIGHT_MAGENTA "\033[95m"
    #define BRIGHT_CYAN    "\033[96m"
    #define BRIGHT_WHITE   "\033[97m"
    // color used by c_colors[] for whitespace, plain identifiers and operators
    #define DEFAULT  BRIGHT_GREEN
    
    /* Escape sequence emitted by c_colorize() when the output switches to a
       token of the given type, indexed by enum c_token_type.  END and
       NEWLINE map to RESET, so colors never extend past the end of a line
       or of the output. */
    const char * const c_colors[] = {
        [END] = RESET,
        [WHITESPACE] = DEFAULT,
        [NEWLINE] = RESET,
        [COMMENT] = WHITE,
        [PREPROCESSOR] = CYAN,
        [KEYWORD] = BRIGHT_WHITE,
        [IDENTIFIER] = DEFAULT,
        [STRING] = BRIGHT_CYAN,
        [CHARCONST] = BRIGHT_CYAN,
        [NUMBER] = GREEN,
        [OPERATOR] = DEFAULT,
        [CTYPE] = BRIGHT_MAGENTA,
        [FUNCALL] = BRIGHT_YELLOW,
        [OTHER] = RED,
        [ERROR] = RED,
    };
    
    /*
     * Write `source` (a NUL-terminated C source text) to stdout with ANSI
     * color sequences inserted between tokens.
     *
     * A color sequence is emitted only when the color differs from the one
     * used for the previous token.  Every token on a preprocessor line except
     * comments is shown in the PREPROCESSOR color.  The END token always
     * keeps its own color so that the final RESET is emitted even when the
     * input ends inside a preprocessor line with no trailing newline.
     *
     * `filename` is only recorded in the parse context.
     */
    void c_colorize(const char *filename, const char *source) {
        struct c_parse_context ctx = {
            filename, source, source, NULL, { 0 }, 1, 1, 1, 0,
        };
        enum c_token_type last_color = END;

        for (;;) {
            enum c_token_type tok_type = c_get_token(&ctx);
            enum c_token_type color = tok_type;

            /* END is excluded: recoloring it would suppress the RESET */
            if (ctx.in_preprocess && tok_type != COMMENT && tok_type != END) {
                color = PREPROCESSOR;
            }
            if (last_color != color) {
                if (c_colors[color]) {
                    fputs(c_colors[color], stdout);
                }
                last_color = color;
            }
            if (tok_type == END) {
                break;
            }
            /* the token text is the raw source range, so line continuations
               and original end-of-line sequences are preserved verbatim */
            fwrite(ctx.token_start, 1, (size_t)(ctx.p - ctx.token_start), stdout);
        }
    }
    
    /*
     * Read an entire stream into a freshly allocated, NUL-terminated buffer.
     *
     * filename  name used in error messages; also the file to open when
     *           `fp` is NULL.
     * fp        stream to read from, or NULL to open `filename` for reading.
     *           A stream supplied by the caller is not closed.
     *
     * Returns the buffer, which the caller must free(), or NULL after
     * printing a message to stderr if the file cannot be opened, a read
     * error occurs, or memory runs out.  An empty input yields an empty
     * string, not NULL.
     */
    char *load_file(const char *filename, FILE *fp) {
        char buf[4096];
        char *source = NULL;
        size_t len = 0;
        size_t nread;
        FILE *fp_close = NULL;
        int failed = 0;

        if (fp == NULL) {
            fp = fopen(filename, "r");
            if (fp == NULL) {
                fprintf(stderr, "cannot open %s: %s\n",
                        filename, strerror(errno));
                return NULL;
            }
            fp_close = fp;
        }
        while ((nread = fread(buf, 1, sizeof buf, fp)) > 0) {
            /* +1 keeps room for the terminator after every chunk */
            char *new_buf = realloc(source, len + nread + 1);
            if (new_buf == NULL) {
                fprintf(stderr, "out of memory for %s\n", filename);
                failed = 1;
                break;
            }
            source = new_buf;
            memcpy(source + len, buf, nread);
            len += nread;
            source[len] = '\0';
        }
        /* fread() returns 0 for both end of file and error: tell them apart */
        if (!failed && ferror(fp)) {
            fprintf(stderr, "error reading %s\n", filename);
            failed = 1;
        }
        if (!failed && source == NULL) {
            /* empty input: return "" so callers can distinguish it from failure */
            source = calloc(1, 1);
            if (source == NULL) {
                fprintf(stderr, "out of memory for %s\n", filename);
            }
        }
        if (failed) {
            free(source);
            source = NULL;
        }
        if (fp_close)
            fclose(fp_close);
        return source;
    }
    
    /* Colorize every file named on the command line, in order, or standard
       input when no file is given.  Files that fail to load are skipped.
       Always returns 0. */
    int main(int argc, char *argv[]) {
        if (argc <= 1) {
            /* no arguments: act as a filter on standard input */
            char *text = load_file("<stdin>", stdin);
            if (text != NULL) {
                c_colorize("<stdin>", text);
                free(text);
            }
            return 0;
        }

        for (int arg = 1; arg < argc; arg++) {
            char *text = load_file(argv[arg], NULL);
            if (text == NULL)
                continue;
            c_colorize(argv[arg], text);
            free(text);
        }
        return 0;
    }