Search code examples
cundefined-behavior

Tokenizer in C is not showing any output


I am writing a simple tokenizer to teach myself C and am struggling a lot with what seems to be cases of undefined behavior. The code seems fine to me and I am not sure where the problem is. When I run the program, I see no output in the console. The compiler throws no errors at all.

This is the simple text file that I am trying to tokenize:

1
2.34


xxy =  5e6

My code below is producing no output to the console:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
// #include "hexal.h"

#define HX_VERSION "0.0.1"

typedef enum
{
    TOK_EOF,                    // End of file      0
    TOK_ILLEGAL,                // Illegal token    1
    TOK_SPACE,                  // Space            2
    TOK_VAR,                    // var              3
    TOK_CONST,                  // const            4
    TOK_ASSIGN,                 // =                5
    TOK_INT_LIT,                // Integer literal  6
    TOK_FLOAT_LIT,              // Float literal    7
    TOK_EXPO_LIT,              // Float literal    7
    TOK_BOOL_LIT,               // Bool literal     8
    TOK_IDENTIFIER,             // Identifier       9
} Hx_Token_Type;

typedef struct Token
{
    Hx_Token_Type type;

    char *value;
    fpos_t cursor;
    int line;
} Token;

typedef enum {
    // Compiler Errors
    UNEXPECTED_TOKEN,
    UNEXPECTED_EOF,
    UNCLOSED_STRING,
    UNCLOSED_COMMENT,
    UNCLOSED_PAREN,
    UNCLOSED_BRACE,
    UNCLOSED_BRACKET,
    UNCLOSED_ANGLE_BRACKET,

    // Compiler Warnings
    UNUSED_IDENTIFIER,
    DEPRECATED_FEATURE,

    // 

} Hx_Compiler_Error_Types;

typedef struct {
    Hx_Compiler_Error_Types type;
    const char* err_message;
    size_t at_pos;
    size_t at_line;
} Hx_Compiler_Error;

// a type to hold the list of Hx_Compiler_Error

// Compiler metadata
typedef struct {
    Hx_Compiler_Error* Hx_Compiler_Errors;
    
    size_t Hx_Compiling_Duration;
    size_t Hx_Bytes_Compiled;

} Hx_Compiler_Metadata;

typedef struct {
    FILE* fp;

    int line;

    Token* prev_token;
    Token* curr_token;
    Token* next_token;

} Hx_File_Parsing_Context;

typedef struct {
    char* data;
    size_t length;
    size_t capacity;
} String;

String* init_string() {
    String* string = malloc(sizeof(String));
    if (string == NULL) {
        printf("Error: Failed to allocate memory\n");
        exit(EXIT_FAILURE);
    }

    string->length = 1;
    string->capacity = 8;
    string->data = malloc(string->capacity * sizeof(char));
    if (string->data == NULL) {
        printf("Error: Failed to allocate memory\n");
        exit(EXIT_FAILURE);
    }
    string->data[0] = '\0';

    return string;
}

// Function to append a char to a String
String* append_char_to_string(String* string, int ch) {
    if (string->length + 1 >= string->capacity) {
        // Double the capacity
        string->capacity *= 2;

        // Reallocate memory with double the capacity
        string->data = realloc(string->data, string->capacity * sizeof(char));
        if (string->data == NULL) {
            printf("Error: Failed to allocate memory\n");
            exit(EXIT_FAILURE);
        }
    }

    // Append the character to the string
    string->data[string->length] = (char) ch;
    string->data[string->length + 1] = '\0';
    string->length += 1;

    return string;
}

// Function to free the memory allocated for a String
void free_string(String* string) {
    free(string->data);
    free(string);
}

Token* get_next_token(Hx_File_Parsing_Context* ctx, Hx_Compiler_Metadata* meta) {
    // get the next token
    int ch = fgetc(ctx->fp);

    // setup return token
    Hx_Token_Type tok_type;
    String* tok_val = init_string();

    // Handle end of file
    if (ch == EOF) {
        tok_type = TOK_EOF;

        // Handle spaces & newlines. Roll all spaces into a single space token
    } else if (isspace(ch)) {
        do {
            if (ch == '\n') {
                ctx->line++;
            }
            ch = fgetc(ctx->fp);
        } while (isspace(ch));

        ungetc(ch, ctx->fp);
        tok_type = TOK_SPACE;

        // Handle Floats, exponentials and Integers
    } else if (isdigit(ch)) {
        tok_val = append_char_to_string(tok_val, ch);
        tok_type = TOK_INT_LIT;

        // roll all digits into a single float
        do {
            ch = fgetc(ctx->fp);
            if (ch == '_') {
                continue;
            } else if (ch == '.') {
                if (tok_type == TOK_INT_LIT) {
                    tok_val = append_char_to_string(tok_val, ch);
                    tok_type = TOK_FLOAT_LIT;
                } else {
                    ungetc(ch, ctx->fp);
                    break;
                }
            } else if (ch == 'e') {
                if (tok_type == TOK_INT_LIT || tok_type == TOK_FLOAT_LIT) {
                    tok_val = append_char_to_string(tok_val, ch);
                    tok_type = TOK_EXPO_LIT;
                } else {
                    ungetc(ch, ctx->fp);
                    break;
                }
            } else if (isdigit(ch)) {
                tok_val = append_char_to_string(tok_val, ch);
            } else {
                ungetc(ch, ctx->fp);
                break;
            }
        } while (isdigit(ch) || ch == '.' || ch == '_' || ch == 'e');

        // Handle Illegal Characters
    } else {
        tok_val = append_char_to_string(tok_val, ch);
        tok_type = TOK_ILLEGAL;
    }

    Token* res = malloc(sizeof(Token));
    res->type = tok_type;
    res->value = tok_val->data;
    res->line = ctx->line;
    free_string(tok_val);

    return res;
}

// parse_file
int parse_file(Hx_File_Parsing_Context* ctx, Hx_Compiler_Metadata* meta) {
    // advance the tokens to fill the prev, curr, and next tokens
    ctx->prev_token = ctx->curr_token;
    ctx->curr_token = ctx->next_token;
    ctx->next_token = get_next_token(ctx, meta);

    printf("\nAll Tokens in the file = \n");

    do {
        free(ctx->prev_token->value);
        free(ctx->prev_token);

        // advance the tokens in a loop till file ends
        ctx->prev_token = ctx->curr_token;
        ctx->curr_token = ctx->next_token;
        ctx->next_token = get_next_token(ctx, meta);
        // printf("{ type: %i, value: %s, line: %i },\n", ctx->curr_token->type, ctx->curr_token->value, ctx->curr_token->line);
        printf("{ type: %i, value: %s, line: %i },\n", (int)ctx->curr_token->type, ctx->curr_token->value, ctx->curr_token->line);

    } while (ctx->curr_token->type != TOK_EOF);

    return 0;
}

// Section for handling multiple files
int compile_source(char* entry_file) {
    // parse = tokenize + parse + typecheck
    // compile = parse + generate
    // present = compile + present + run

    Hx_File_Parsing_Context ctx;
    ctx.line = 1;

    Hx_Compiler_Metadata meta;
    meta.Hx_Bytes_Compiled = 0;
    meta.Hx_Compiling_Duration = 0;
    meta.Hx_Compiler_Errors = NULL;

    ctx.fp = fopen(entry_file, "r, ccs=UTF-8");
    if (ctx.fp == NULL) {
        printf("Error: Failed to open file\n");
        exit(EXIT_FAILURE);
    }
    printf("Parsing file: %s\n", entry_file);
    int res = parse_file(&ctx, &meta);

    fclose(ctx.fp);

    return res;
}

int main(int argc, char* argv[]) {

    int res = compile_source("input/test.hex");

    return res;
}

This is what I see in the console:

src\hexal.c:251:14: warning: 'fopen' is deprecated: This
      function or variable may be unsafe. Consider using
      fopen_s instead. To disable deprecation, use
      _CRT_SECURE_NO_WARNINGS. See online help for
      details. [-Wdeprecated-declarations]
    ctx.fp = fopen(entry_file, "r, ccs=UTF-8");
             ^
C:\Program Files (x86)\Windows Kits\10\Include\10.0.19041.0\ucrt\stdio.h:212:20: note: 
      'fopen' has been explicitly marked deprecated here
    _Check_return_ _CRT_INSECURE_DEPRECATE(fopen_s)
                   ^
C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\include\vcruntime.h:355:55: note: 
      expanded from macro '_CRT_INSECURE_DEPRECATE'
        #define _CRT_INSECURE_DEPRECATE(_Replacement) _C...
                                                      ^
C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\include\vcruntime.h:345:47: note: 
      expanded from macro '_CRT_DEPRECATE_TEXT'
#define _CRT_DEPRECATE_TEXT(_Text) __declspec(deprecated...
                                              ^
1 warning generated.
   Creating library bin\hexal.lib and object bin\hexal.exp
Parsing file: input/test.hex

C:\Users\risharan\Documents\GitHub\seawitch>cls && clang -g3 -Wall -Wextra -Wconversion -Wdouble-promotion -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion -fsanitize=undefined src\hexal.c -o bin\hexal.exe && bin\hexal.exe

Solution

  • Running your code on my system, I get this runtime error:

    230520-token(77235,0x20a1d8140) malloc: *** error for object 0x40000: pointer being freed was not allocated
    230520-token(77235,0x20a1d8140) malloc: *** set a breakpoint in malloc_error_break to debug
    /bin/sh: line 1: 77235 Abort trap: 6           ./230520-token
    make: *** [230520-token.run] Error 134
    

    Reading your code, I can see these problems:

    • [major] neither ctx nor meta are initialized in main. You should initialize these structures to ensure that all pointers are null:

      Hx_File_Parsing_Context ctx = { 0 };
      Hx_Compiler_Metadata meta = { 0 };
      
    • [major] when freeing the previous token, you should test that it was indeed allocated:

        if (ctx->prev_token) {
            free(ctx->prev_token->value);
            free(ctx->prev_token);
        }
      
    • [major] in init_string, the initial value of string->length should be 0.

    • append_char_to_string does not change its first argument, you do not need to update tok_val as tok_val = append_char_to_string(tok_val, ch);, just use

       append_char_to_string(tok_val, ch);
      
    • the loop to read numbers should just be a for (;;) loop. Using a do / while loop adds redundant tests. Hex numbers are not supported.

    • you should save the line number at the start of get_next_token so the line field is set to the line where the token starts. This will matter more when you parse multiline tokens such as comments and token broken on multiple lines with escaped newlines.

    • you should parse identifiers, operators, strings, character constants, comments...

    • [major] you free the string with free_string(tok_val); but you saved the contents pointer to the token structure, so this pointer becomes invalid. You should allocate a copy of the string with

          res->value = strdup(tok_val->data);
      

    Fixing the above major problems, I get this output:

    All Tokens in the file =
    { type: 6, value: 1, line: 1 },
    { type: 2, value: , line: 2 },
    { type: 7, value: 2.34, line: 2 },
    { type: 2, value: , line: 5 },
    { type: 1, value: x, line: 5 },
    { type: 1, value: x, line: 5 },
    { type: 1, value: y, line: 5 },
    { type: 2, value: , line: 5 },
    { type: 1, value: =, line: 5 },
    { type: 2, value: , line: 5 },
    { type: 8, value: 5e6, line: 5 },
    { type: 2, value: , line: 6 },
    { type: 0, value: , line: 6 },