c memory compiler-construction segmentation-fault lex

Segmentation Fault in Lexer for JACK Language

I am writing a lexer for the JACK language as part of a compiler I am making and I keep getting a segmentation fault in my list of lexemes. I have variable that is a pointer to a pointer to a token that stores the lexeme list. It is passed to two different functions, both of which allocate memory to it. This question is an update to my previous question here with all the code included.

main.c

#include <stdio.h>
#include <stdlib.h>

#include "jlex.h"

int main(int argc, char * argv[])
{
    FILE * sourceFile;
    int lexerStatus;
    token ** tokenList = NULL;

    printf("Attempting to open file...\n");

    if(argc > 1) {
        if(!(sourceFile = fopen(argv[1], "r"))) {
            fprintf(stderr, "Error: Could not open file \'%s\'!\n", argv[1]);
            return FILE_ERROR;
        }
    } else {
        fprintf(stderr, "Error: No input file given!\n");
        return FILE_ERROR;
    }

    printf("Success!\nLexing input file...\n");

    if((lexerStatus = lexer(&tokenList, sourceFile)) != EXEC_SUCCESS) {
        fprintf(stderr, "Error: Failed to lex source file! (%d)\n", lexerStatus);
        return lexerStatus;
    }

    fclose(sourceFile);

    printf("Lexing complete!\n");
    printf("Token Name\tToken Type\tLine Number\n");

    for(token * currToken = tokenList[0]; currToken->type != terminator; currToken++) {
        if(currToken->type == integer || currToken->type == keyword || currToken->type == identifier)
            printf("%s", currToken->string);
        else
            putchar(currToken->character);

        printf("\t\t%d\t\t%d\n", currToken->type, currToken->lineNum);
    }

    return EXEC_SUCCESS;
}

jlex.h

#ifndef JLEX_H
#define JLEX_H

#include <stdio.h>  /* Required for FILE data type */

#define EXEC_SUCCESS 0
#define FILE_ERROR 1
#define MEM_ERROR 2
#define LEX_ERROR 3

#define DEFAULT_LIST_SIZE 1024

typedef enum tokenTypes { keyword, identifier, operator, string, integer, punctuator, terminator } tokenName;

typedef struct token {
    union {
        char * string;
        int character;
    };
    tokenName type;
    int lineNum;
} token;

extern const char * const keywords[];
extern const char * const operators;
extern const char * const punctuators;

int addTokenToList(token * nextToken, token *** tokenList);
int getNextToken(token * nextToken, FILE * sourceFile);
int lexer(token *** tokenList, FILE * sourceFile);

#endif

jlex.c

#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include "jlex.h"

const char * const tokenTypeNames[] = { "keyword", "identifier", "operator", "string", "integer", "punctuator", "terminator" };

const char * const keywords[] = {   "boolean",
                                    "char",
                                    "class",
                                    "constructor",
                                    "do",
                                    "else",
                                    "false",
                                    "field",
                                    "function",
                                    "if",
                                    "int",
                                    "method",
                                    "null",
                                    "return",
                                    "static",
                                    "true",
                                    "this"
                                    "var",
                                    "void",
                                    "while" };

const char * const operators = "+-*/&|~<>+=";

const char * const punctuators = "({[)}],.;";

static inline bool isoperator(int c)
{
    for(unsigned int i = 0; i < strlen(operators); i++)
        if(c == operators[i])
            return true;

    return false;
}

static inline bool ispunctuator(int c)
{
    for(unsigned int i = 0; i < strlen(punctuators); i++)
        if(c == punctuators[i])
            return true;

    return false;
}

static inline bool iskeyword(char * string)
{
    for(unsigned int i = 0; i < sizeof(keywords) / sizeof(char*); i++)
        if(!strcmp(keywords[i], string))
            return true;

    return false;
}

int getNextToken(token * nextToken, FILE * sourceFile)
{
    /*
     *  Skip all whitespace and comments
     *  From first token try and determine token type (possible if an operator, punctuator, terminator, integer, or terminator)
     *  If token is determined then build it and return lexer status
     *  if not then keep reading until a full token can be contstructed
     *  Change chosen delimiters based on what kind of token we think we're reading
     *  Return lexer status (might fail if invalid lexeme is detected, i.e. a number followed by letters)
     */

    int c;
    static int lineNum = 1;

    do {
        c = fgetc(sourceFile);

        if(c == '\n')
            lineNum++;

    } while((c == '\n') || (c == '\t') || (c == ' '));

    nextToken->lineNum = lineNum;
    nextToken->character = c;

    if(c == EOF) {
        nextToken->type = terminator;
        return EXEC_SUCCESS;
    }

    if(isoperator(c)) {
        nextToken->type = operator;
        return EXEC_SUCCESS;
    }

    if(ispunctuator(c)) {
        nextToken->type = punctuator;
        return EXEC_SUCCESS;
    }

    /* If we get to this point then c is not a single character lexeme so we need to allocate some space for it in the token */

    if(!(nextToken->string = malloc(1024 * sizeof(char))))
        return MEM_ERROR;

    int pos = 0;

    if(isdigit(c)) { 
        do {
            nextToken->string[pos++] = c;
            c = fgetc(sourceFile);
        } while(isdigit(c) && pos < 1023);

        nextToken->string[pos] = '\0';

        if(!isoperator(c) && !ispunctuator(c) && !isspace(c))
            return LEX_ERROR;

        nextToken->type = integer;

        return EXEC_SUCCESS;
    }

    /* If we get to this point then we have to be reading an identifier or a keyword */

    do {
        nextToken->string[pos++] = c;
        c = fgetc(sourceFile);
    } while((isalpha(c) || isdigit(c) || c == '_') && pos < 1023);

    nextToken->string[pos] = '\0';

    if(iskeyword(nextToken->string))
        nextToken->type = keyword;
    else
        nextToken->type = identifier;

    return EXEC_SUCCESS;
}

int addTokenToList(token * nextToken, token *** tokenList)
{
    static unsigned int listSize = DEFAULT_LIST_SIZE;
    static unsigned int tokenNum = 0;

    if(listSize <= tokenNum) {
        listSize *= 2;
        if(!(*tokenList = realloc(*tokenList, listSize * sizeof(token *)))) /* If the list isn't large enough then double its size */
            return MEM_ERROR;
    }

    if(!(tokenList[tokenNum] = malloc(sizeof(token)))) /* Allocate memory for the data we are about to copy */
        return MEM_ERROR;

    memcpy(tokenList[tokenNum++], nextToken, sizeof(token)); /* Copy token into the array */

    if(nextToken->type == terminator)
        if(!(*tokenList = realloc(*tokenList, tokenNum  * sizeof(token *)))) /* After EOF we know what the final size of the list is so resize it appropriately */
            return MEM_ERROR;

    return EXEC_SUCCESS;
}

int lexer(token *** tokenList, FILE * sourceFile)
{
    int status;
    token nextToken;

    if(!(*tokenList = malloc(DEFAULT_LIST_SIZE * sizeof(token *))))
        return MEM_ERROR;

    do {
        status = getNextToken(&nextToken, sourceFile);

        if(addTokenToList(&nextToken, tokenList) != EXEC_SUCCESS)
            status = MEM_ERROR;

    } while(nextToken.type != terminator && status == EXEC_SUCCESS);

    return status;
}

Test file:

jackExample.jack

class Main {
    function void main () {
        var Array a;
        var int length;
        var int i, sum;

        let length = Keyboard.readInt();
        let a = Array.new(length);
        let i = 0;

        while (i < length) {
            let a[i] = Keyboard.readInt();
            let sum = sum + a[i];
            let i= i+1;
        }

        do Output.printString();
        do Output.printInt(sum / length);
        do Output.println();
        return;
    }
}

When run on that source file the program outputs the following:

Attempting to open file...
Success!
Lexing input file...
Lexing complete!
Token Name  Token Type  Line Number
class       0           1
            1041        0
Segmentation fault

The Valgrind output identifies multiple errors associated with the call to malloc() in the addTokenToList() function.

Note: The above JACK source file is not valid JACK but a version which the lexer in its current state should be able to process. It is not yet able to deal with string literals and comments.

Solution

You missed 2 times to dereference tokenList in addTokenToList :

if(!((*tokenList)[tokenNum] = malloc(sizeof(token)))) /* Allocate memory for the data we are about to copy */

memcpy((*tokenList)[tokenNum++], nextToken, sizeof(token)); /* Copy token into the array */

In main the loop to write the tokens is wrong, must be for instance

int i = 0;

for(token * currToken = tokenList[i]; currToken->type != terminator; currToken = tokenList[++i]) {

Now the execution has no error except of course the memory leaks :

pi@raspberrypi:/tmp $ valgrind ./a.out jackExample.jack 
==17597== Memcheck, a memory error detector
==17597== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==17597== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==17597== Command: ./a.out jackExample.jack
==17597== 
Attempting to open file...
Success!
Lexing input file...
Lexing complete!
Token Name  Token Type  Line Number
class       0       1
Main        1       1
{       5       1
function        0       2
void        0       2
main        1       2
(       5       2
)       5       2
{       5       2
var     1       3
Array       1       3
a       1       3
var     1       4
int     0       4
length      1       4
var     1       5
int     0       5
i       1       5
sum     1       5
let     1       7
length      1       7
=       2       7
Keyboard        1       7
readInt     1       7
)       5       7
;       5       7
let     1       8
a       1       8
=       2       8
Array       1       8
new     1       8
length      1       8
;       5       8
let     1       9
i       1       9
=       2       9
0       4       9
while       0       11
(       5       11
i       1       11
<       2       11
length      1       11
{       5       11
let     1       12
a       1       12
i       1       12
=       2       12
Keyboard        1       12
readInt     1       12
)       5       12
;       5       12
let     1       13
sum     1       13
=       2       13
sum     1       13
+       2       13
a       1       13
i       1       13
;       5       13
let     1       14
i       1       14
i       1       14
1       4       14
}       5       15
do      0       17
Output      1       17
printString     1       17
)       5       17
;       5       17
do      0       18
Output      1       18
printInt        1       18
sum     1       18
/       2       18
length      1       18
;       5       18
do      0       19
Output      1       19
println     1       19
)       5       19
;       5       19
return      0       20
}       5       21
}       5       22
==17597== 
==17597== HEAP SUMMARY:
==17597==     in use at exit: 58,704 bytes in 142 blocks
==17597==   total heap usage: 147 allocs, 5 frees, 88,496 bytes allocated
==17597== 
==17597== LEAK SUMMARY:
==17597==    definitely lost: 340 bytes in 1 blocks
==17597==    indirectly lost: 58,364 bytes in 141 blocks
==17597==      possibly lost: 0 bytes in 0 blocks
==17597==    still reachable: 0 bytes in 0 blocks
==17597==         suppressed: 0 bytes in 0 blocks
==17597== Rerun with --leak-check=full to see details of leaked memory
==17597== 
==17597== For counts of detected and suppressed errors, rerun with: -v
==17597== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 6 from 3)