Search code examples
ctrie

Why is my trie leaking data?


I am trying to make a speller checker using a trie, but the words I try to add to my trie don't seem to be getting put in. Where is the leak?

I've spent hours using a debugger and stepping through my code...

Main Function:

/**
 * Implements a spell-checker.
 */

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

#include "dictionary.h"
#undef calculate
#undef getrusage

// default dictionary
#define DICTIONARY "dictionaries/large"

// prototype
double calculate(const struct rusage *b, const struct rusage *a);

int main(int argc, char *argv[])
{
    // check for correct number of args
    if (argc != 2 && argc != 3)
    {
        printf("Usage: speller [dictionary] text\n");
        return 1;
    }

    // structs for timing data
    struct rusage before, after;

    // benchmarks
    double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

    // determine dictionary to use
    char* dictionary = (argc == 3) ? argv[1] : DICTIONARY;

    // load dictionary
    getrusage(RUSAGE_SELF, &before);
    bool loaded = load(dictionary);
    getrusage(RUSAGE_SELF, &after);

    // abort if dictionary not loaded
    if (!loaded)
    {
        printf("Could not load %s.\n", dictionary);
        return 1;
    }

    // calculate time to load dictionary
    time_load = calculate(&before, &after);

    // try to open text
    char *text = (argc == 3) ? argv[2] : argv[1];
    FILE *fp = fopen(text, "r");
    if (fp == NULL)
    {
        printf("Could not open %s.\n", text);
        unload();
        return 1;
    }

    // prepare to report misspellings
    printf("\nMISSPELLED WORDS\n\n");

    // prepare to spell-check
    int index = 0, misspellings = 0, words = 0;
    char word[LENGTH+1];

    // spell-check each word in text
    for (int c = fgetc(fp); c != EOF; c = fgetc(fp))
    {
        // allow only alphabetical characters and apostrophes
        if (isalpha(c) || (c == '\'' && index > 0))
        {
            // append character to word
            word[index] = c;
            index++;

            // ignore alphabetical strings too long to be words
            if (index > LENGTH)
            {
                // consume remainder of alphabetical string
                while ((c = fgetc(fp)) != EOF && isalpha(c));

                // prepare for new word
                index = 0;
            }
        }

        // ignore words with numbers (like MS Word can)
        else if (isdigit(c))
        {
            // consume remainder of alphanumeric string
            while ((c = fgetc(fp)) != EOF && isalnum(c));

            // prepare for new word
            index = 0;
        }

        // we must have found a whole word
        else if (index > 0)
        {
            // terminate current word
            word[index] = '\0';

            // update counter
            words++;

            // check word's spelling
            getrusage(RUSAGE_SELF, &before);
            bool misspelled = !check(word);
            getrusage(RUSAGE_SELF, &after);

            // update benchmark
            time_check += calculate(&before, &after);

            // print word if misspelled
            if (misspelled)
            {
                printf("%s\n", word);
                misspellings++;
            }

            // prepare for next word
            index = 0;
        }
    }

    // check whether there was an error
    if (ferror(fp))
    {
        fclose(fp);
        printf("Error reading %s.\n", text);
        unload();
        return 1;
    }

    // close text
    fclose(fp);

    // determine dictionary'size
    getrusage(RUSAGE_SELF, &before);
    unsigned int n = size();
    getrusage(RUSAGE_SELF, &after);

    // calculate time to determine dictionary's size
    time_size = calculate(&before, &after);

    // unload dictionary
    getrusage(RUSAGE_SELF, &before);
    bool unloaded = unload();
    getrusage(RUSAGE_SELF, &after);

    // abort if dictionary not unloaded
    if (!unloaded)
    {
        printf("Could not unload %s.\n", dictionary);
        return 1;
    }

    // calculate time to unload dictionary
    time_unload = calculate(&before, &after);

    // report benchmarks
    printf("\nWORDS MISSPELLED:     %d\n", misspellings);
    printf("WORDS IN DICTIONARY:  %d\n", n);
    printf("WORDS IN TEXT:        %d\n", words);
    printf("TIME IN load:         %.2f\n", time_load);
    printf("TIME IN check:        %.2f\n", time_check);
    printf("TIME IN size:         %.2f\n", time_size);
    printf("TIME IN unload:       %.2f\n", time_unload);
    printf("TIME IN TOTAL:        %.2f\n\n", 
     time_load + time_check + time_size + time_unload);

    // that's all folks
    return 0;
}

/**
 * Returns number of seconds between b and a.
 */
double calculate(const struct rusage *b, const struct rusage *a)
{
    if (b == NULL || a == NULL)
    {
        return 0.0;
    }
    else
    {
        return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
                 (b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
                ((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
                 (b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
                / 1000000.0);
    }
}

Functions that check to see if a word is in the trie:

/**
 * Implements a dictionary's functionality.
 */

#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>

#include "dictionary.h"

int dictionary_size;

bool find(char *word, node *next)
{
    word++;
    if (next == NULL)
    {
        return false;
    }
    else if (word[0] == '\0')
    {
        if (next->is_word == true)
        {
            return true;
        }
        else
        {
            return false;
        }
    }
    else
    {
        //go to this nodes chold that is now eual to the first letter of the word
        if (word[0] == '\'')
        {
            return find(word, next->children[26]);
        }
        else
        {
            return find(word, next->children[word[0] - 'a']);
        }
    }
}

/**
 * Returns true if word is in dictionary else false.
 */
bool check(const char *word)
{
    //get word from text and make it malluable
    char w[47];
    memcpy(w, word, strlen(word) + 1);
    //make sure it's all lower case
    for (int i = 0; i < strlen(word); i++)
    {
        w[i] = tolower(w[i]);
    }
    //go to the root child node equal to the first letter of the word
    if (find(w, root))
    {
        return true;
    }
    else
    {
        return false;
    }
}

functions that load a dictionary of words into the trie. (This where the leak is i'm guessing?):

struct node *newNode(char *word, node *next)
{
    next = malloc(sizeof(node));
    word++;
    if (word[0] == '\0')
    {
        next->is_word = true;
        return next;
    }
    else
    {
        node *new_node = NULL;
        // go to this nodes choild that is now equal to the first letter of the word
        if (word[0] == '\'')
        {
           return next->children[26] = newNode(word, new_node);
        }
        else
        { 
           return next->children[word[0] - 97] = newNode(word, new_node);
        }
    }
}


/**
 * Loads dictionary into memory. Returns true if successful else false.
 */
bool load(const char *dictionary)
{
    dictionary_size = 0;
    FILE *dic = fopen(dictionary, "r");
    if (dic == NULL)
    {
        return false;
    }
    // Initalize root node
    root = malloc(sizeof(node));

    //get word from dictinary
    int ch;
    while ((ch = fgetc(dic)) != EOF)
    {
        char word[47];
        fgets(word, 47, dic);
        dictionary_size++;
        // make sure it's all lower case
        for (int i = 0; i < 47; i++)
        {
            word[i] = tolower(word[i]);
        }
        // get rid of new line char
        char *pos;
        if ((pos = strchr(word, '\n')) != NULL)
        {
            *pos = '\0';
        }
        printf("%s\n", word);
        //go to root nodes child that is equal to the first letter of the word
        node *child_node = NULL;
        root->children[word[0] - 'a'] = newNode(word, child_node);
    }
return true;
}

/**
 * Returns number of words in dictionary if loaded else 0 if not yet loaded.
 */
unsigned int size(void)
{
    if (dictionary_size != 0)
    {
        return dictionary_size;
    }
    return 0;
}

void free_node(node *next)
{
    // safety including root node
    if(!next) return;   

    // takes you to end of trie
    for (int i = 0; i < 26; i++)
    {
       free_node(next->children[i]);
    }

    // base case
    free(next);
}

/**
 * Unloads dictionary from memory. Returns true if successful else false.
 */
bool unload(void)
{
    free_node(root);
    return true;
}

Defining the node struct:

typedef struct node
{
    bool is_word;
    struct node *children[27]; 
}
node;

node *root;

Solution

  • This code works using radically rewritten addWord() function in place of addNode().

    /* SO 3994-9116 */
    #include <assert.h>
    #include <ctype.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    typedef struct node
    {
        bool is_word;
        struct node *children[27];
    } node;
    
    static node *root = 0;
    static int dictionary_size = 0;
    
    static void addWord(char *word, node *trie)
    {
        assert(islower((unsigned char)word[0]) || word[0] == '\0');
        if (word[0] == '\0')
            trie->is_word = true;
        else
        {
            int code = word[0] - 'a';
            if (trie->children[code] == 0)
            {
                trie->children[code] = calloc(sizeof(node), 1);
                if (trie->children[code] == 0)
                {
                    fprintf(stderr, "Memory allocation failed\n");
                    exit(1);
                }
            }
            addWord(&word[1], trie->children[code]);
        }
    }
    
    static
    bool load(const char *dictionary)
    {
        dictionary_size = 0;
        FILE *dic = fopen(dictionary, "r");
        if (dic == NULL)
            return false;
        root = calloc(1, sizeof(node));
        if (root == 0)
        {
            fprintf(stderr, "Memory allocation failed\n");
            exit(1);
        }
    
        char word[47];
        while (fgets(word, sizeof(word), dic) != 0)
        {
            char *pos;
            if ((pos = strchr(word, '\n')) != NULL)
                *pos = '\0';
            dictionary_size++;
            for (int i = 0, len = strlen(word); i < len; i++)
            {
                if (!isalpha((unsigned char)word[i]))
                    fprintf(stderr, "Non-alphabetic character in '%s'\n", word);
                else
                    word[i] = tolower((unsigned char)word[i]);
            }
            addWord(word, root);
        }
        return true;
    }
    
    static void print_subtrie(const char *prefix, char nchar, node *child)
    {
        if (child != 0)
        {
            int len = strlen(prefix);
            char buffer[len + 2];
            strcpy(buffer, prefix);
            buffer[len] = nchar;
            buffer[len+1] = '\0';
            if (child->is_word)
                printf("%s\n", buffer);
            for (int i = 0; i < 26; i++)
                print_subtrie(buffer, 'a' + i, child->children[i]);
        }
    }
    
    static void print_trie(node *top)
    {
        for (int i = 0; i < 26; i++)
            print_subtrie("", 'a' + i, top->children[i]);
    }
    
    static void free_trie(node *trie)
    {
        if (trie != 0)
        {
            for (int i = 0; i < 27; i++)
            {
                if (trie->children[i] != 0)
                    free_trie(trie->children[i]);
            }
            free(trie);
        }
    }
    
    int main(void)
    {
        if (load("words.list"))
        {
            printf("Nominal dictionary size: %d\n", dictionary_size);
            print_trie(root);
            free_trie(root);
        }
        return 0;
    }
    

    Test File 1:

    aardvark
    abelone
    assignation
    assignment
    assigned
    assert
    assertion
    

    Output File 1:

    Nominal dictionary size: 7
    aardvark
    abelone
    assert
    assertion
    assignation
    assigned
    assignment
    

    Test File 2:

    Mike
    Aquarius
    delta
    assert
    zulu
    assertion
    alpha
    aardvark
    abelone
    culmination
    Zulu
    kilo
    amniocentesis
    Oscar
    Yankee
    cabal
    echo
    Sierra
    duck
    Charlie
    Quebec
    centre
    assigned
    Papa
    foxtrot
    uMBRelLA
    Romeo
    cab
    India
    Juliet
    bravo
    amniocentesis
    hotel
    amniocentesis
    Lima
    Whisky
    cab
    culminate
    cold
    Xray
    cab
    cabs
    assignation
    cam
    Victor
    golf
    Tango
    November
    cinema
    cabbie
    assignment
    

    Output File 2:

    Nominal dictionary size: 56
    a
    aardvark
    abelone
    alpha
    amniocentesis
    aquarius
    as
    assert
    assertion
    assign
    assignation
    assigned
    assignment
    bravo
    cab
    cabal
    cabbie
    cabs
    cam
    centre
    charlie
    cinema
    cold
    culminate
    culmination
    delta
    duck
    echo
    foxtrot
    golf
    hotel
    i
    india
    juliet
    kilo
    lima
    mike
    november
    o
    oscar
    papa
    quebec
    romeo
    sierra
    tango
    umbrella
    victor
    whisky
    xray
    yankee
    zulu
    

    Note that the 'nominal size' of 56 is too large; there are 46 distinct words in the input and hence in the output.