Search code examples
c

Trying to make a tree containing every english word, but only one word is showing up in the tree


also if you have any criticisms please share.

the search function does a depth first search throughout the tree but it's showing connections as null pointers that I've checked weren't null in the while loop in the main function that creates the tree. This results in the last word in the word bank to be the only word present in the tree.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define ALPHSIZE 26
#define WORDBANKNAME "testwordbank.txt"

int hashy(int c){
    return (int) c-97;
};

struct Node{
    bool end;   
    char c;
    struct Node** connections;
};

void search(struct Node* head,int depth){
    depth++;
    for(int i = 0; i < ALPHSIZE; i++){
        printf("depth %i checking %c = %p\n",depth,i+97,head->connections[i]);
        if (head->connections[i]){
            if(head->connections[i]->end) printf("end!!: ");
            printf("%c\n",head->connections[i]->c);
            search(head->connections[i],depth);
        }
    }
};

struct Node* createNode(char c) {
    struct Node* newNode = malloc(sizeof(struct Node));
    newNode->end = false;
    newNode->c = c;
    newNode->connections = malloc(ALPHSIZE*sizeof(struct Node*));
    for (int i = 0; i < ALPHSIZE; i++) {
        newNode->connections[i] = NULL;
    }
    return newNode;
}


int main(){
    FILE* wordbank = fopen(WORDBANKNAME,"r");
    struct Node*root = createNode((char)1);
    
    struct Node* head = root;
    int nextchar = getc(wordbank);
    
  
    while(nextchar != EOF){
        if (nextchar == '\n'){
            head->end = true; 
            head = root;
            nextchar = getc(wordbank);
            continue;
        }
        int cindex = hashy(nextchar);
        head->connections[cindex] = createNode(nextchar);
        head = head->connections[cindex];
        nextchar = getc(wordbank);
    }
    printf("-----------\n");
    search(root,0);

    fclose(wordbank);
    return 0;
};

I think it might be a problem with overwriting/allocating in use memory.


Solution

  • Your code assumes that int cindex = hashy(nextchar) returns an cindex >= 0 && cindex < ALPHSIZE. Which means the input file testword.txt must only contain lowercase words that are terminated with newline. As you didn't supply sample input I added checks for that.

    Use character constants ('a') instead of magic values (97). ALPHSIZE would benefit for that, too.

    Always check return values.

    To simplify the answer I changed your program to accept the wordlist from stdin and for to only generate output if head->connections[i] isn't NULL. Here's a counter example that only the first word is stored (i.e. works as expected):

    $ printf 'a\nb\n' | ./a.out
    depth 1 checking a = 0x561595e103b0
    end!!: a
    depth 1 checking b = 0x561595e104b0
    end!!: b
    

    Here's an example failure:

    $ printf 'aa\nab\n' | ./a.out 
    depth 1 checking a = 0x558aeb3405b0
    a
    depth 2 checking b = 0x558aeb3406b0
    end!!: b
    

    With the first word we create the tree 'a->a (end)' and for the 2nd word we create the code tree a->b (end) but lost the node for the 'a' of the first word. In other words the problem is that we only keep the last word that have any prefix in common with previous words (which are then lost). Pro tip: a precise problem description is extremely helpful when fixing defects. The resolution is to reuse existing nodes:

            if(!head->connections[cindex]) {
                head->connections[cindex] = createNode(nextchar);
                if(!head->connections[cindex]) {
                    perror("");
                    goto err;
                }
            }
            head = head->connections[cindex];
    

    which now returns the expected result:

    $ echo -e 'aa\nab\n' | ./a.out 
    -----------
    depth 1 checking a = 0x5597d682b3b0
    a
    depth 2 checking a = 0x5597d682b4b0
    end!!: a
    depth 2 checking b = 0x5597d682b5b0
    end!!: b
    

    Here is revised program with additional error handling:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    
    #define ALPHSIZE ('z' - 'a' + 1)
    #define WORDBANKNAME "testwordbank.txt"
    
    int hashy(int c) {
        return (int) c-'a';
    };
    
    struct Node{
        bool end;
        char c;
        struct Node** connections;
    };
    
    void search(struct Node* head, int depth) {
        depth++;
        for(int i = 0; i < ALPHSIZE; i++){
            if (head->connections[i]){
                printf("depth %i checking %c = %p\n", depth, i+'a', head->connections[i]);
                if(head->connections[i]->end)
                    printf("end!!: ");
                printf("%c\n", head->connections[i]->c);
                search(head->connections[i], depth);
            }
        }
    };
    
    struct Node* createNode(char c) {
        struct Node* newNode = malloc(sizeof *newNode);
        if(!newNode) {
            perror("");
            return NULL;
        }
        newNode->end = false;
        newNode->c = c;
        newNode->connections = malloc(sizeof *newNode->connections * ALPHSIZE);
        if(!newNode->connections) {
            perror("");
            return NULL;
        }
        for (int i = 0; i < ALPHSIZE; i++)
            newNode->connections[i] = NULL;
        return newNode;
    }
    
    
    int main() {
        struct Node *root = createNode(1);
        if(!root) {
            perror("");
            goto err;
        }
        struct Node* head = root;
        int nextchar = getc(stdin);
        while(nextchar != EOF) {
            if (nextchar == '\n'){
                head->end = true;
                head = root;
                nextchar = getc(stdin);
                continue;
            }
            int cindex = hashy(nextchar);
            if(cindex < 0 || cindex >= ALPHSIZE) {
                fprintf(stderr, "invalid input %c", nextchar);
                goto err;
            }
            if(!head->connections[cindex]) {
                head->connections[cindex] = createNode(nextchar);
                if(!head->connections[cindex]) {
                    perror("");
                    goto err;
                }
            }
            head = head->connections[cindex];
            nextchar = getc(stdin);
        }
        printf("-----------\n");
        search(root, 0);
        // freeTree();
        return 0;
    err:
        // freeTree();
        return 1;
    };
    

    Btw, a great way to find these types of problems is to implement freeTree() to deallocate everything then check with valgrind or similar tools if you have any memory leaks.