Search code examples
ctextbinary-treewords

Why is my dictionary not counting occurrences correctly?


I'm entering words into a binary tree and also counting the number of times they appear.

This is the text file: http://pastebin.com/FY9ZTQX6

Here is my code so far. It all happens in the insert() function:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/*
Name: Marcus Lorenzana
Assignment: Final
*/

/*binary tree struct to hold left and right node
as well as the word and number of occurrences*/
typedef struct node
{
    char *word;
    int count;
    struct node *left;
    struct node *right;
}
node;

//,.:;-
int punctuation[4] = {0};

/*Function Prototypes*/
void insert(node ** dictionary, node * entry);
char* readFile(char* filename);
void printDictionary(node * tree);
void printPunctuation(); 
void toLower(char** word);
char *replace(char *st, char *orig, char *repl);; 



int main()
{

    char *word;
    char* filecontents = readFile("data.txt");
    FILE *fp = fopen("data.txt","r");


    //create dictionary node
    node *dictionary;
    node *entry;

    //read words and punctuation in from the text file
    word = strtok (filecontents, " \n");

    dictionary = NULL;

    //create and add lowercase entries into the dictionary
    //by calling insert function 
    while (word != NULL)
    {

        //store punctuation in it's own array of occurrences
        //since there are only 4 to watch out for
        if (word[0]==',') {
            punctuation[0]++; 
            word = strtok (NULL, " \n");
        } else if (word[0]=='.')  {
            punctuation[1]++; 
            word = strtok (NULL, " \n");
        }
         else if (word[0]==';')  {
            punctuation[2]++; 
            word = strtok (NULL, " \n");
        }
         else if (word[0]=='-')  {
            punctuation[3]++; 
            word = strtok (NULL, " \n");
        } 
         else if (word[0]=='$') {
            break;
        }
        //use binary tree to store unique words
         else {
            //convert word to lowercase first, so capital words
            //won't have precedence
            word = strlwr(word);
            //create entry node and call insert with dictionary being call by reference
            //so that is is mutable
            entry = (node *) malloc(sizeof(node));
            entry->left = entry->right = NULL;
            entry->word = malloc(sizeof(char)*(strlen(word)+1));
            entry->word = word;
            insert(&dictionary,entry);
            word = strtok (NULL, " \n");
        }

    }
    //print the dictionary of words and their number of occurrences
    printDictionary(dictionary);
    //print the punctuation and their number of occurrences
    printPunctuation();


    //find word to be replaced

    int found = 0;
    char buffer[250];
    char src[50] , dest[50];
    while(fgets(buffer , sizeof(buffer) , fp) != NULL)
    {
        if(strchr(buffer , '<'))
        {
            found++;
            break;
        }
    }
    if(found)
    {
        fscanf(fp , "%s < %s" , src , dest);
    }

    //example of replace()
    FILE *fout = fopen("newdata.txt","w");
    filecontents = readFile("data.txt");

    fprintf(fout,"%s",replace(filecontents,src,dest));

    fclose(fout);
    fclose(fp);

    return 0;
}

/*inserts an entry into the dictionary in lexicographical order*/
void insert(node ** dictionary, node * entry)
{
    //if there are no entries, set dictionary point
    //to new one and set count to 1 
    if(!(*dictionary))
    {
        *dictionary = entry;
        (*dictionary)->count=1;
        return;
    }

    //compare word to see if it of higher
    //or lower alphabetical order
    int result = strcmp(entry->word,(*dictionary)->word);

    //if it is lower, place on the left
    if(result<0)
    {

        insert(&(*dictionary)->left, entry);
        (*dictionary)->count=1; 


    }
    //if it is higher, place on the right
    if(result>0)
    {

        insert(&(*dictionary)->right, entry);
        (*dictionary)->count=1; 


    }
    //if it is equal, don't create a new entry but update the count
    if(result == 0)
    {
        (*dictionary)->count++; 
    }

}

/*put file contents in string for strtok*/
char* readFile(char* filename)
{
    FILE* file = fopen(filename,"r");
    if(file == NULL)
    {
        return NULL;
    }

    fseek(file, 0, SEEK_END);
    long int size = ftell(file);
    rewind(file);

    char* content = calloc(size + 1, 1);

    fread(content,1,size,file);

    return content;
}

/*prints the dictionary in lexicographical order
and number of occurrences for each word*/
void printDictionary(node * dictionary)
{
    //traverse dictionary in lexicographical order
    if(dictionary->left)
    {
        printDictionary(dictionary->left);
    }
    //print word and number of occurrences
    printf("%s\n",dictionary->word);
    printf("=%d\n",dictionary->count); 

    if(dictionary->right)
    {
        printDictionary(dictionary->right);
    }
}

/*prints the punctuation and number of occurrences*/
void printPunctuation(){
    //,.:;-
    printf("\n, = %d",punctuation[0]); 
    printf("\n. = %d",punctuation[1]); 
    printf("\n; = %d",punctuation[2]); 
    printf("\n- = %d",punctuation[3]); 

}

/*replace word*/
char *replace(char *st, char *orig, char *repl)
{
    static char buffer[2000];
    char *ch;
    if (!(ch = strstr(st, orig)))
        return st;
    strncpy(buffer, st, ch-st);
    buffer[ch-st] = 0;
    sprintf(buffer+(ch-st), "%s%s", repl, ch+strlen(orig));
    return buffer;
}

This is the output I'm getting: http://pastebin.com/8qSPQkiM


Solution

  • I'd start by changing this:

            entry->word = malloc(sizeof(char)*(strlen(word)+1));
            entry->word = word;
    

    to this:

            entry->word = malloc(sizeof(char)*(strlen(word)+1));
            strcpy(entry->word, word);
    

    As it is now, you're leaking memory and storing a stack-var pointer, which eventually is invalid. You're code has other problems, but this is likely the most immediate problem you're facing.

    Next, in insert(), you're incorrectly resetting counters on intermediate nodes. This:

    //if it is lower, place on the left
    if(result<0)
    {
        insert(&(*dictionary)->left, entry);
        (*dictionary)->count=1; // <=== SHOULD NOT BE HERE
    }
    
    //if it is higher, place on the right
    if(result>0)
    {
        insert(&(*dictionary)->right, entry);
        (*dictionary)->count=1; // <=== SHOULD NOT BE HERE
    }
    
    //if it is equal, don't create a new entry but update the count
    if(result == 0)
    {
        (*dictionary)->count++; 
    }
    

    should be this:

    if(result<0)
    {
        insert(&(*dictionary)->left, entry);
    }
    else if (result>0)
    {
        insert(&(*dictionary)->right, entry);
    }
    else
    {   // found it. update the count
        (*dictionary)->count++; 
    }