Search code examples
Tags: c, unique, counting, cpu-word

Unique Words Counter in C


I'm writing a small C program that should count the unique words in a text. To do this, I keep a word book that stores all the words found so far. It should only add words that aren't already in it, but it keeps adding every word that is entered. How can I fix this, and how can I get rid of all the empty entries in my word book "woerterbuch"?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/* Splits Text into words and hands each one to suche_wort(). */
char lies_wort(char *Text);
/* Searches woerterbuch for wort; calls neues_wort() when the search did not return. */
char suche_wort(char *wort);
/* Stores wort in the first free slot of woerterbuch. */
char neues_wort(char *wort);
/* Word book. As a file-scope array, all 1000 slots start out as null pointers. */
char *woerterbuch[1000];

/* Reads one line from stdin, splits it into words and prints every slot of the word book. */
int main(void)
{
    char Text[1000];
    printf("Bitte Text eingeben : \n") ;
    /* NOTE(review): the return value of fgets() is not checked. */
    fgets (Text, 1000, stdin);
    lies_wort(Text);
    int i;
    /* Walks all 1000 slots, not only the used ones. Unused slots are still
       null pointers, and passing a null pointer to %s is undefined behaviour. */
    for(i=0;i<1000;i++){
        printf("woerterbuch :%s\n",woerterbuch[i]);}
}
/* Splits Text in place at the delimiters " ,.!?;:" and passes every token to
   suche_wort(), printing each token with a running number.
   NOTE(review): declared to return char, but there is no return statement.
   NOTE(review): '\n' is not among the delimiters, so the newline that fgets()
   keeps stays attached to the last word. */
char lies_wort(char *Text){
    char *wort;
    int i=1;    /* running number for the "gefunden" output */
    wort = strtok(Text, " ,.!?;:");
    while(wort != NULL) {
        suche_wort(wort);
        printf("gefunden %d: %s\n", i++, wort);
        /* a NULL first argument continues scanning the string from the first call */
        wort = strtok(NULL, " ,.!?;:");}
}
/* Meant to look wort up in woerterbuch and to add it only when it is missing.
   Always returns 0. */
char suche_wort(char *wort)
{
    int i;
    /* NOTE(review): the condition i>1000 is already false for i = 0, so the
       loop body never runs and every word is passed on to neues_wort().
       i<1000 was presumably intended; the loop would then also have to stop
       at the first null slot, because strcmp() must not be given a null pointer. */
    for (i = 0; i>1000; i++){
        if (!strcmp(woerterbuch[i],wort)){return 0;}}
    neues_wort(wort);
    return 0;
}
/* Puts wort into the first empty (null) slot of woerterbuch and returns 0.
   NOTE(review): only the pointer is stored, not a copy of the characters, so
   the entry keeps pointing into the caller's buffer.
   NOTE(review): when no slot is free, the function ends without a return statement. */
char neues_wort(char *wort)
{
    int i;
    for (i=0; i<1000; i++){
        if(woerterbuch[i]==0){
            woerterbuch[i]=wort;
            return 0;}}
}

For testing, the program just prints all the words in "woerterbuch" so I can check whether it works.


Solution

  • I believe you have some issues in your code:

    Firstly, in this line:

    woerterbuch[i]=wort;
    

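    This only stores the address held in wort in woerterbuch[i]; no copy of the characters is made, so the entry keeps pointing into the input buffer, and this will lead to wrong results as soon as that buffer changes. Instead, you need to allocate space for woerterbuch[i], via malloc or strdup.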

    You can allocate space for a single pointer like this:

    woerterbuch[i] = malloc(strlen(wort)+1);
    

    Note: It is always good to check the return from malloc(), and free() these pointers at the end.

    Now, since the pointer is pointing somewhere, you can copy stuff into it. You can use strcpy to do this. If you want to skip this copy step, you can just use strdup() instead.

    Secondly, instead of globally defining char *woerterbuch[1000];, you can manage this array of pointers in a struct:

    /* Bundles the array of word pointers with a counter in one object. */
    typedef struct {
        char *woerterbuch[1000];   /* the word pointers */
        size_t n;                  /* NOTE(review): presumably the number of slots in use */
    } worterbuch;
    

    Which will make it easier to manage your array.

    Thirdly, you are not checking the return of fgets(). This can return NULL if unsuccessful. You should also check for buffer overflow here.

    Lastly, if there are a lot of words in your woerterbuch, it might not be efficient to use linear search to check for duplicates. This process is O(N) time on average. Instead, you can use binary search, which is O(logN) on average, therefore much more efficient if n becomes very large. Note that binary search only works if the array is kept sorted.

    Here is some code I wrote a while ago which does something similar:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /* Size of the input buffer; also used as the number of slots in the dictionary. */
    #define TEXTSIZE 1000
    
    /* The stored words together with the number of slots in use. */
    typedef struct {
        char *dictionary[TEXTSIZE];   /* heap copies of the words, allocated in read_words() */
        size_t numwords;              /* number of entries stored so far */
    } dictionary_t;
    
    /* Reads one line from stdin into text (TEXTSIZE bytes); exits on failure. */
    void read_text(char *text);
    /* Tokenises text in place and stores each new word in dict. */
    void read_words(char *text, dictionary_t *dict);
    /* Returns 0 if word is already in dict, 1 otherwise. */
    int search_word(dictionary_t *dict, char *word);
    /* Sorts, prints and frees the words in dict. */
    void print_words(dictionary_t *dict);
    /* qsort() comparison callback for an array of strings. */
    int str_cmp(const void *a, const void *b);
    
    int main(void) {
        dictionary_t dict;
        char text[TEXTSIZE];
    
        read_text(text);
    
        read_words(text, &dict);
    
        print_words(&dict);
    
        return 0;
    }
    
    /*
     * Reads one line from stdin into text, which must have room for TEXTSIZE
     * bytes, and strips the trailing newline.
     *
     * Exits with EXIT_FAILURE if nothing can be read, if the line does not fit
     * into the buffer, or if the line is empty. All diagnostics go to stderr.
     */
    void read_text(char *text) {
        size_t slen;

        printf("Please enter text: \n");
        if (fgets(text, TEXTSIZE, stdin) == NULL) {
            fprintf(stderr, "Error reading text\n");
            exit(EXIT_FAILURE);
        }

        /* removes '\n' character from fgets(), and checks for overflow */
        slen = strlen(text);
        if (slen > 0) {
            if (text[slen-1] == '\n') {
                text[slen-1] = '\0';
            } else if (!feof(stdin)) {
                /* No newline and not at end of input: fgets() stopped because
                   the buffer was full. A last line that merely lacks its
                   newline sets the end-of-file indicator and is accepted. */
                fprintf(stderr, "Buffer overflow detected.\n");
                exit(EXIT_FAILURE);
            }
        }

        if (!*text) {
            fprintf(stderr, "No text entered.\n");
            exit(EXIT_FAILURE);
        }
    }
    
    /*
     * Splits text in place at the delimiters " ,.!?;:" and stores a heap copy
     * of every word that is not yet in dict. dict->numwords is reset to 0
     * first, so the caller does not have to initialise dict.
     *
     * Exits with EXIT_FAILURE if dict has no free slot left or if memory
     * cannot be allocated. The stored copies are released in print_words().
     */
    void read_words(char *text, dictionary_t *dict) {
        char *word;
        const char *delim = " ,.!?;:";
        /* number of slots in the array, derived from the array itself */
        const size_t capacity = sizeof dict->dictionary / sizeof dict->dictionary[0];

        dict->numwords = 0;

        word = strtok(text, delim);
        while (word != NULL) {

            /* search_word() returns non-zero when the word is not stored yet */
            if (search_word(dict, word)) {

                /* never write past the last slot */
                if (dict->numwords >= capacity) {
                    fprintf(stderr, "Dictionary is full.\n");
                    exit(EXIT_FAILURE);
                }

                /* allocate space for the word and its terminating '\0' */
                dict->dictionary[dict->numwords] = malloc(strlen(word)+1);
                if (!dict->dictionary[dict->numwords]) {
                    fprintf(stderr, "Cannot allocate word.\n");
                    exit(EXIT_FAILURE);
                }

                /* copy it into array */
                strcpy(dict->dictionary[dict->numwords], word);

                /* increment count, ready for next word */
                dict->numwords++;
            }
            word = strtok(NULL, delim);
        }
    }
    
    /*
     * Linear scan of dict for word.
     * Returns 0 when word is already stored, 1 when it is not.
     */
    int search_word(dictionary_t *dict, char *word) {
        size_t pos = 0;

        while (pos < dict->numwords) {
            if (!strcmp(dict->dictionary[pos], word)) {
                return 0;
            }
            pos++;
        }
        return 1;
    }
    
    /*
     * qsort() comparison callback for an array of strings.
     * a and b point to array elements, i.e. to the string pointers themselves,
     * not to the characters. Returns a value <0, 0 or >0 exactly as strcmp()
     * does for the two strings.
     */
    int str_cmp(const void *a, const void *b) {
        /* keep the const on the element: the callback must not modify the array */
        const char *const *str1 = a;
        const char *const *str2 = b;

        return strcmp(*str1, *str2);
    }
    
    /*
     * Sorts the stored words with qsort()/str_cmp and prints them one per line
     * under a "Dictionary:" heading. Each word is freed right after it has been
     * printed and its slot is set to NULL, so the entries cannot be used again
     * afterwards.
     */
    void print_words(dictionary_t *dict) {
        size_t idx = 0;

        qsort(dict->dictionary, dict->numwords, sizeof dict->dictionary[0], str_cmp);

        printf("\nDictionary:\n");
        while (idx < dict->numwords) {
            char **slot = &dict->dictionary[idx];

            printf("%s\n", *slot);

            /* release the copy made with malloc() and clear the slot */
            free(*slot);
            *slot = NULL;
            idx++;
        }
    }