Search code examples
c struct fgets

C - How to count occurrences of each word in a file and remove duplicate words


My current assignment in university is to read in words from a file and count the occurrences of each word in that file, printing my results to the console/new file.

I have been successful in counting the word occurrences; however, I am having difficulty removing the duplicate entries from the output, so a word that appears more than once is printed once per occurrence.

E.g. I want to remove the additional occurrences of C, but keep its count.

C : 2
they : 2
are : 1
not : 1
they : 2
written : 1
in : 1
C : 2

Here is the code I have at the moment...

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Size in bytes of each line buffer passed to fgets (terminator included).
#define SIZE 256

// A word read from the input file paired with its occurrence count.
struct words
{
    char *word; // pointer to the word's NUL-terminated text
    unsigned int count; // number of lines in the file that equal word
};

/*
 * Count how often each line ("word") of single_words_test.txt occurs.
 *
 * For every line of the file, the file is opened a second time and scanned
 * from the start; the number of lines equal to the current one is printed as
 * "<word> : <count>".  A word that occurs k times is therefore printed k
 * times, once for each of its occurrences.
 *
 * Returns the number of lines read from the file, or -1 if the file cannot
 * be opened or memory cannot be allocated.
 */
int count_words(void)
{
    int total_words = -1; // stays -1 unless the whole file was processed
    int lines_read = 0;
    char *key_word = NULL;
    char *current_word = NULL;
    struct words *w = NULL;
    FILE *word_list = fopen("single_words_test.txt", "r");

    if (word_list == NULL)
    {
        perror("single_words_test.txt");
        return -1;
    }

    // Both line buffers and the single words record are reused for every line
    key_word = malloc(SIZE);
    current_word = malloc(SIZE);
    w = malloc(sizeof *w);
    if (key_word == NULL || current_word == NULL || w == NULL)
    {
        perror("malloc");
        goto cleanup;
    }

    // Read in words from file line by line
    while (fgets(key_word, SIZE, word_list) != NULL)
    {
        FILE *word_list2;

        // Cut the line at its line ending, if it has one. Unlike writing to
        // strlen - 1, this leaves a final line without a newline intact and
        // never indexes before the start of the buffer.
        key_word[strcspn(key_word, "\r\n")] = '\0';
        w->word = key_word;
        w->count = 0;

        // Second, independent pass over the same file to count matches
        word_list2 = fopen("single_words_test.txt", "r");
        if (word_list2 == NULL)
        {
            perror("single_words_test.txt");
            goto cleanup;
        }

        while (fgets(current_word, SIZE, word_list2) != NULL)
        {
            current_word[strcspn(current_word, "\r\n")] = '\0';
            // If the current word matches the key word, increase its count
            if (strcmp(w->word, current_word) == 0)
            {
                w->count++;
            }
        }
        fclose(word_list2);

        lines_read++;

        // count is unsigned, so it is printed with %u
        printf("%s : %u\n", w->word, w->count);
    }

    total_words = lines_read;

cleanup:
    free(w);
    free(current_word);
    free(key_word);
    fclose(word_list);

    return total_words;
}

/*
 * Program entry point: runs the word count and prints the value it returns,
 * padded by blank lines. The command-line arguments are not used.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const int total = count_words();

    printf("\n\n\n%d\n\n\n", total);
    return 0;
}

I know the code is messy, but I have been stuck on this for some time and I am unsure of how to implement the removal of duplicates in my current solution.

Also, I know this could be done by creating a linked-list but I want to avoid that solution and keep it similar to the current solution.

Thank you and sorry for the ambiguity of the question

EDIT: This isn't a code request. I would just like some general guidance as to what I could use


Solution

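  • Check whether current_word matches at a position earlier in the file than the key_word you are currently counting. If it does, that word was already counted and printed when its first occurrence was the key_word, so it can be skipped. You can compare the two positions by calling ftell(...) on each file pointer.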

    Here's an example:

    /*
     * Count how often each line ("word") of single_words_test.txt occurs and
     * print "<word> : <count>" once per distinct word.
     *
     * For every line of the file, the file is opened a second time and
     * scanned from the start. If a matching line is found at a file position
     * before the current line, the word was already reported when its first
     * occurrence was processed, so nothing is printed for it this time.
     *
     * Returns the number of lines read (duplicates included), or -1 if the
     * file cannot be opened, a file position cannot be obtained, or memory
     * cannot be allocated.
     */
    int count_words(void)
    {
        int total_words = -1; // stays -1 unless the whole file was processed
        int lines_read = 0;
        char *key_word = NULL;
        char *current_word = NULL;
        struct words *w = NULL;
        FILE *word_list2 = NULL;
        FILE *word_list = fopen("single_words_test.txt", "r");

        if (word_list == NULL)
        {
            perror("single_words_test.txt");
            return -1;
        }

        // Both line buffers and the single words record are reused for every line
        key_word = malloc(SIZE);
        current_word = malloc(SIZE);
        w = malloc(sizeof *w);
        if (key_word == NULL || current_word == NULL || w == NULL)
        {
            perror("malloc");
            goto cleanup;
        }

        // Read in words from file line by line
        while (fgets(key_word, SIZE, word_list) != NULL)
        {
            long word_list_pos;
            int seen_earlier = 0; // set when an earlier occurrence is found

            // Cut the line at its line ending, if it has one. Unlike writing
            // to strlen - 1, this leaves a final line without a newline
            // intact and never indexes before the start of the buffer.
            key_word[strcspn(key_word, "\r\n")] = '\0';
            w->word = key_word;
            w->count = 0;

            // Position just after the key word; earlier matches end before it
            word_list_pos = ftell(word_list);
            if (word_list_pos < 0)
            {
                perror("ftell");
                goto cleanup;
            }

            // Second, independent pass over the same file
            word_list2 = fopen("single_words_test.txt", "r");
            if (word_list2 == NULL)
            {
                perror("single_words_test.txt");
                goto cleanup;
            }

            while (fgets(current_word, SIZE, word_list2) != NULL)
            {
                long word_list2_pos;

                current_word[strcspn(current_word, "\r\n")] = '\0';
                if (strcmp(w->word, current_word) != 0)
                {
                    continue;
                }

                word_list2_pos = ftell(word_list2);
                if (word_list2_pos < 0)
                {
                    perror("ftell");
                    goto cleanup;
                }

                // A match that ends before the key word is an earlier
                // occurrence: this word has already been reported.
                if (word_list2_pos < word_list_pos)
                {
                    seen_earlier = 1;
                    break;
                }

                w->count++;
            }
            fclose(word_list2);
            word_list2 = NULL;

            lines_read++;

            // count is unsigned, so it is printed with %u
            if (!seen_earlier)
            {
                printf("%s : %u\n", w->word, w->count);
            }
        }

        total_words = lines_read;

    cleanup:
        if (word_list2 != NULL)
        {
            fclose(word_list2);
        }
        free(w);
        free(current_word);
        free(key_word);
        fclose(word_list);

        return total_words;
    }