Programming Pearls: Frequency of all words in a file

I'm trying to write a code to count the frequency of all words in a file using hash table(referred from Programming Pearls). I used the logic from that book. But I'm getting some errors like this

[Error] request for member 'next' in something not a structure or union
[Error] request for member 'word' in something not a structure or union
[Warning] conflicting types for 'incword'

here's the code I've written

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

int
main(void)
{
    FILE *fptr;
    int i, buf_size = 512, *p;
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    // while scanf("%s", buf) != EOF{
    memset(buffer, ' ', buf_size * 2);
    while (feof(fptr) == 0) {
        memmove(&buffer[0], &buffer[buf_size - 1], buf_size);
        fread(&buffer[buf_size - 1], 1, buf_size, fptr);
        incword(buffer);
    }

    for (i = 0; i < NHASH; i++) {
        for (p = bin[i]; p != NULL; p = p->next) {
            printf("%s%d", p->word, p->count);
        }
        return 0;
    }
}

void
incword(char *s)
{
    int *p;

    h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next) {
        if (strcmp(s, p->word) == 0 {
            (p->count)++;
            return;
        }
    }

    p = malloc(sizeof(hashnode));
    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

This is s confusing to me. I don't get the mistakes I've made. Can you please help me sort it out? This is the contents of the file that I'm using(for now, just for testing).

Once upon a time, there was a good man. 

He was a really good man!

Solution

I've cleaned up your compile errors and annotated the source explaining the errors.

I've bracketed your/old code under #if 0 and new/my code under #if 1.

The reason you got a complaint about incword is that it was defined after main, so it either needed a "forward declaration" or needed to be physically moved above main [which is what I did below].

The "request for member ..." messages were because p was defined as int *p instead of nodeptr p

This is just syntax cleanup and not a debug of your program logic:

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
// NOTE/BUG: this needs to be a pointer to a node, not merely an int
#if 0
    int *p;
#else
    nodeptr p;
#endif

// NOTE/BUG: 'h' was not defined
#if 0
    h = hash(s);
#else
    unsigned int h = hash(s);
#endif

    for (p = bin[h]; p != NULL; p = p->next) {
// NOTE/BUG: this had a missing closing paren
#if 0
        if (strcmp(s, p->word) == 0 {
#else
        if (strcmp(s, p->word) == 0) {
#endif
            (p->count)++;
            return;
        }
    }

// NOTE/BUG: the type name for a node is 'node' [and _not_ hashnode]
#if 0
    p = malloc(sizeof(hashnode));
#else
    p = malloc(sizeof(node));
#endif

    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int
main(void)
{
    FILE *fptr;
// NOTE/BUG: p needs to be a pointer to a node
#if 0
    int i, buf_size = 512, *p;
#else
    int i, buf_size = 512;
    nodeptr p;
#endif
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    // while scanf("%s", buf) != EOF{
    memset(buffer, ' ', buf_size * 2);
    while (feof(fptr) == 0) {
        memmove(&buffer[0], &buffer[buf_size - 1], buf_size);
        fread(&buffer[buf_size - 1], 1, buf_size, fptr);
        incword(buffer);
    }

    for (i = 0; i < NHASH; i++) {
        for (p = bin[i]; p != NULL; p = p->next) {
            printf("%s%d", p->word, p->count);
        }
#if 0
        return 0;
#endif
    }

#if 1
    return 0;
#endif
}

UPDATE:

Okay, as I mentioned in my comment below, using fread won't tokenize the input lines into words.

So, I've rewritten the input loop to use fgets and strtok. I've generated a small representative input file and tested it.

The good news is that your hash logic seems to work unchanged. Considering the number of syntax errors that were masking this fact, it makes it all the more impressive--good job!

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
// NOTE/BUG: this needs to be a pointer to a node, not merely an int
#if 0
    int *p;
#else
    nodeptr p;
#endif

// NOTE/BUG: 'h' was not defined
#if 0
    h = hash(s);
#else
    unsigned int h = hash(s);
#endif

    for (p = bin[h]; p != NULL; p = p->next) {
// NOTE/BUG: this had a missing closing paren
#if 0
        if (strcmp(s, p->word) == 0 {
#else
        if (strcmp(s, p->word) == 0) {
#endif
            (p->count)++;
            return;
        }
    }

// NOTE/BUG: the type name for a node is 'node' [and _not_ hashnode]
#if 0
    p = malloc(sizeof(hashnode));
#else
    p = malloc(sizeof(node));
#endif

    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int
main(void)
{
    FILE *fptr;
// NOTE/BUG: p needs to be a pointer to a node
#if 0
    int i, buf_size = 512, *p;
#else
    int i, buf_size = 512;
    nodeptr p;
#endif
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    // while scanf("%s", buf) != EOF{
// NOTE/BUG: this won't read text words too well
#if 0
    memset(buffer, ' ', buf_size * 2);
    while (feof(fptr) == 0) {
        memmove(&buffer[0], &buffer[buf_size - 1], buf_size);
        fread(&buffer[buf_size - 1], 1, buf_size, fptr);
        incword(buffer);
    }
#else
    while (1) {
        // get next line of file
        if (fgets(buffer,sizeof(buffer),fptr) == NULL)
            break;

        char *bp = buffer;

        // split up line into words [separated by whitespace]
        while (1) {
            char *cp = strtok(bp," \t\n");
            bp = NULL;

            if (cp == NULL)
                break;

            // add an individual word
            incword(cp);
        }
    }
#endif

    for (i = 0; i < NHASH; i++) {
        int valid = 0;
        for (p = bin[i]; p != NULL; p = p->next) {
            valid += 1;
// NOTE/BUG: this needs spacing
#if 0
            printf("%s%d", p->word, p->count);
#else
            printf("%s %d\n", p->word, p->count);
#endif
        }
        if (valid > 1)
            printf("\n");
#if 0
        return 0;
#endif
    }

#if 1
    return 0;
#endif
}

UPDATE #2:

Just for cleanliness, here's a version without the #if 0 and bug annotations:

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
    nodeptr p;

    unsigned int h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next) {
        if (strcmp(s, p->word) == 0) {
            (p->count)++;
            return;
        }
    }

    p = malloc(sizeof(node));

    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int
main(void)
{
    FILE *fptr;
    int i, buf_size = 512;
    nodeptr p;
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    while (1) {
        // get next line of file
        if (fgets(buffer,sizeof(buffer),fptr) == NULL)
            break;

        char *bp = buffer;

        // split up line into words [separated by whitespace]
        while (1) {
            char *cp = strtok(bp," \t\n");
            bp = NULL;

            if (cp == NULL)
                break;

            // add an individual word
            incword(cp);
        }
    }

    for (i = 0; i < NHASH; i++) {
        int valid = 0;
        for (p = bin[i]; p != NULL; p = p->next) {
            valid += 1;
            printf("%s %d\n", p->word, p->count);
        }
        if (valid > 1)
            printf("\n");
    }

    return 0;
}

UPDATE #3:

And also when fgets() is used to store a string into the buffer, words at the end of buffer might get split across different buffers, that's why I preferred the previous logic over this

Since fgets reads a single line of text up to and including the newline, the only way to chop a word is if the buffer were shorter than the maximum expected line length.

The usual/simple solution is use a buffer size that is guaranteed to be large enough to contain the line. A buffer length of 1024 is usually large enough, but one could use a length of 10000 [or 100000].

The previous logic had a number of issues:

It did not break up the data into words.
It [still] suffered from the same issue of the buffer not being large enough.
Because it used a fixed read length of 512, it was much more likely to chop off a word in the middle.
It just read 512 bytes into the back half of the buffer. On the first iteration, the front half would have spaces.
It had UB [undefined behavior] because it did not guarantee that the buffer had an EOS character in it (i.e. ensuring the argument to incword was a valid/single string), so the strcmp [and/or strlen] could run past the end.

The fastest method for processing a text file is an advanced technique using mmap. See my answers:

Thank you so much. I have a doubt. When the string is made in into tokens, the special characters are also included with the word. So in my input file, "man!" counted as being different from "man" and "man."

The simple solution is to add more delimiters to the strtok call (e.g.):

cp = strtok(bp," \t\n.!,;?:");

But, it may be easier to scan the file a single character at a time [using fgetc], retain just the alphabetic chars in a "word" buffer, and calling incword when a non-alpha character is encountered [using isalpha] to distinguish them.

Also, for problems like this one, it is [more] usual to ignore capitalization and treat She and she as the same word. So, adding tolower to the mix might be helpful.

This does not handle words like don't as it would treat them as two separate words: don and t

So, we should consider a ' to be an alphabetic character, so we'd want something like: isalpha(c) || (c == '\'')

This is okay except we'd mess up on a quoted string:

I don't use phrases such as 'quick brown fox jumps over the lazy dog' often.

And, how do we treat possessives? Should [the plural] workers be treated differently than the possessive form workers' [as in workers' club]?

Here's an updated sample input:

Once upon a time, there was a good man.

He was a really good man!

Poetically, a really good man was he.

Man, I'm tired of using the word 'man' everywhere.

I don't use phrases such as 'quick brown fox jumps over the lazy dog' often.

I prefer a brown dog and a lazy fox even though it cuts me to the quick to say
that.

A women's club is not quite the same as a woman's club if the woman possesses
a club.

A worker may belong to a union, which is a form of workers' club. Many workers
may belong to the same union.

Here's some updated code that handles most of that [but doesn't differentiate the plural possessive]:

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#ifndef NHASH
#define NHASH 29989
#endif
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

FILE *fptr;
int eof;                            // got eof (i.e previous word was last)
int peek;                           // next character in stream
char buffer[1000];                  // word buffer

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
    nodeptr p;

    unsigned int h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next) {
        if (strcmp(s, p->word) == 0) {
            (p->count)++;
            return;
        }
    }

    p = malloc(sizeof(node));

    p->count = 1;
    p->word = strdup(s);
    p->next = bin[h];

    bin[h] = p;
}

char *
getword(void)
{
    int gotone = 0;
    int chr;
    int alfa;
    char *bp = buffer;

    while (! eof) {
        if (peek) {
            chr = peek;
            peek = 0;
        }
        else
            chr = fgetc(fptr);

        // handle eof
        if (chr == EOF) {
            eof = 1;
            break;
        }

        // is this an alphabetic char?
        alfa = isalpha(chr);

        // is this a single quote -- it can be:
        // (1) the start of a quoted string (it's a delimiter)
        // (2) or part of a contraction (it's quasi-alplabetic)
        // (3) the end of a quoted string (it's a delimiter)
        if (chr == '\'') {
            if (! gotone)
                continue;
            peek = fgetc(fptr);
            alfa = isalpha(peek);
        }

        // non-alpha char (i.e. a word delimiter)
        // if no chars have been stored in the word buffer, this is leading
        // whitespace [and we ignore it]
        // otherwise, it's trailing whitespace and the buffer is now a fully
        // formed word
        if (! alfa) {
            if (gotone)
                break;
            continue;
        }

        // unify upper/lower case
        chr = tolower(chr);

        // store the character in the word buffer
        *bp++ = chr;

        // remember that we have at least one valid character in the buffer
        gotone = 1;
    }

    // close off the string
    *bp = 0;

    // set up caller's return
    if (gotone)
        bp = buffer;
    else
        bp = NULL;

    return bp;
}

int
main(int argc,char **argv)
{
    int i;
    nodeptr p;
    char name[100];

    --argc;
    ++argv;

    for (i = 0; i < NHASH; i++)
        bin[i] = NULL;

    if (argc < 1) {
        ++argc;
        --argv;

        printf("\nEnter the file name:");
        fflush(stdout);

        scanf("%s", name);

        argv[0] = name;
    }

    for (;  argc > 0;  --argc, ++argv) {
        fptr = fopen(*argv, "r");
        if (fptr == NULL) {
            printf("\nProblem with opening the file");
            exit(1);
        }

        eof = 0;
        peek = 0;

        while (1) {
            char *cp = getword();
            if (cp == NULL)
                break;
            incword(cp);
        }

        fclose(fptr);
    }

    for (i = 0; i < NHASH; i++) {
        int valid = 0;
        for (p = bin[i]; p != NULL; p = p->next) {
            valid += 1;
            printf("%s %d\n", p->word, p->count);
        }
        if (valid > 1)
            printf("\n");
    }

    return 0;
}