Search code examples
cmemorytreegdbheap-memory

Heap block warning during the words counting of a program


I think there are some problems related to the memory and the heap corruption that don't allow my program to run properly (mainly because of some bug inside of it). The program just stops running, or crashes after its quit.

I'm trying to learn how trees work and for my case I have to write a cross-referencer that reads all the words in a document (in my case, the input line), and for each word, a list of the line numbers on which it occurs. For example:

foo
bar bar
foo bar

Should produce as output:

2 foo: [1, 3]
2 bar: [2, 3]

where the numbers inside the [] are the lines where our words are found.

There are 2 main issues with my code:

  • it only prints 1 inside the brackets, as if the program never checks the newline
  • if I try to run more than 10 lines of input it crashes. Without gdb it allows me to output all the lines I want, and won't crash until it reaches the 10 lines:
t
t
t
t
t
quit
   5 t: [1, 1, 1, 1, 1]

When I run it with gdb, instead, it gives me this:

(gdb) r
Starting program: C:\...\6.exe
[New Thread 15276.0x14fc]
t
t
t
warning: HEAP[6.exe]:
warning: Heap block at 000001E191B97CA0 modified at 000001E191B97CB6 past requested size of 6

Thread 1 received signal SIGTRAP, Trace/breakpoint trap.
0x00007ff981f969ff in ntdll!RtlRegisterSecureMemoryCacheCallback () from C:\WINDOWS\SYSTEM32\ntdll.dl
(gdb) bt
#0  0x00007ff981f969ff in ntdll!RtlRegisterSecureMemoryCacheCallback ()
   from C:\WINDOWS\SYSTEM32\ntdll.dll
#1  0x00007ff981f9288a in ntdll!RtlZeroHeap () from C:\WINDOWS\SYSTEM32\ntdll.dll
#2  0x00007ff981f61357 in ntdll!EtwLogTraceEvent () from C:\WINDOWS\SYSTEM32\ntdll.dll
#3  0x00007ff981f95839 in ntdll!RtlRegisterSecureMemoryCacheCallback ()
   from C:\WINDOWS\SYSTEM32\ntdll.dll
#4  0x00007ff981f4de29 in ntdll!EtwLogTraceEvent () from C:\WINDOWS\SYSTEM32\ntdll.dll
#5  0x00007ff981ed24b7 in ntdll!RtlReAllocateHeap () from C:\WINDOWS\SYSTEM32\ntdll.dll
#6  0x00007ff981ed237a in ntdll!RtlReAllocateHeap () from C:\WINDOWS\SYSTEM32\ntdll.dll
#7  0x00007ff97fb71a89 in ucrtbase!_realloc_base () from C:\WINDOWS\System32\ucrtbase.dll
#8  0x00007ff71ff81bbe in addtree ()
#9  0x00007ff71ff81a4e in main ()

I didn't even type quit (the word to break the loop) and it just stopped by giving me this warning.

I don't know how to fix this, because probably I am forgetting to free something (there is some heap allocation), but I have no idea on where the problem may be.

This is the code:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFSIZE     100
#define MAXWORD     100

#define IS_NOISE_WORD(word) \
    (strcmp(word, "a") == 0 || \
    strcmp(word, "an") == 0 || \
    strcmp(word, "the") == 0 || \
    strcmp(word, "and") == 0 || \
    strcmp(word, "or") == 0 || \
    strcmp(word, "in") == 0 || \
    strcmp(word, "of") == 0 || \
    strcmp(word, "to") == 0 || \
    strcmp(word, "is") == 0 || \
    strcmp(word, "are") == 0 || \
    strcmp(word, "was") == 0 || \
    strcmp(word, "were") == 0 || \
    strcmp(word, "be") == 0 || \
    strcmp(word, "been") == 0 || \
    strcmp(word, "being") == 0 || \
    strcmp(word, "have") == 0 || \
    strcmp(word, "has") == 0 || \
    strcmp(word, "had") == 0 || \
    strcmp(word, "having") == 0)
    /* etc. */

#define IS_NOT_NOISE_WORD(word) (!IS_NOISE_WORD(word))

/* the tree node */
struct tnode {
    char *word;             /* points to the text */
    int count;              /* number of occurrences */
    int *lines;             /* lines where the word occurs */
    struct tnode *left;     /* left child */
    struct tnode *right;    /* right child */
};

char buf[BUFSIZE];          /* buffer for ungetch */
int bufp = 0;               /* next free position in buf */

/* char *strdup(char *); */

int getword(char *, int);

struct tnode *addtree(struct tnode *, char *, int);
void tfree(struct tnode *);
void treeprint(struct tnode *);

/* word frequency count */
int main(int argc, char *argv[])
{
    struct tnode *root = NULL;
    char word[MAXWORD];
    int n = 1;  /* number of lines */

    while (getword(word, MAXWORD) != EOF)
    {   
        if (word[0] == '\n')
            n++;
        
        /* if there is a word and it's not a noise */
        if (isalpha(word[0]) && IS_NOT_NOISE_WORD(word) && strcmp(word, "quit") != 0 && strcmp(word, "exit") != 0)
            root = addtree(root, word, n);

        if (!strcmp(word, "quit") || !strcmp(word, "exit"))
            break;
    }

    treeprint(root);
    tfree(root);

    return 0;
}

/* addtree: add a node with the word w at line l, at or below p */
struct tnode *addtree(struct tnode *p, char *w, int l)
{
    int cond;

    /* a new word has arrived */
    if (p == NULL)
    {
        /* make a new node */
        p = malloc(sizeof(struct tnode));
        p->word = strdup(w);
        p->count = 1;
        p->lines = calloc(p->count + 1, sizeof(int));
        p->lines[p->count - 1] = l;
        p->left = p->right = NULL;
    }
    else {
        cond = strcmp(w, p->word);

        if (cond == 0) {
            /* repeated word */
            p->count++;
            p->lines = realloc(p->lines, p->count + 1 * sizeof(int));
            p->lines[p->count - 1] = l;
        }
        else if (cond < 0) {
            /* less than into left subtree */
            p->left = addtree(p->left, w, l);
        }
        else {
            /* greater than into right subtree */
            p->right = addtree(p->right, w, l);
        }
    }

    return p;
}

/* tfree: free a tnode */
void tfree(struct tnode *p)
{
    if (p == NULL)
        return;

    tfree(p->left);
    tfree(p->right);
    free(p);

    if (p->word != NULL) {
        free(p->word);
        p->word = NULL;
    }

    if (p->lines != NULL) {
        free(p->lines);
        p->lines = NULL;
    }
}

/* treeprint: in-order print of tree p */
void treeprint(struct tnode *p)
{
    int i;

    if (p != NULL) {
        treeprint(p->left);
        printf("%4d %s: [%d", p->count, p->word, p->lines[0]);

        for (i = 1; i < p->count; i++)
            printf(", %d", p->lines[i]);

        printf("]\n");
        treeprint(p->right);
    }
}

/* getword: get next word or character from input */
int getword(char *word, int lim)
{
    char *w = word;
    int c, getch(void);
    void ungetch(int);

    int in_comment = 0;     /* 1 if inside a comment */
    int in_pp_line = 0;     /* 1 if inside a preprocessor line */
    int in_string = 0;      /* 1 if inside a string */

    /* skip spaces */
    while (isspace(c = getch()))
        ;

    if (c != EOF)
        *w++ = c;

    /* not underscore, pp line, comment, string */
    if (!isalpha(c) && c != '_' && c != '\"' && c != '#' && c != '/') {
        *w = '\0';
        return c;
    }

    if (c == '\"')
        in_string = 1;

    if (c == '#')
        in_pp_line = 1;

    /* it only checks single line comments for now */
    if (c == '/') {
        if ((c = getch()) == '/')
            in_comment = 1;
        else
            ungetch(c);
    }

    while (--lim > 0)
    {
        c = getch();

        if (in_comment && (c == '\n'))
            in_comment = 0;

        if (in_pp_line && (c == '\n'))
            in_pp_line = 0;

        /* if the char is in a string or in a comment or in a pp line, and is not alphanumeric */
        if (!isalnum(c) && c != '_' && (in_string == 1 || c != '\"') && !in_pp_line && !in_comment)
        {
            ungetch(c);
            break;
        }

        if (c == '/' && *(w - 1) == '/')
            in_comment = 1;

        if (c == '\"')
            in_string = (in_string == 1) ? 0 : 1;

        *w++ = c;
    }

    *w = '\0';

    return word[0];
}

/* get a (possibly pushed-back) character */
int getch(void) {
    return (bufp > 0) ? buf[--bufp] : getchar();
}

/* push character back on input */
void ungetch(int c) {
    if (bufp >= BUFSIZE)
        printf("ungetch: too many characters\n");
    else
        buf[bufp++] = c;
}

Besides the crash issues I don't get why the n count doesn't increase. Is the getword function not returning '\n' at all?


Solution

    1. tfree(): It's undefined behavior to deference a pointer after it's freed. Also, there is no point of setting p->word and p->lines to NULL when you free(p).
    void tfree(struct tnode *p) {
        if (!p)
            return;
        tfree(p->left);
        tfree(p->right);
        if (p->word)
            free(p->word);
        if (p->lines)
            free(p->lines);
        free(p);
    }
    
    1. addtree(): * has higher precedence than +. It should be:
                p->lines = realloc(p->lines, (p->count + 1) * sizeof(int));
    

    but as you increment p->count the line before you just want:

                p->lines = realloc(p->lines, p->count * sizeof(int));
    

    There is a similar logic error in the call to calloc().

    valgrind is happy after I fix these these two issues, and I cannot reproduce the crash with 10 lines of input.

    1. getword(): line numbers don't advance as you skip the \n from the last line with:
        while (isspace(c = getch()))
    

    yet caller expect word[0] to be a '\n' to advance the line number n. Here is a minimal fix:

        do {
            c = getch();
        } while(c != '\n' && isspace(c));
    

    and the output is now:

       3 bar: [2, 2, 3]
       2 foo: [1, 3]
    

    That said I suggest you have caller read a line with fgets() then split that line into words with a revised version of getwork().

    1. (not fixed) getword(): It's problematic that you 3 separate calls to getch() unless you really want to handle input differently in each case.

    2. addtree(): You currently record duplicate lines for a given word but you want subsequent duplicates lines to be no-op it seems. Also, might as well just use malloc() instead of calloc() as you explicitly set p->lines right after. Prefer using variable instead of type to sizeof().

    struct tnode *addtree(struct tnode *p, char *w, int l) {
        if (!p) {
            /* make a new node */
            p = malloc(sizeof *p);
            p->word = strdup(w);
            p->count = 1;
            p->lines = malloc(sizeof *p->lines);
            p->lines[p->count - 1] = l;
            p->left = NULL;
            p->right = NULL;
            return p;
        }
        int cond = strcmp(w, p->word);
        if(cond < 0)
            p->left = addtree(p->left, w, l);
        else if(!cond && l != p->lines[p->count - 1]) {
            p->count++;
            p->lines = realloc(p->lines, p->count * sizeof(int));
            p->lines[p->count - 1] = l;
        } else if(cond > 0)
            p->right = addtree(p->right, w, l);
        return p;
    }
    

    and the output is now:

       2 bar: [2, 3]
       2 foo: [1, 3]
    
    1. (not fixed) p = realloc(p, ...) leaks p if realloc() fails. It should be:
        int *tmp = realloc(p->lines, (p->count + 1) * sizeof(int));
        if(!tmp) {
            // handle error
            return NULL;
        }
        p->lines = tmp;
    
    1. (not fixed) malloc(), calloc(), strdup() may fail and return NULL. You want to check for that:
            p = malloc(sizeof(struct tnode));
            if(!p) {
                // handle error
                return NULL;
            }