Search code examples
cregexcountscanfwords

C counting word program works if there is no space between symbols, why?


This is the code I'm using and it works perfectly unless:

  1. The text file starts with integers.
  2. If there is no space between symbols e.g [hi!] works perfectly but [hi !] prints �
  3. If I put multiple symbols together, e.g. [?????????], the number of words will be incorrect.

Just to clarify it is fine if there are integers within the code but not if they are at the start.

Why is that the case? I am really curious how to solve that issue.

EDIT: It was pointed out to me that fscanf does not support regex, why does it filter out the letters correctly then?

#include <stdio.h>
#include <stdlib.h>

/*
 * Attempts to count the words in "test2.txt" by alternating two fscanf
 * calls, echoing what was scanned along the way, and prints the total.
 * Returns 0 on success and 1 if the file cannot be opened or ferror()
 * reports a read error after the loop.
 *
 * NOTE(review): known defects, left untouched in this block:
 *  - The loop condition scans for NON-letters, so it only returns 1 (and the
 *    loop body only runs) when the next non-whitespace character is not a
 *    letter.
 *  - "%10[...]" may store 10 characters plus a terminating NUL, i.e. 11
 *    bytes, into the 10-byte array word; "%[A-Za-z]" has no width at all.
 *  - Each "%*c" reads and discards one extra character after the run.
 *  - Both print loops output all arraylen bytes of word, including whatever
 *    lies after the terminating NUL.
 */
int main(void) 
{
    FILE *fp;
    int r,n,i; /* r: last fscanf result, n: word counter, i: index used by the print loops */
     /* (n is reset to zero just before the scanning loop) */
    const char *filename = "test2.txt"; /* name of the file opened for reading */
    char word[10]; /* buffer that receives each scanned run of characters */

    /* if the file cannot be opened for reading,
       print an error message and exit with status 1 */
    if ((fp = fopen(filename, "r")) == NULL) {
        fprintf(stderr, "error: file" "\n");
        return 1;
    }  
    int arraylen = sizeof(word)/sizeof(word[0]); /* number of elements in word (10) */ 
    n = 0; /* reset the word counter */
    word[0] = '\0'; /* start with an empty string in word */
    /* "\n" skips leading whitespace, "%10[^A-Za-z]" stores up to 10 non-letter
       characters in word, "%*c" discards the next character; the loop runs
       for as long as a non-empty run of non-letters was stored */
   while ((r = fscanf(fp, "\n%10[^A-Za-z]%*c", word)) == 1) {
        printf("firstoutput\n");
        /* echo every byte of the buffer, not just the scanned string */
        for(i=0;i<arraylen;i++)

                printf("%c",word[i]);


    if((r = fscanf(fp, "\n%[A-Za-z]%*c", word)) == 0) { /* try to scan a run of letters;
        this branch runs only when no letter could be matched */

         printf("secondoutput\n");
        /* echo the buffer again, this time leaving out space characters */
        for(i=0;i<arraylen;i++)
            if(word[i] != (' '))
                printf("%c",word[i]);
        }

        /* count the iteration if word holds a non-empty string; word was
           filled by the first scan (non-letters) and possibly overwritten by
           the second (letters), so this does not test for a real word */
        if (word[0] != '\0')
            n++;

        word[0] = '\0'; /* reset word to the empty string for the next round */
    }  
    /* the loop ends as soon as the first fscanf returns anything other
       than 1: end of file, a read error, or a letter as the next
       non-whitespace character */


    if (ferror(fp) != 0) { /* distinguish a read error from plain end of input */
        /* report the error, close the file (it was opened successfully)
           and exit with status 1 */
        fprintf(stderr, "error: read file" "\n");
        fclose(fp);
        return 1;
    }

    if (n == 1) /* choose singular or plural wording for the result line */
        printf("\nthere is %d word" "\n", n);
    else
        printf("\nthere are %d words" "\n", n);

    fclose(fp); /* close the file */

    return 0; /* report success to the environment */
}

EDIT2: I'm posting a full solution which works 101%. I combined all the various tips and stuff I found and created this. I believe this solution is extremely compact and efficient; if I am wrong, do correct me!

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define WORD "A-Za-z"

/* Longest word that is stored; must match the "%99[" field width used below. */
enum { WORD_MAX_LEN = 99 };

/*
 * countWords - count the words and the distinct words in a stream.
 *
 * A word is a maximal run of the characters listed in WORD. Words are
 * compared case-insensitively, so "Hello" and "HELLO" are the same word.
 * Words longer than WORD_MAX_LEN characters are counted once and compared
 * by their first WORD_MAX_LEN characters.
 *
 * f    stream to read from, consumed up to end of input; a NULL stream is
 *      treated as empty.
 * now  receives the total number of words read (must not be NULL).
 *
 * Returns the number of distinct words, or -1 if memory for the table of
 * distinct words could not be allocated (*now then holds the number of
 * words counted up to that point).
 */
int countWords(FILE *f, int *now)
{
    char (*seen)[WORD_MAX_LEN + 1] = NULL; /* distinct words, lower-cased */
    size_t capacity = 0;                   /* rows allocated in seen */
    int unique = 0;                        /* rows of seen in use */
    char word[WORD_MAX_LEN + 1];

    *now = 0;
    if (f == NULL)
        return 0;

    for (;;) {
        /* Skip the separator run. Nothing is assigned, so the only
           possible results are 0 and EOF. */
        if (fscanf(f, "%*[^" WORD "]") == EOF)
            break;

        /* Read one word, bounded by the buffer size. */
        if (fscanf(f, "%99[" WORD "]", word) != 1)
            break;

        /* Drop the tail of an over-long word so it is not counted again.
           The result is irrelevant: end of input is detected by the next
           separator scan. */
        (void)fscanf(f, "%*[" WORD "]");

        (*now)++;

        /* Fold to lower case so that a plain strcmp compares
           case-insensitively. */
        for (char *p = word; *p != '\0'; p++)
            *p = (char)tolower((unsigned char)*p);

        int known = 0;
        for (int i = 0; i < unique; i++) {
            if (strcmp(seen[i], word) == 0) {
                known = 1;
                break;
            }
        }
        if (known)
            continue;

        /* Grow the table on demand instead of overflowing a fixed array. */
        if ((size_t)unique == capacity) {
            size_t newCapacity = capacity ? capacity * 2 : 64;
            void *tmp = realloc(seen, newCapacity * sizeof *seen);

            if (tmp == NULL) {
                free(seen);
                return -1;
            }
            seen = tmp;
            capacity = newCapacity;
        }

        memcpy(seen[unique], word, strlen(word) + 1);
        unique++;
    }

    free(seen);
    return unique;
}

/*
 * Opens "test2.txt", counts its words with countWords() and prints the
 * number of distinct words and the total number of words.
 * Returns 0 on success, EXIT_FAILURE if the file cannot be opened.
 */
int main(void){

   int wordCount = 0;
   FILE *rFile = fopen("test2.txt", "r");

   /* Without this check a missing file would pass NULL on to fscanf. */
   if (rFile == NULL) {
      perror("test2.txt");
      return EXIT_FAILURE;
   }

   int uniquewordCount = countWords(rFile, &wordCount);
   fclose(rFile);

   printf("Amount of unique words: %d\n", uniquewordCount);
   printf("Amount of words: %d\n", wordCount);
   return 0;
}

Solution

  • Your first scanning command is

    fscanf(fp, "\n%10[^A-Za-z]%*c", word)
    

    This will skip any white-space ("\n"), then read into a buffer of up to 10 chars that are not letters ("%10[^A-Za-z]") and finally read and discard the character after that ("%*c").

    You don't enter the while loop when the file begins with a letter, because fscanf will return 0, as it couldn't scan a non-empty string of non-letters. Since you also skip white space before that, the while loop is only entered when the first non-space character is not a letter, which is not very likely.

    Because you read an extra character at the end of the fscanf, your words will miss their first letter.

    By constraining the word to 10 letters, you read longer words in chunks, which should throw your word count off by counting longer words as two or three words. The char buffer word should also have at least 11 chars, because the terminating null character needs room as well.

    The proper way to print a string is to printf("%s", word) or to puts(word) it. Your loop is basically okay, but prints all ten letters even if the word might have fewer letters.

    I don't think that using fscanf is a good way to count words, but if you want to use it, you could do something like this:

    #include <stdio.h>
    #include <stdlib.h>
    
    #define WORD "A-Za-z0-9'"
    
    /*
     * Counts the words on standard input by alternately skipping a run of
     * non-word characters and a run of word characters (as defined by WORD)
     * and prints the total.
     */
    int main(void)
    {
        FILE *in = stdin;
        int count = 0;

        for (;;) {
            /* skip the separators; nothing is stored, so the result is 0 or EOF */
            if (fscanf(in, "%*[^" WORD "]") == EOF)
                break;

            /* skip the word itself */
            if (fscanf(in, "%*[" WORD "]") == EOF)
                break;

            count++;
        }

        printf("%d words\n", count);

        return 0;
    }
    

    Here, we fscanf for non-words and words alternately and end the loop when one of the fscanfs signals the end of the file. Note that we don't care about the actual contents, as we skip them by not converting anything with the * asterisk. That means that the only outcome of the fscanf can be 0 and EOF.

    The usual approach to count words is to read chars and detect when the "context" switches from non-words to words:

    #include <stdio.h>
    #include <stdlib.h>
    
    /*
     * Returns 1 if c can be part of a word (an ASCII letter, a decimal digit,
     * a hyphen or an apostrophe) and 0 otherwise.
     */
    int isword(int c)
    {
        switch (c) {
        case '-':
        case '\'':
            return 1;
        default:
            break;
        }

        return ('A' <= c && c <= 'Z')
            || ('a' <= c && c <= 'z')
            || ('0' <= c && c <= '9');
    }
    
    /*
     * Counts the words on standard input one character at a time: a word
     * starts wherever a word character follows a non-word character (or the
     * start of the input). Prints the total.
     */
    int main(void)
    {
        FILE *in = stdin;       // or use a real file, of course
        int inside = 0;         // 0: space context, 1: word context
        int total = 0;
        int ch;

        while ((ch = fgetc(in)) != EOF) {
            if (!isword(ch)) {
                inside = 0;
            } else {
                if (!inside)
                    total++;    // switched from space context to word context
                inside = 1;
            }
        }

        printf("%d words\n", total);

        return 0;
    }
    

    What counts as a word is defined here by the function isword. Note how there is no need to keep two fscanf formats in sync. A letter is part of a word or it is not. That's unambiguous with the else clause.