Search code examples
Tags: c, file, file-io, io

Read from a text file and parse lines into words in C


I'm a beginner in C and systems programming. For a homework assignment, I need to write a program that reads input from stdin, parses lines into words, and sends the words to sort sub-processes using System V message queues (e.g., to count words). I got stuck at the input part. I'm trying to process the input, remove non-alpha characters, convert all alpha words to lower case and, lastly, split a line of words into individual words. So far I can print all alpha words in lower case, but there are empty lines between some of the words, which I believe isn't correct. Can someone take a look and give me some suggestions?

Example from a text file: The Project Gutenberg EBook of The Iliad of Homer, by Homer

I think the correct output should be:

the
project
gutenberg
ebook
of
the
iliad
of
homer
by
homer

But my output is the following:

project
gutenberg
ebook
of
the
iliad
of
homer
                         <------ There is an empty line here
by
homer

I think the empty line is caused by the space between "," and "by". I tried things like "if isspace(c) then do nothing", but it doesn't work. My code is below. Any help or suggestion is appreciated.

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>


/*
 * Main function.
 *
 * Opens the file named by argv[1] and copies it to stdout one character at
 * a time: alphabetic characters are written in lower case, whitespace
 * characters are skipped, and every other character is written as a
 * newline. One extra newline is written after the whole file has been read.
 *
 * Returns 0 on success, or EXIT_FAILURE when no file name was given or the
 * file could not be opened.
 */
int main (int argc, char **argv)
{
    int c;
    FILE *input_file;

    /* Without this guard argv[1] is NULL and fopen(NULL, ...) is undefined. */
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s <input-file>\n",
                argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    input_file = fopen(argv[1], "r");

    if (input_file == NULL)
    {
        /* perror appends ": <reason>\n" itself, so the message carries no
           newline of its own. */
        perror("Cannot open input file");
        return EXIT_FAILURE;
    }

    /* fgetc returns an unsigned char value or EOF, so c can be handed to the
       <ctype.h> functions directly. */
    while ((c = fgetc(input_file)) != EOF)
    {
        if (isalpha(c))
        {
            /* alphabetic: emit in lower case */
            putchar(tolower(c));
        }
        else if (isspace(c))
        {
            ;   /* whitespace: do nothing */
        }
        else
        {
            /* any other character is replaced by a newline */
            putchar('\n');
        }
    }

    fclose(input_file);

    printf("\n");

    return 0;
}

EDIT **

I edited my code and finally got the correct output:

/*
 * Opens the file named by argv[1] and writes every run of alphabetic
 * characters to stdout as one lower-case word per line. All non-alphabetic
 * characters act as separators and are never echoed, so consecutive
 * separators do not produce empty lines.
 *
 * Returns 0 on success, or EXIT_FAILURE when no file name was given or the
 * file could not be opened.
 */
int main (int argc, char **argv)
{
    int c;
    int found_word = 0;   /* non-zero while the current word is still open */
    FILE *input_file;

    /* Without this guard argv[1] is NULL and fopen(NULL, ...) is undefined. */
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s <input-file>\n",
                argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    input_file = fopen(argv[1], "r");

    if (input_file == NULL)
    {
        /* perror appends ": <reason>\n" itself, so the message carries no
           newline of its own. */
        perror("Cannot open input file");
        return EXIT_FAILURE;
    }

    /* fgetc returns an unsigned char value or EOF, so c can be handed to the
       <ctype.h> functions directly. */
    while ((c = fgetc(input_file)) != EOF)
    {
        if (isalpha(c))
        {
            found_word = 1;
            putchar(tolower(c));
        }
        else if (found_word)
        {
            /* first separator after a word: terminate the line once */
            putchar('\n');
            found_word = 0;
        }
    }

    fclose(input_file);

    /* Terminate the last word only if the input ended in the middle of one;
       an unconditional newline here would print a spurious empty line for
       any input that ends with a separator (e.g. a trailing '\n'). */
    if (found_word)
    {
        putchar('\n');
    }

    return 0;
}

Solution

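  • I think you just need to treat every non-alpha character (`!isalpha(c)`) as a separator and convert everything else to lower case. To avoid printing empty lines, keep track of whether you are currently inside a word, and print a newline only when a non-alpha character ends one.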

    /* Non-zero while the characters being read belong to a word. */
    int in_word = 0;

    while ((c = fgetc(input_file)) != EOF)
    {
        if (isalpha(c))
        {
            /* Letters are echoed in lower case and open (or continue) a word. */
            putchar(tolower(c));
            in_word = 1;
        }
        else if (in_word)
        {
            /* The first non-letter after a word ends the line; further
               non-letters are ignored until the next word starts. */
            putchar('\n');
            in_word = 0;
        }
    }
    

    If you need to handle apostrophes within words, such as "isn't", then this should do it:

    int in_word = 0;            /* non-zero while inside a word */
    int pending_apostrophe = 0; /* an apostrophe was seen but not yet printed */

    while ((c = fgetc(input_file)) != EOF)
    {
        if (isalpha(c))
        {
            /* A held-back apostrophe is printed only once a letter follows
               it, i.e. when it turned out to sit inside a word. */
            if (pending_apostrophe)
            {
                putchar('\'');
                pending_apostrophe = 0;
            }
            in_word = 1;
            putchar(tolower(c));
            continue;
        }

        /* Non-letters outside a word are ignored. */
        if (!in_word)
        {
            continue;
        }

        /* The first apostrophe after a letter is held back instead of
           ending the word. */
        if (c == '\'' && !pending_apostrophe)
        {
            pending_apostrophe = 1;
            continue;
        }

        /* Any other non-letter (or a second apostrophe) ends the word and
           discards the held-back apostrophe. */
        pending_apostrophe = 0;
        putchar('\n');
        in_word = 0;
    }