I am trying to read a file and store every word into a dynamically allocated 2D array. The size of the input file is unknown.
I am totally lost and don't know how I could "fix/finish" the program.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(void) {
char filename[25];
printf("Input the filename");
scanf("%s", filename);
fileConverter(filename);
}
int fileConverter(char filename[25]) {
//int maxLines = 50000;
//int maxWordSize = 128;
//char words[maxLines][maxWordSize];
//char **words;
char **arr = (char**) calloc(num_elements, sizeof(char*));
for ( i = 0; i < num_elements; i++ ) {
arr[i] = (char*) calloc(num_elements_sub, sizeof(char));
}
FILE *file = NULL;
int amountOfWords = 0;
file = fopen(filename, "r");
if(file == NULL) {
exit(0);
}
while(fgets(words[amountOfWords], 10000, file)) {
words[amountOfWords][strlen(words[amountOfWords]) - 1] = "\0";
amountOfWords++;
}
for(int i = 0; i < amountOfWords; i++) {
printf("a[%d] = ", i);
printf("%s\n", words[i]);
}
printf("The file contains %d words and the same amount of lines.\n", amountOfWords);
return amountOfWords;
The main challenges for this kind of problem are
fgets
.The general approach for these kind of parsing problems, is to design a state machine. The state machine here has two states:
Particularly difficult is the case in which we are in state 2 and reach the end of the buffer. This means that this word spans multiple buffers. To accommodate for this, we deviate slightly from a direct state machine implementation. State 2 is slightly different, depending on if we are reading a new word or continuing one that was started in a previous buffer.
We now keep track of wordSize
. If we start reading from the start of a buffer, but wordSize
is not 0, then we know we are continuing a previous word and we know what size it was for the realloc
we need.
Below is one possible implementation. All the work is done in the wordArrayRead
function. Walking through it from the top of the function:
First we declare the variables that we need across lineBuffer
reads: an index for the word itself and the length of the word we are currently reading, followed by the declaration of the buffer itself. The outside loop repeatedly reads using fgets
until we have exhausted the input.
We start reading at index 0 and stop at the null-terminator. The first if-statement checks if we should be in state 2: either the current character is the start of a word or we were already reading a word.
State 2
The index wordStartIdx
stays at the first character of the word (segment) and we walk the wordEndIdx
to the end of the word (segment) or to the end of the buffer.
We then check if we need to increase the size of the array of strings. Here we increase it to 2 times + 1 the previous size to avoid frequent reallocations.
We set a boolean value, indication whether we have reached the end of a word. If we have, we need to allocate for and write the null-terminator at the end of the string.
If wordLength == 0
it means we are reading a new word and have to allocate memory for it for the first time. If wordLength != 0
, we have to reallocate to append to an existing word.
We copy the word (segment) currently in the lineBuffer
to the array of strings.
Now, we do some bookkeeping. If we reached the end of a word, we write the null-terminator, increment the index to point to the next word location and reset wordLength
. If this wasn't the case, we only increment the wordLength
with the length of the segment we just read. Finally, we update wordStartIdx
, which still points to the start of the word, to point to the end of the word, so we can continue iterating over the buffer.
State 1
Having finishing the State 2 processing, we go into State 1 which has only two lines. It simply advances the index until we land at non-whitespace. Note that the null-terminator of the lineBuffer
('\0'
) does not count as whitespace, so this loop will not continue past the end of the buffer.
After all input has been processed, we shrink the array of strings to the actual size of its data. This "corrects" the allocation policy of increasing the size by 2n+1 each time it wasn't large enough.
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// BUFFER_SIZE must be >1U
#define BUFFER_SIZE 1024U
struct WordArray
{
char **words;
size_t numberOfWords;
};
static struct WordArray wordArrayConstruct(void);
static void wordArrayResize(struct WordArray *wordArray, size_t const newSize);
static void wordArrayDestruct(struct WordArray *wordArray);
static void wordArrayRead(FILE *restrict stream, struct WordArray *wordArray);
static char *reallocStringWrapper(char *restrict str, size_t const newSize);
static void wordArrayPrint(struct WordArray const *wordArray);
int main(void)
{
struct WordArray wordArray = wordArrayConstruct();
wordArrayRead(stdin, &wordArray);
wordArrayPrint(&wordArray);
wordArrayDestruct(&wordArray);
}
static void wordArrayRead(FILE *restrict stream, struct WordArray *wordArray)
{
size_t wordArrayIdx = 0U;
size_t wordLength = 0U;
char lineBuffer[BUFFER_SIZE];
while (fgets(lineBuffer, sizeof lineBuffer, stream) != NULL)
{
size_t wordStartIdx = 0U;
while (lineBuffer[wordStartIdx] != '\0')
{
if (!isspace(lineBuffer[wordStartIdx]) || wordLength != 0U)
{
size_t wordEndIdx = wordStartIdx;
while (!isspace(lineBuffer[wordEndIdx]) && wordEndIdx != BUFFER_SIZE - 1U)
++wordEndIdx;
if (wordArrayIdx >= wordArray->numberOfWords)
wordArrayResize(wordArray, wordArray->numberOfWords * 2U + 1U);
size_t wordSegmentLength = wordEndIdx - wordStartIdx;
size_t foundWordEnd = wordEndIdx != BUFFER_SIZE - 1U; // 0 or 1 bool
// Allocate for a new word, or reallocate for an existing word
// If a word end was found, add 1 to the size for the '\0' character
char *dest = wordLength == 0U ? NULL : wordArray->words[wordArrayIdx];
size_t allocSize = wordLength + wordSegmentLength + foundWordEnd;
wordArray->words[wordArrayIdx] = reallocStringWrapper(dest, allocSize);
memcpy(&(wordArray->words[wordArrayIdx][wordLength]),
&lineBuffer[wordStartIdx], wordSegmentLength);
if (foundWordEnd)
{
wordArray->words[wordArrayIdx][wordLength + wordSegmentLength] = '\0';
++wordArrayIdx;
wordLength = 0U;
}
else
{
wordLength += wordSegmentLength;
}
wordStartIdx = wordEndIdx;
}
while (isspace(lineBuffer[wordStartIdx]))
++wordStartIdx;
}
}
// All done. Shrink the words array to the size of the actual data
if (wordArray->numberOfWords != 0U)
wordArrayResize(wordArray, wordArrayIdx);
}
static struct WordArray wordArrayConstruct(void)
{
return (struct WordArray) {.words = NULL, .numberOfWords = 0U};
}
static void wordArrayResize(struct WordArray *wordArray, size_t const newSize)
{
assert(newSize > 0U);
char **tmp = (char**) realloc(wordArray->words, newSize * sizeof *wordArray->words);
if (tmp == NULL)
{
wordArrayDestruct(wordArray);
fprintf(stderr, "WordArray allocation error\n");
exit(EXIT_FAILURE);
}
wordArray->words = tmp;
wordArray->numberOfWords = newSize;
}
static void wordArrayDestruct(struct WordArray *wordArray)
{
for (size_t wordStartIdx = 0U; wordStartIdx < wordArray->numberOfWords; ++wordStartIdx)
{
free(wordArray->words[wordStartIdx]);
wordArray->words[wordStartIdx] = NULL;
}
free(wordArray->words);
}
static char *reallocStringWrapper(char *restrict str, size_t const newSize)
{
char *tmp = (char*) realloc(str, newSize);
if (tmp == NULL)
{
free(str);
fprintf(stderr, "Realloc string allocation error\n");
exit(EXIT_FAILURE);
}
return tmp;
}
static void wordArrayPrint(struct WordArray const *wordArray)
{
for (size_t wordStartIdx = 0U; wordStartIdx < wordArray->numberOfWords; ++wordStartIdx)
printf("%zu: %s\n", wordStartIdx, wordArray->words[wordStartIdx]);
}
Note: This program reads input from stdin
, as Unix/Linux utilities typically do. Use input redirection to read from a file, or provide a file descriptor to the readWordArray
function.