c string file character-encoding string-comparison

Reading strings from a file in C and correctly handling the '\0' terminator and UTF-8 characters

I want to write a Hangman game which randomly chooses a word from a list in a text file.

The implementation works if I set the word by initialization:

char wort[] =  "Apfelbäumchen";

If I read a string from a line of the file, it doesn't work anymore. I get a string that is at least one character longer than the word itself. I figured out this is because of the '\0' string-terminating character. Also, characters like the German ä, ü, and ö seem to be encoded differently, so the comparison reports that the strings are not equal.

Could you please help me?

#include <stdio.h>
#include <string.h>
#include <stdlib.h> // Wofür diese?
#include <time.h> //für den Zufall

#define MAX_WORD_LENGTH 50


// Picks one line at random from the text file `filename` and returns a
// heap-allocated copy of it (made with strdup). Returns NULL if the file
// cannot be opened.
//
// NOTE(review): each line is copied exactly as fgets stored it, so a
// trailing '\n' read from the file remains part of the returned string.
char* WortAusDatei(const char* filename) {
    FILE* file = fopen(filename, "r");
    if (file == NULL){
        printf("Datei konnte nicht gefunden werden.\n");
        return NULL;
    }
    
    // The generator is reseeded from the clock on every call of this function.
    srand(time(0));
    int wordCount = 0;
    char word[MAX_WORD_LENGTH];
    char* selectedWord = NULL;
    
    // The k-th line read replaces the current pick whenever rand() % k == 0,
    // so the first line is always picked initially.
    while (fgets(word, MAX_WORD_LENGTH, file) != NULL) {
        if (rand() % ++wordCount == 0) {
            // NOTE(review): the pointer from an earlier strdup is overwritten
            // here without being freed.
            selectedWord = strdup(word);
            //selectedWord = strcpy(word);
        }
    }

    fclose(file);

    // NOTE(review): if no line was read, selectedWord is still NULL at this
    // point and is passed to printf("%s") and strlen below.
    printf("Debug: Das Wort heißt %s", selectedWord);
    // NOTE(review): strlen returns size_t, but the format uses %d.
    printf("Debug: , es ist %d Buchstaben lang. \n" , strlen(selectedWord));

    //char* wort = malloc(100 * sizeof(char)); // reserve memory for the word
    //strcpy(wort, "Bratpfanne"); // copy the word into the reserved memory
    return selectedWord; // return the string
}

// Hangman game loop: fetches a word from "worte.txt", then repeatedly prints
// the partially revealed word, reads a guess, and ends when the word is
// guessed or the lives run out.
int main(){
    const char* filename = "worte.txt";
    // NOTE(review): WortAusDatei returns NULL when the file cannot be opened;
    // wort is used below (printf, strlen) without a NULL check.
    char* wort = WortAusDatei(filename);    
    //char wort[] = strcpy(WortAusDatei(filename));
    //char wort[] =  "Apfelbaum";
    printf ("%s", wort);
    //free(wort);
    //printf ("%s", wort);

char eingabe[100];
    int leben = 10;

    // NOTE(review): strlen already excludes the terminating '\0', so the
    // "-1" drops the last byte of the string instead. The variable is not
    // used anywhere else in this function.
    int zeichenAnzahl = strlen(wort)-1; // Minus one to NOT count the closing zero character

    // One flag per byte of wort; 1 means "already guessed".
    // NOTE(review): the elements are never initialized before they are read.
    int eraten[strlen(wort)];
    int i;
    
    //paintHangman;
            
    // Start of one game round; the goto at the end of main jumps back here.
    anfang:
    printf("\n");

    // Show a placeholder for each letter
    // (iterates over strlen(wort) - 1 bytes, i.e. skips the last byte).
    for (i = 0; i < strlen(wort) - 1; i++){
        if (eraten[i] == 1){
            printf("%c ", wort[i]);
        } else {
            printf("_ ");
        }
    }
    
    printf("\n\n");

    // NOTE(review): "%s" has no field width, so input longer than 99 bytes
    // overflows eingabe.
    scanf("%s", eingabe); 

    // A guess that equals the whole word wins immediately.
    if (strcmp(eingabe, wort) == 0){
        printf("Gewonnen!\n");
        return 0;
    }

    // If the input is a single letter (i.e. has length 1),
    // check whether that letter occurs in the word.
    if (strlen(eingabe) == 1){
        int i; // shadows the outer i
        int trefferanzahl = 0;
        // Compares byte by byte: only the first byte of the input is used.
        for (i = 0; wort[i] != '\0'; i++){
            if (wort[i] == eingabe[0]){
                trefferanzahl++;
                printf("Treffer\n");
                eraten[i] = 1;

                // Check whether all letters of the word have been guessed
                // (iterates over all strlen(wort) bytes, unlike the display
                // loop above, which stops one byte earlier).
                int alleEraten = 1;
                for (int i = 0; i < strlen(wort); i++){
                    if (eraten[i] != 1){
                        alleEraten = 0;
                        break;
                    }
                }
                if (alleEraten){
                    printf("Gewonnen, das Lösungswort ist: %s\n", wort);
                    return 0;
                }
                
            }
        }
        // No byte matched: the guess costs one life.
        if (trefferanzahl == 0){
            leben--;
            printf("Du hast nurnoch %d Leben\n", leben);
            if (leben == 0){
                printf("Verloren");
                return 0;
            }
        }

    } else {
        printf("Eingabe ungültig\n");
    }

    // Next round.
    goto anfang;

    // Not reached: the goto above always jumps back.
    return 0;
}

I did my research on strings, and tried to use different ways to copy the string like strdup and strcpy.

I tested that it works if I set the word by initialization instead of reading it from a file.

Solution

First, you will probably want to use setlocale early on in your program, otherwise you will be operating in the C locale. This can be as simple as

setlocale(LC_ALL, "");

which sets the locale based on the user's preferences (the environment), or something like

setlocale(LC_ALL, "en_US.UTF-8"); /* or "de_DE.UTF-8", or whatnot */

to set it explicitly.

In UTF-8, ä (U+00E4) is a multibyte character, consisting of the sequence 0xC3 0xA4.

A program that demonstrates this:

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Walk a multibyte string one character at a time and print each character
 * together with its byte length and the hexadecimal value of every byte.
 */
int main(void)
{
    setlocale(LC_ALL, "en_US.UTF-8"); /* or an equally appropriate locale */

    char word[] = "Apfelbäumchen";
    const char *cursor = word;

    for (;;) {
        /* mblen yields 0 at the terminator and -1 on an invalid sequence. */
        int width = mblen(cursor, MB_CUR_MAX);

        if (width <= 0)
            break;

        printf("%.*s | UTF-8 MBS<%d>: ", width, cursor, width);

        for (const char *byte = cursor; byte < cursor + width; byte++)
            printf("0x%hhX ", *byte);

        printf("\n");
        cursor += width;
    }
}

Output:

A | UTF-8 MBS<1>: 0x41 
p | UTF-8 MBS<1>: 0x70 
f | UTF-8 MBS<1>: 0x66 
e | UTF-8 MBS<1>: 0x65 
l | UTF-8 MBS<1>: 0x6C 
b | UTF-8 MBS<1>: 0x62 
ä | UTF-8 MBS<2>: 0xC3 0xA4 
u | UTF-8 MBS<1>: 0x75 
m | UTF-8 MBS<1>: 0x6D 
c | UTF-8 MBS<1>: 0x63 
h | UTF-8 MBS<1>: 0x68 
e | UTF-8 MBS<1>: 0x65 
n | UTF-8 MBS<1>: 0x6E

Null-terminated multibyte character strings (NTMBS) mostly interface fine with "narrow" byte string functions (e.g., puts, strcpy), but things get troublesome when trying to deal with individual characters in a NTMBS.

Thus, strlen("ä") will count 2 bytes.

That additional byte is causing the inconsistency you attempt to fix here:

int zeichenAnzahl = strlen(wort)-1; // Minus one to NOT count the closing zero character

(Note that strlen does not include the null-terminating byte in its count.)

As seen above, mblen (consider also mbrlen) can be used to determine the byte-length of multibyte characters. printf("%.*s", length, start_of_sequence) can be used to print a maximum number of bytes from a string.

You will need to count these characters more carefully so that your array of flags (eraten) can be sized and indexed accurately with regards to the input (in the example above, that could be something as simple as count++ within the loop).

Additionally, comparing two individual multibyte characters generally requires knowledge of their byte-lengths (e.g., with strncmp / memcmp, but AFAIK this can fall apart with complex shift states, mixed encodings, or duplicates/equivalents).

Alternatively, multibyte characters and strings can be converted to wide characters and strings, or the entire program could be written with wide character support. For example, wcslen(L"ä") is 1 and L"äb"[0] == L"bä"[1] is generally true. You may find this to be the easiest route.

(Note that care must be taken not to mix calls to byte ("narrow") and wide character I/O functions on the same stream.)

See also: ICU.

When fgets reads a newline character, and there is room in the buffer, it places the newline character in the buffer. This is another source of strings being one byte longer than expected.

Conversely, scanf("%s", eingabe) will never place a newline in the buffer, as %s terminates upon reading whitespace.

(Note that an unbounded scanf("%s", ...) is as dangerous as gets. For example, if you have char buf[128];, limit the input with scanf("%127s", ...), which leaves room for the null-terminating byte. Or just use fgets instead.)

You will need to handle these newlines to normalize your inputs (as in Removing trailing newline character from fgets() input).

The following

if (rand() % ++wordCount == 0) {
    selectedWord = strdup(word);

randomly leaks memory provided by previous calls to strdup.

The selection technique itself is roughly a streaming version of reservoir sampling (with k = 1): the n-th line read replaces the current choice with probability 1/n.

To avoid constantly allocating and deallocating memory, this can be done with two buffers, A and B, where read lines populate B one-by-one, and are randomly copied to A (first line read is always copied). Alternatively, instead of copying, pointer swaps can be performed.

The result is the contents of A, which can be passed to strdup at the end of the function.

It is generally best to call srand just once, at the start of the program. Even if you are currently calling WortAusDatei only once, that might change and multiple calls of srand(time(0)) within the same second will result in the same random sequence.

Avoid using goto to create loops - simply use while or for. If you find the inside of a loop getting too nested, refactor parts of it into functions.

Here is a cursory example implementing much of what I discussed, and somewhat of a refactoring of your program. Someone more experienced with encodings (and shift states) can probably poke some holes in this, but it should be alright for a small use case (just don't use it as a definitive reference).

#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_WORD_LENGTH 64

/*
 * Report whether each of the first `length` flags in `set` is true.
 * An empty range (length == 0) yields true.
 */
static bool every(bool *set, size_t length)
{
    const bool *flag = set;
    size_t remaining = length;

    while (remaining > 0) {
        if (*flag == false)
            return false;

        flag++;
        remaining--;
    }

    return true;
}

/*
 * Fallback word source for when the word file yields nothing usable.
 *
 * Prints a warning naming the offending file `badf` to stderr, then returns
 * a heap-allocated copy of a built-in word chosen with rand(). The caller
 * owns the returned string and must free() it. If the copy cannot be
 * allocated, an error is reported and the program exits with EXIT_FAILURE.
 */
static char *get_default_word(const char *badf)
{
    static const char *fallbacks[] = {
        "error",
        "sea",
        "spell",
        "elegant",
        "zippy",
        "chalk",
        "sweet",
        "join",
        "superb"
    };
    const size_t total = sizeof fallbacks / sizeof fallbacks[0];

    fprintf(stderr,
            "WARN: Generating default word. Check validity of word file `%s`.\n", badf);

    const char *chosen = fallbacks[rand() % total];
    char *copy = strdup(chosen);

    if (copy == NULL) {
        perror("Could not generate default word. Something is quite wrong");
        exit(EXIT_FAILURE);
    }

    return copy;
}

/*
 * Pick one word at random from the text file `filename` (one word per line).
 *
 * Every usable line is a candidate; the n-th candidate replaces the current
 * choice when rand() % n == 0. A line is usable when it is non-empty after
 * its line ending ("\n" or "\r\n") has been removed and when its content
 * fits into MAX_WORD_LENGTH - 1 bytes. Longer lines are skipped as a whole
 * rather than being split into fragments.
 *
 * Returns a heap-allocated copy of the chosen word, which the caller must
 * free(). Returns NULL if the file cannot be opened (the reason is reported
 * via perror), if it contains no usable line, or if the copy cannot be
 * allocated.
 */
static char *word_from_file(const char *filename) {
    FILE *file = fopen(filename, "r");

    if (!file) {
        perror(filename);
        return NULL;
    }

    size_t candidates = 0;
    char word[MAX_WORD_LENGTH] = { 0 };
    char temp[MAX_WORD_LENGTH] = { 0 };

    while (fgets(temp, sizeof temp, file)) {
        size_t end = strcspn(temp, "\n");

        if (temp[end] != '\n') {
            /*
             * fgets stopped without seeing a newline: either this is the
             * final, unterminated line, or the line did not fit into temp.
             * Peek at the next byte to tell the two cases apart.
             */
            int next = getc(file);

            if (next != EOF && next != '\n') {
                /* Over-long line: discard the rest so that the fragments
                 * are not counted as separate words. */
                while ((next = getc(file)) != EOF && next != '\n')
                    ;

                continue;
            }
        }

        /* Drop the carriage return of a CRLF line ending as well. */
        if (end > 0 && temp[end - 1] == '\r')
            end--;

        temp[end] = '\0';

        /* Blank lines are not words. */
        if (end == 0)
            continue;

        if ((rand() % ++candidates) < 1)
            strcpy(word, temp);
    }

    fclose(file);

    return *word ? strdup(word) : NULL;
}

/*
 * Count the bytes of `mbs` whose top two bits are not 10, i.e. every byte
 * that is not a UTF-8 continuation byte. For a well-formed UTF-8 string
 * this is the number of encoded characters.
 */
static size_t character_count(const char *mbs)
{
    size_t total = 0;
    const char *byte = mbs;

    while (*byte != '\0') {
        /* The comparison yields 1 for a lead/ASCII byte and 0 otherwise. */
        total += ((*byte & 0xc0) != 0x80);
        byte++;
    }

    return total;
}

/*
 * Print the multibyte string `string` character by character, followed by a
 * newline. Character number i is printed as itself when flags[i] is set and
 * as an underscore otherwise; each is followed by a space. At most `length`
 * characters are printed, and printing stops early at the terminator or at
 * a sequence mblen cannot decode.
 */
static void display(const char *string, bool *flags, size_t length)
{
    const char *cursor = string;

    /* Reset mblen's internal conversion state before scanning. */
    mblen(NULL, 0);

    for (size_t shown = 0; shown < length; shown++) {
        int width = mblen(cursor, MB_CUR_MAX);

        if (width <= 0)
            break;

        if (!flags[shown])
            fputs("_ ", stdout);
        else
            printf("%.*s ", width, cursor);

        cursor += width;
    }

    putchar('\n');
}

/*
 * Set flags[i] for every character i of the multibyte string `mbs` that is
 * byte-for-byte equal to the first character of `query`, looking at no more
 * than `length` characters. Flags of non-matching characters are left
 * untouched.
 *
 * Returns the number of matching characters found in this call.
 */
static size_t mark(const char *mbs, bool *flags, size_t length, const char *query)
{
    /* Measure the first character of the query from a fresh state. */
    mblen(NULL, 0);
    const int query_width = mblen(query, MB_CUR_MAX);

    /* Reset again before scanning the word itself. */
    mblen(NULL, 0);

    size_t hits = 0;
    const char *cursor = mbs;

    for (size_t pos = 0; pos < length; pos++) {
        const int width = mblen(cursor, MB_CUR_MAX);

        if (width <= 0)
            break;

        /* Equal byte length is required before the bytes are compared. */
        if (width == query_width && memcmp(cursor, query, (size_t) width) == 0) {
            flags[pos] = true;
            hits++;
        }

        cursor += width;
    }

    return hits;
}

/*
 * Hangman driver.
 *
 * Takes the word file from argv[1] (default "words.txt"), falls back to a
 * built-in word when the file yields nothing, and then runs the guessing
 * loop until the word is complete, the ten lives are used up, or input ends.
 */
int main(int argc, char **argv)
{
    setlocale(LC_ALL, "");
    srand((unsigned) time(NULL));

    const char *filename = "words.txt";

    if (argc > 1)
        filename = argv[1];

    char *word = word_from_file(filename);

    if (word == NULL)
        word = get_default_word(filename);

    fprintf(stderr, "DEBUG: <<%s>>\n", word);

    /* One "revealed" flag per character of the word, all initially clear. */
    const size_t length = character_count(word);
    bool guess[length];

    for (size_t i = 0; i < length; i++)
        guess[i] = false;

    unsigned lives = 10;
    char input[128];

    for (;;) {
        display(word, guess, length);

        printf("Enter a guess: ");

        /* End of input (or a read error) ends the game. */
        if (fgets(input, sizeof input, stdin) == NULL)
            break;

        /* Cut the line at its newline, if one was read. */
        input[strcspn(input, "\n")] = '\0';

        if (mark(word, guess, length, input) > 0) {
            puts("Hit!");
        } else {
            puts("Miss!");
            lives--;

            if (lives == 0) {
                puts("Game over! You ran out of lives!");
                break;
            }
        }

        if (every(guess, length)) {
            puts("That's it! You won!");
            break;
        }

        printf("Not quite there! You have %u lives remaining.\n", lives);
    }

    printf("The word was: %s\n", word);
    free(word);
}

In use:

DEBUG: <<Apfelbäumchen>>
_ _ _ _ _ _ _ _ _ _ _ _ _ 
Enter a guess: A
Hit!
Not quite there! You have 10 lives remaining.
...
...
A p f e l b _ u m c h e n 
Enter a guess: ä
Hit!
That's it! You won!
The word was: Apfelbäumchen