Search code examples
arrayscparsingstrtok

Parser in C doesn't store values in a array


I write a parser in C to parse a file that describe a graph, with vertex and edges, the graph file looks like this :

c FILE: graph_test
c
c SOURCE: generator
c 
p edge 10 12
e 1 2
e 2 3
e 6 2

the "c" lines correspond to comments (must be ignored), the "p" line describe the number of nodes and the number of Edges, and the nbEdges "e" following lines describe the edges between the two nodes.

I use strtok() to split the string to get only the values that interest me, but when I try to store the values into a array int edges[nb_adges][2], the array is filled with wrong valued

Here's the entire code:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define MAX_SIZE 1000

int main(int argc, char *argv[]) {
    if (argc != 2) {
        printf("\n[USAGE] : ./parser file_path\n");
        exit(1);
    }

    char *file_name = argv[1];
    FILE *file;

    // Open file in read mode
    file = fopen(file_name, "r");
    if (file == NULL) {
        printf("\nUnable to open file '%s'\n", file_name);
        exit(2);
    }

    char line[MAX_SIZE];
    int num_vertices = 0;
    int num_edges = 0;
    int edges[num_edges][2];

    // Read each line from the file
    int i;
    while (fgets(line, MAX_SIZE, file) != NULL) {
        // If the line starts with "c", it is a comment, so we ignore it
        if (line[0] == 'c') {
            continue;
        }
        // If the line starts with "p", it contains the number of vertices and edges
        if (line[0] == 'p') {
            i = 0;
            char *tmp = strtok(line, " ");
            while (tmp != NULL) {
                if (i == 2) {
                    num_vertices = atoi(tmp);
                }
                if (i == 3) {
                    num_edges = atoi(tmp);
                }
                tmp = strtok(NULL, " ");
                i++;
            }
            i = 0;
            continue;
        }

        // If the line starts with "e", we retrieve the edges
        if (line[0] == 'e') {
            char *tmp = strtok(line, " ");
            int j = 0;
            while (tmp != NULL) {
                if (j == 1) {
                    edges[i][0] = atoi(tmp);
                }
                if (j == 2) {
                    edges[i][1] = atoi(tmp);
                }
                printf("tmp : %s\n", tmp);
                tmp = strtok(NULL, " ");
                j++;
            }
            i++;
            continue;
        }
    }

    printf("\nNumber of vertices: %d\n", num_vertices);
    printf("Number of edges: %d\n", num_edges);
    printf("List of edges:\n");

    // Print all edges
    for (int i = 0; i < num_edges; i++) {
        printf("%d -> %d\n", edges[i][0], edges[i][1]);
    }

    // Close the file
    fclose(file);

    return 0;
}

I tried to print the value of tmp in the 'e' loop, it display the correct value of nodes, but the array int edges[nb_edges][2] is filled with stranges values

> Execution 
List of edges :
1 -> 2
2 -> 3
6 -> 2
8 -> 3
3473509 -> 52
8 -> 7
9 -> 4
5 -> 6
7 -> 9
10 -> 1
1 -> 7
5 -> 4

Solution

  • In addition to my top comments ...

    This:

    int nb_aretes = 0;
    int aretes[nb_aretes][2];
    

    This is [effectively]:

    int aretes[0][2];
    

    That's because the size is fixed when the declaration occurs. It does not increase if nb_aretes is increased subsequently.

    We need a fixed size array (e.g.):

    #define ARETES_MAX 10000
    int aretes[ARETES_MAX][2];
    

    The data file does not match up. The p line says 15 edges but there are only 3 e lines.


    Although, I studied French [for four years, 40 years ago], I ran your code through google translate. So, some of this may be from that.

    I changed the code to ignore the p line for nb_edges and to just count the e lines:

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    
    #define MAX_LINE 1000                   // max line size
    #define EDGES_MAX 10000                 // max number of edges/nodes
    
    int
    main(int argc, char *argv[])
    {
        if (argc != 2) {
            printf("\n[USAGE]: ./parser file_path\n");
            exit(1);
        }
    
        char *file_name = argv[1];
        FILE *file;
    
        // Open for reading
        file = fopen(file_name, "r");
        if (file == NULL) {
            printf("\nCould not open file '%s'\n", file_name);
            exit(2);
        }
    
        char line[MAX_LINE];
        int nb_vertices = 0;
    
    // NOTE/BUG: this produces [effectively]
        int nb_edges = 0;
    #if 0
        int edges[nb_edges][2];
    #else
        int counted_edges = 0;
        int edges[EDGES_MAX][2];
    #endif
    
        // We read each line of the file
        int i;
    
        while (fgets(line, MAX_LINE, file) != NULL) {
            // If the line begins with "c" it is a comment, we ignore it
            if (line[0] == 'c') {
                continue;
            }
    
            // If the line begins with "p" this is the line that contains the
            // number of vertices and edges
            if (line[0] == 'p') {
                i = 0;
                char *tmp = strtok(line, " ");
    
                while (tmp != NULL) {
                    if (i == 2) {
                        nb_vertices = atoi(tmp);
                    }
                    if (i == 3) {
    // NOTE/BUG: in the data file, this does _not_ match the number of "e" lines
                        nb_edges = atoi(tmp);
    // NOTE/BUG: this is flagged by the compiler with -Wall
    // NOTE/BUG: using [2] is UB (undefined behavior) because the max index can be
    // is 1
    #if 0
                        edges[nb_edges][2];
    #endif
                    }
                    tmp = strtok(NULL, " ");
                    i++;
                }
                i = 0;
                continue;
            }
    
            // If the line begins with "e" we retrieve the edges
            if (line[0] == 'e') {
                char *tmp = strtok(line, " ");
                int j = 0;
    
                while (tmp != NULL) {
                    if (j == 1) {
    #if 0
                        edges[i][0] = atoi(tmp);
    #else
                        edges[counted_edges][0] = atoi(tmp);
    #endif
                    }
                    if (j == 2) {
    #if 0
                        edges[i][1] = atoi(tmp);
    #else
                        edges[counted_edges][1] = atoi(tmp);
    #endif
                    }
                    printf("tmp: %s\n", tmp);
                    tmp = strtok(NULL, " ");
                    j++;
                }
    #if 0
                i++;
    #else
                ++counted_edges;
    #endif
                continue;
            }
        }
    
        printf("\nNumber of vertices: %d\n", nb_vertices);
        printf("Number of edges: %d (expected)\n", nb_edges);
        printf("Number of edges: %d (calculated)\n", counted_edges);
        printf("List of edges:\n");
    
        // We display all the edges
    #if 0
        for (int i = 0; i < nb_edges; i++) {
    #else
        for (int i = 0; i < counted_edges; i++) {
    #endif
            printf("%d -> %d\n", edges[i][0], edges[i][1]);
        }
    
        // We close the file
        fclose(file);
    
        return 0;
    }
    

    In the above code, I've used cpp conditionals to denote old vs. new code:

    #if 0
    // old code
    #else
    // new code
    #endif
    
    #if 1
    // new code
    #endif
    

    Note: this can be cleaned up by running the file through unifdef -k


    I used your sample input:

    c FILE: graph_test
    c
    c SOURCE: generator
    c
    p edge 10 12
    e 1 2
    e 2 3
    e 6 2
    

    Here is the program output:

    tmp: e
    tmp: 1
    tmp: 2
    
    tmp: e
    tmp: 2
    tmp: 3
    
    tmp: e
    tmp: 6
    tmp: 2
    
    Number of vertices: 10
    Number of edges: 12 (expected)
    Number of edges: 3 (calculated)
    List of edges:
    1 -> 2
    2 -> 3
    6 -> 2
    

    I think the code can be simplified. There is too much replication of strtok calls under each if statement.

    Better to have a loop that splits the line before deciding on the command to process.

    And, a switch/case seems better than the if statements.

    Here's how I would clean things up:

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    
    #define MAX_LINE 1000                   // max line size
    #define EDGES_MAX 10000                 // max number of edges/nodes
    #if 1
    #define MAX_TOKENS  10                  // max tokens per line
    #endif
    
    int
    main(int argc, char *argv[])
    {
        if (argc != 2) {
            printf("\n[USAGE]: ./parser file_path\n");
            exit(1);
        }
    
        char *file_name = argv[1];
        FILE *file;
    
        // Open for reading
        file = fopen(file_name, "r");
        if (file == NULL) {
            printf("\nCould not open file '%s'\n", file_name);
            exit(2);
        }
    
        char line[MAX_LINE];
        char saved_line[MAX_LINE];
    
        int nb_vertices = 0;
        int nb_edges = 0;
        int counted_edges = 0;
        int edges[EDGES_MAX][2];
    
        // We read each line of the file
    
    #if 1
        int ntokens;
        char *tokens[MAX_TOKENS + 1];
    #endif
    
        while (fgets(line, MAX_LINE, file) != NULL) {
            // If the line begins with "c" [or "#"] it is a comment, we ignore it
            if (line[0] == 'c')
                continue;
    #if 1
            if (line[0] == '#')
                continue;
    #endif
    
            // split line into tokens
    #if 1
            // copy in case of error
            strcpy(saved_line,line);
    
            char *tok = strtok(line," \n");
            for (ntokens = 0;  ntokens < MAX_TOKENS;  ++ntokens) {
                if (tok == NULL)
                    break;
                tokens[ntokens] = tok;
                tok = strtok(NULL," \n");
            }
            tokens[ntokens] = NULL;
    
            // ignore blank lines
            if (ntokens == 0)
                continue;
    #endif
    
            switch (line[0]) {
            case 'p':
                // If the line begins with "p" this is the line that contains the
                // number of vertices and edges
                if (ntokens != 4) {
                    printf("bad p line -- ntokens=%d %s",ntokens,saved_line);
                    exit(1);
                }
    
                nb_vertices = atoi(tokens[2]);
                nb_edges = atoi(tokens[3]);
                break;
    
            case 'e':
                // If the line begins with "e" we retrieve the edges
                if (ntokens != 3) {
                    printf("bad e line -- ntokens=%d %s",ntokens,saved_line);
                    exit(1);
                }
    
                edges[counted_edges][0] = atoi(tokens[1]);
                edges[counted_edges][1] = atoi(tokens[2]);
                ++counted_edges;
                break;
    
            default:
                printf("unknown command -- %s",saved_line);
                exit(1);
                break;
            }
        }
    
        printf("\nNumber of vertices: %d\n", nb_vertices);
        printf("Number of edges: %d (expected)\n", nb_edges);
        printf("Number of edges: %d (calculated)\n", counted_edges);
        printf("List of edges:\n");
    
        // We display all the edges
        for (int i = 0; i < counted_edges; i++)
            printf("%d -> %d\n", edges[i][0], edges[i][1]);
    
        // We close the file
        fclose(file);
    
        return 0;
    }
    

    I did not do the following in the code above.

    But, if the intention was that there is only one p line at the top of the file, we could use the:

    int aretes[nb_aretes][2];
    

    But, we'd have to parse the p line above that definition in a separate block. Then, nb_aretes would be set correctly [dynamically] before we see the definition.