c parsing scanf

Parsing command line input filename to check the correctness of the content


I want to parse the file name given on the command line and check its correctness: (1) the total length, (2) the expected extension, (3) the positions of the '_' characters, and the type and length of the other fields.

The sequence should be as follows:

$check.exe input_file  L2A30000_0102051303042026_0001.dat  

It should check that the output file name (L2A30000_0102051303042026_0001.dat) is well-formed: not by comparing exact values, but by checking the type and length of each field.

/*
 * Check whether a string consists solely of decimal digits.
 *
 * str: NUL-terminated string to inspect (must not be NULL).
 *
 * Returns 1 if every character is a digit (an empty string also yields 1),
 * or 0 as soon as a non-digit character is found.
 */
int isNumeric(const char *str) {
    while (*str) {
        /* The <ctype.h> functions require a value representable as
           unsigned char (or EOF); a negative plain char is undefined. */
        if (!isdigit((unsigned char)*str)) {
            return 0;  // Not a digit
        }
        str++;
    }
    return 1;  // All characters are digits
}

/*
 * Entry point: expects exactly two arguments (input file, output file) and
 * tries to validate the output file name by splitting it with a single
 * sscanf() call and then checking each extracted piece.
 *
 * Returns 1 on a wrong argument count or when the checks fail, 0 otherwise.
 */
int main(int argc, char *argv[]) {
    // Check if the correct number of command line arguments is  
    provided
    if (argc != 3) {
        printf("Usage: %s inputfile outputfile\n", argv[0]);
        return 1;
    }

   // Extract the output file name from the command line arguments
   const char *outputFileName = argv[2];

   // Define the expected format: single characters for the leading
   // letter/digit pairs and the separators, char arrays (one extra byte for
   // the terminator) for the digit runs and the extension.
   char asciiChar1, numChar1, asciiChar2, numChar2, numChar3[5],      
   underscore1, numChar4[17], underscore2, numChar5[5],  
   numChar6[4], extension[4];

   // NOTE(review): the format string below contains 12 conversion
   // specifications but only 11 destination arguments are passed.
   // "%4[.dat]" may store up to 4 characters plus a terminator, while
   // extension[] has room for 4 bytes in total.
   int result = sscanf(outputFileName, 
   "%c%c%c%c%4[0-9]%c%16[0-9]%c%1[0-9]%3[0-9]_%3[0-9]%4[.dat]",
                    &asciiChar1, &numChar1, &asciiChar2, 
   &numChar2, numChar3, &underscore1, numChar4, &underscore2, 
   numChar5, numChar6, extension);

  // Debugging print statement
  printf("Debug: sscanf result: %d\n", result);

  // Dump every extracted field; fields sscanf() did not reach are printed
  // without having been assigned.
  printf("Debug: asciiChar1: %c\n", asciiChar1);
  printf("Debug: numChar1: %c\n", numChar1);
  printf("Debug: asciiChar2: %c\n", asciiChar2);
  printf("Debug: numChar2: %c\n", numChar2);
  printf("Debug: numChar3: %s\n", numChar3);
  printf("Debug: underscore1: %c\n", underscore1);
  printf("Debug: numChar4: %s\n", numChar4);
  printf("Debug: underscore2: %c\n", underscore2);
  printf("Debug: numChar5: %s\n", numChar5);
  printf("Debug: numChar6: %s\n", numChar6);
  printf("Debug: extension: %s\n", extension);

 // Check if the extracted values match the expected format
 // NOTE(review): "strlen(extension) != 3" and
 // "strcmp(extension, ".dat") != 0" cannot both be false, because ".dat" is
 // 4 characters long, so this condition is true for every input. Likewise
 // numChar5 is filled by "%1[0-9]" (at most 1 character) but is required to
 // have length 4 here.
 if (result != 12 || !isalpha(asciiChar1) || !isdigit(numChar1) || 
    !isalpha(asciiChar2) || !isdigit(numChar2) ||
    strlen(numChar3) != 4 || !isNumeric(numChar3) ||    
    strlen(numChar4) != 16 || !isNumeric(numChar4) ||
    strlen(numChar5) != 4 || !isNumeric(numChar5) || 
    strlen(numChar6) != 3 || !isNumeric(numChar6) ||
    strlen(extension) != 3 || strcmp(extension, ".dat") != 0) {

    printf("Error: Output file format is incorrect.\n");
    return 1;
}

// If all checks pass, the output file format is correct
 printf("Output file format is correct.\n");

 return 0;
}

Command line input:

.\check.exe inputfile L2A30000_0102051303042026_0001.dat

This is the output I am getting:

Debug: sscanf result: 9
...
Debug: numChar5: 0001
Debug: extension:
Error: Output file format is incorrect.

This is the output I am expecting:

Debug: extension:.dat

The extension part is not working; the other parts are OK. I want to check whether the file name's extension is .dat, and if it is not, print an error message and exit.


Solution

  • I suggest you introduce some extra white space in the format string and matching arguments along these lines:

        // The format string is split into one literal per group so that
        // each piece can be read next to the argument it fills.
        int result = sscanf(outputFileName, 
            "%c%c"      // asciiChar1, numChar1
            "%c%c"      // asciiChar2, numChar2
            "%4[0-9]"   // numChar3
            "%c"        // underscore1
            "%16[0-9]"  // numChar4
            "%c"        // underscore2
            "%1[0-9]"   // numChar5 (stores at most 1 digit)
            "%3[0-9]_%3[0-9]%4[.dat]", // 3 conversions, but only 2 arguments (numChar6, extension) remain
            &asciiChar1, &numChar1,
            &asciiChar2, &numChar2,
            numChar3,
            &underscore1,
            numChar4,
            &underscore2,
            numChar5,
            numChar6,
            extension
        );
    

    So we are good up to the 2nd underscore. Then you expect a single digit ("%1[0-9]" into char numChar5[5]), which doesn't match the size of the variable. Then 3 more digits (char numChar6[4]), which is OK. Then a 3rd underscore, which isn't in the input. Then 3 more digits, which have no matching argument. Then "%4[.dat]", which results in a buffer overflow because the extension variable is a char extension[4]. Overall there are 12 format directives and 11 arguments, which is undefined behavior.

    You can simplify it by hard-coding your fixed strings:

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>
    
    /*
     * Return 1 if str consists only of decimal digits (an empty string
     * counts as numeric), 0 otherwise.
     *
     * str: NUL-terminated string to inspect (must not be NULL).
     */
    int isNumeric(const char *str) {
        /* Cast needed: the <ctype.h> functions are undefined for negative
           plain-char values. */
        while (isdigit((unsigned char)*str)) {
            str++;
        }
        /* Numeric only if the scan stopped at the terminator. */
        return *str == '\0';
    }
    
    /*
     * Validate the output file name passed as the second command-line
     * argument. The expected shape is:
     *
     *   letter digit letter digit, 4 digits, '_', 16 digits, '_', 4 digits,
     *   then the literal extension ".dat" and nothing else.
     *
     * Returns 1 on a wrong argument count or a malformed name, 0 when the
     * name is well-formed.
     */
    int main(int argc, char *argv[]) {
        if (argc != 3) {
            printf("Usage: %s inputfile outputfile\n", argv[0]);
            return 1;
        }
        const char *outputFileName = argv[2];

        /* Initialise every output so the debug prints below never read an
           indeterminate value when sscanf() stops early. */
        char asciiChar1 = '\0', numChar1 = '\0';
        char asciiChar2 = '\0', numChar2 = '\0';
        char numChar3[5] = "", numChar4[17] = "", numChar5[5] = "";

        /* sscanf()'s return value counts assigned conversions only, so a
           literal after the last conversion (".dat") is never reflected in
           it. %n records how many characters were consumed, and it is only
           stored if everything before it, including ".dat", matched. */
        int consumed = -1;
        int result = sscanf(outputFileName,
            "%c%c"
            "%c%c"
            "%4[0-9]"
            "_"
            "%16[0-9]"
            "_"
            "%4[0-9]"
            ".dat%n",
            &asciiChar1, &numChar1,
            &asciiChar2, &numChar2,
            numChar3,
            numChar4,
            numChar5,
            &consumed
        );
        printf("Debug: sscanf result: %d\n", result);
        printf("Debug: asciiChar1: %c\n", asciiChar1);
        printf("Debug: numChar1: %c\n", numChar1);
        printf("Debug: asciiChar2: %c\n", asciiChar2);
        printf("Debug: numChar2: %c\n", numChar2);
        printf("Debug: numChar3: %s\n", numChar3);
        printf("Debug: numChar4: %s\n", numChar4);
        printf("Debug: numChar5: %s\n", numChar5);

        /* consumed < 0 means ".dat" did not match; a non-NUL character at
           outputFileName[consumed] means trailing junk after the extension.
           The casts keep the <ctype.h> calls defined for negative chars. */
        if (result != 7 ||
            consumed < 0 || outputFileName[consumed] != '\0' ||
            !isalpha((unsigned char)asciiChar1) ||
            !isdigit((unsigned char)numChar1) ||
            !isalpha((unsigned char)asciiChar2) ||
            !isdigit((unsigned char)numChar2) ||
            strlen(numChar3) != 4 || !isNumeric(numChar3) ||
            strlen(numChar4) != 16 || !isNumeric(numChar4) ||
            strlen(numChar5) != 4 || !isNumeric(numChar5)
        ) {

            printf("Error: Output file format is incorrect.\n");
            return 1;
        }
        printf("Output file format is correct.\n");
        return 0;
    }
    
    

    with example run:

    ./a.out  input_file L2A30000_0102051303042026_0001.dat
    Debug: sscanf result: 7
    Debug: asciiChar1: L
    Debug: numChar1: 2
    Debug: asciiChar2: A
    Debug: numChar2: 3
    Debug: numChar3: 0000
    Debug: numChar4: 0102051303042026
    Debug: numChar5: 0001
    Output file format is correct.
    

    Another approach would be to parse the file name by hand, as in is_valid_filename(), possibly via a little interpreter, as in is_valid_filename2():

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>
    
    /*
     * Consume one alphabetic character.
     *
     * s: current parse position, or NULL if an earlier step already failed.
     *
     * Returns a pointer just past the letter, or NULL if s is NULL or *s is
     * not alphabetic (which includes the end of the string).
     */
    const char *alpha(const char *s) {
        if (!s) {
            return NULL;
        }
        /* Cast needed: isalpha() is undefined for negative plain-char values. */
        if (!isalpha((unsigned char)*s)) {
            return NULL;
        }
        return s + 1;
    }
    
    /*
     * Consume exactly n decimal digits.
     *
     * s: current parse position, or NULL if an earlier step already failed.
     * n: number of digits required.
     *
     * Returns a pointer just past the n digits, or NULL if s is NULL or any
     * of the first n characters is not a digit. The string terminator is not
     * a digit, so the scan never runs past the end of a shorter string.
     */
    const char *digits(const char *s, size_t n) {
        if (!s) {
            return NULL;
        }
        for (size_t i = 0; i < n; i++) {
            /* Cast needed: isdigit() is undefined for negative plain-char values. */
            if (!isdigit((unsigned char)s[i])) {
                return NULL;
            }
        }
        return s + n;
    }
    /*
     * Consume the literal string s2 at the current position.
     *
     * s:  current parse position, or NULL if an earlier step already failed.
     * s2: NUL-terminated literal that must appear at s.
     *
     * Returns a pointer just past the matched literal, or NULL if s is NULL
     * or s does not start with s2.
     */
    const char *str(const char *s, const char *s2) {
        if (s == NULL) {
            return NULL;
        }
        const size_t len = strlen(s2);
        return strncmp(s, s2, len) == 0 ? s + len : NULL;
    }
    
    /*
     * Check a file name against the fixed layout
     *   letter, 1 digit, letter, 5 digits, "_", 16 digits, "_", 4 digits, ".dat"
     * with nothing following the extension.
     *
     * Returns 1 if the whole string matches, 0 otherwise (a NULL s yields 0).
     */
    int is_valid_filename(const char *s) {
        const char *pos = s;
        /* Each step advances pos; the chain stops at the first failure. */
        int matched =
            (pos = alpha(pos)) != NULL &&
            (pos = digits(pos, 1)) != NULL &&
            (pos = alpha(pos)) != NULL &&
            (pos = digits(pos, 5)) != NULL &&
            (pos = str(pos, "_")) != NULL &&
            (pos = digits(pos, 16)) != NULL &&
            (pos = str(pos, "_")) != NULL &&
            (pos = digits(pos, 4)) != NULL &&
            (pos = str(pos, ".dat")) != NULL;
        /* Valid only if the last step landed on the terminator. */
        return matched && *pos == '\0';
    }
    
    /*
     * Table-driven variant of the file name check: the expected layout is
     * described as a list of steps (one letter, n digits, or a literal
     * string), and a small loop applies them in order.
     *
     * Returns 1 if every step matches and the string ends right after the
     * last one, 0 otherwise (a NULL s yields 0).
     */
    int is_valid_filename2(const char *s) {
        enum step_kind { ALPHA, DIGITS, STR };
        struct step {
            enum step_kind type;
            union {
                int n;          /* digit count, used by DIGITS */
                const char *s;  /* literal text, used by STR */
            };
        };
        static const struct step format[] = {
            { ALPHA },
            { DIGITS, .n = 1 },
            { ALPHA },
            { DIGITS, .n = 5 },
            { STR, .s = "_" },
            { DIGITS, .n = 16 },
            { STR, .s = "_" },
            { DIGITS, .n = 4 },
            { STR, .s = ".dat" },
        };
        const size_t count = sizeof format / sizeof format[0];

        const char *pos = s;
        size_t i = 0;
        /* Stop at the first failing step (pos becomes NULL) or when the
           table is exhausted. */
        while (pos != NULL && i < count) {
            const struct step *step = &format[i++];
            if (step->type == ALPHA) {
                pos = alpha(pos);
            } else if (step->type == DIGITS) {
                pos = digits(pos, step->n);
            } else if (step->type == STR) {
                pos = str(pos, step->s);
            }
        }
        return pos != NULL && *pos == '\0';
    }
    
    /*
     * Run both validators on the second command-line argument and print
     * "valid" or "invalid" for each, one per line.
     *
     * Returns 1 if the argument count is wrong, 0 otherwise.
     */
    int main(int argc, char *argv[]) {
        if (argc != 3) {
            printf("Usage: %s inputfile outputfile\n", argv[0]);
            return 1;
        }
        const char *name = argv[2];
        printf("%s\n", is_valid_filename(name) ? "valid" : "invalid");
        printf("%s\n", is_valid_filename2(name) ? "valid" : "invalid");
        return 0;
    }