Search code examples
c++cbisonflex-lexer

Flex/Bison Markdown to HTML Program


This is for a homework assignment. The only code I've edited myself are the definitions, rules, and tokens. What I have so far compiles successfully but gives me a segmentation fault when I try to run it on the markdown file (.md), and the HTML output is just a blank file because of that.

%{
#define YYSTYPE char *
#include <string.h>
#include "miniMD2html.tab.h"
extern YYSTYPE yylval;
%}
%option yylineno

/* Flex definitions */
whitespace [ \t]+
newline [\n]+|{whitespace}[\n]+
textword [a-zA-Z:/.\-,\']+
integer [0-9]+
header #|##|###|####|#####

%%
{header} { return T_HEADER; }
{integer} { return T_INTEGER; }
{textword} { return T_TEXTWORD; }
{whitespace} { return T_BLANK; }
{newline} { return T_NEWLINE; }
%%

The generate functions are given in another file. Most of them just accept char*, the generate_header function takes an int and char*, and the generate_image function takes two char* and two int. The grammar may look weird but this is what was given in the assignment.

%{
#include "global.h"
#include "stdlib.h"
#include "stdio.h"
#define YYSTYPE char *
extern int yylex();
int yywrap();
int yyerror(const char*);
int yyparse();
extern FILE *yyin;
Html_Doc *html_doc;
%}

/* Define tokens here */
%token T_BLANK T_NEWLINE
%token T_HEADER T_INTEGER T_TEXTWORD

%% /* Grammar rules and actions follow */
s: mddoc;
mddoc: /*empty*/ | mddoc paragraph;
paragraph: T_NEWLINE {add_linebreak(html_doc);} 
        | pcontent T_NEWLINE {add_element(html_doc, $1); free($1);} ;
pcontent: header
        | rftext {generate_paragraph($1);}
header: T_HEADER T_BLANK rftext {generate_header(strlen($1), $3);}
rftext: rftext T_BLANK rftextword {strappend($1, $3);}
        | rftext rftextword {strappend($1, $2);}
        | rftextword
rftextword: textnum | image | format
image: "![" text "](" text '=' T_INTEGER '@' T_INTEGER ')' {generate_image($2, $4, atoi($6), atoi($8));}
format: "**" text "**" {generate_bold($2);}
        | '_' text '_' {generate_italic($2);}
        | "**" format "**" {generate_bold($2);}
        | '_' format '_' {generate_italic($2);}
text: text T_BLANK textnum {strappend($1, $3);}
        | text textnum {strappend($1, $2);}
        | textnum
textnum: T_TEXTWORD | T_INTEGER
%%

int main(int argc, char *argv[]) {
    // yydebug = 1;

    FILE *fconfig = fopen(argv[1], "r");
    // make sure it is valid
    if (!fconfig) {
        printf("Error reading file!\n");
        return -1;
    }
    html_doc = new_html_doc();
    // set lex to read from file
    yyin = fconfig;
    int ret = yyparse();
    output_result(html_doc);
    del_html_doc(html_doc);
    return ret;
}

int yywrap(){
    return 1;
}

int yyerror(const char* s){
    extern int yylineno;
    extern char *yytext;
    printf("error while parsing line %d: %s at '%s', ASCII code: %d\n", yylineno, s, yytext, (int)(*yytext));
    return 1;
}

Solution

  • None of your flex rules ever set the value of yylval, so it will be NULL throughout. And so will all the references to semantic values ($n) in your grammar. Since most functions which take a char* assume that it is a valid string, it's pretty likely that one of them will soon try to examine the string value, and the fact that the pointer is NULL will certainly lead to a segfault.

    In addition, there are both single character and quoted string tokens in your grammar, none of which can be produced by your scanner. So it's quite likely that the parser will stop with a syntax error as soon as one of the non-word characters is encountered in the input.