Currently using Flex to scan a grammar that defines a subset of C. Here is the .l file.
/*definitions*/
%{
#include <stdio.h>
int yylex(void);
%}
/*rules*/
%%
    /* Scanner rules: every action prints "<TOKEN_NAME> <lexeme>" on its own line.
       Flex picks the longest match and, on a tie, the rule listed first, so the
       keyword rules must stay above the IDENTIFIER rule. */
"int" {printf("TYPE_INT %s\n",yytext);}
"float" {printf("TYPE_FLOAT %s\n",yytext);}
"char" {printf("TYPE_CHAR %s\n",yytext);}
"bool" {printf("TYPE_BOOL %s\n",yytext);}
"while" {printf("WHILE %s\n",yytext);}
"do" {printf("DO %s\n",yytext);}
"for" {printf("FOR %s\n",yytext);}
"printf" {printf("PRINTF %s\n",yytext);}
"scanf" {printf("SCANF %s\n",yytext);}
"if" {printf("IF %s\n",yytext);}
"elif" {printf("ELIF %s\n",yytext);}
"else if" {printf("ELSE_IF %s\n",yytext);}
"else" {printf("ELSE %s\n",yytext);}
"true" {printf("TRUE %s\n",yytext);}
"false" {printf("FALSE %s\n",yytext);}
"void" {printf("VOID %s\n",yytext);}
"main" {printf("MAIN %s\n",yytext);}
"return" {printf("RETURN %s\n",yytext);}
    /* Identifiers: a letter or underscore followed by letters, digits or
       underscores (underscores are allowed after the first character too, so
       foo_bar is a single token). */
[a-zA-Z_][a-zA-Z0-9_]* {printf("IDENTIFIER %s\n",yytext);}
    /* Numeric and string literals.
       NOTE(review): the optional sign is part of the literal, so by longest
       match "a-1" scans as IDENTIFIER a, INTEGER -1 (no MINUS token). */
[-+]?(([1-9][0-9]*)|0) {printf("INTEGER %s\n",yytext);}
[-+]?[0-9]+\.[0-9]+ {printf("FLOAT %s\n",yytext);}
\"[^\"\n]*\" {printf("STRING %s\n",yytext);}
    /* Punctuation and operators. Two-character operators win over their
       one-character prefixes because flex prefers the longest match. */
"," {printf("COMMA %s\n",yytext);}
";" {printf("SEMICOLON %s\n",yytext);}
"{" {printf("LEFT_BRACE %s\n",yytext);}
"}" {printf("RIGHT_BRACE %s\n",yytext);}
"(" {printf("LEFT_PAREN %s\n",yytext);}
")" {printf("RIGHT_PAREN %s\n",yytext);}
"[" {printf("LEFT_BRACKET %s\n",yytext);}
"]" {printf("RIGHT_BRACKET %s\n",yytext);}
"-" {printf("MINUS %s\n",yytext);}
"+" {printf("PLUS %s\n",yytext);}
"*" {printf("MULTIPLY %s\n",yytext);}
"/" {printf("DIVIDE %s\n",yytext);}
"\\" {printf("BACKSLASH %s\n",yytext);}
"%" {printf("MODULUS %s\n",yytext);}
"==" {printf("EQUALS %s\n",yytext);}
"!=" {printf("NOT_EQUALS %s\n",yytext);}
"<" {printf("LESS_THAN %s\n",yytext);}
">" {printf("GREATER_THAN %s\n",yytext);}
"<=" {printf("LESS_THAN_OR_EQUAL %s\n",yytext);}
">=" {printf("GREATER_THAN_OR_EQUAL %s\n",yytext);}
"=" {printf("ASSIGN %s\n",yytext);}
"&&" {printf("LOGICAL_AND %s\n",yytext);}
"||" {printf("LOGICAL_OR %s\n",yytext);}
"!" {printf("LOGICAL_NOT %s\n",yytext);}
    /* A single whitespace character. Inside [...] the characters '"' and '|'
       are ordinary members, not quoting or alternation, so they must not be
       listed here; \r is included so CRLF input is handled. */
[ \t\n\r\f\v] {printf("WHITESPACE\n");}
    /* Catch-all: any single character no rule above accepts. */
. {printf("UNRECOGNIZED_CHARACTER %s\n",yytext);}
%%
/* End-of-input hook for the scanner (relevant when scanning multiple input
   files): always answers 1, i.e. there is no further input to switch to. */
int yywrap(void)
{
    const int no_more_input = 1;
    return no_more_input;
}
/* Main driver: takes the input file name as argv[1], scans that file with
   yylex() (which prints one line per token on stdout), and returns 0 on
   success or 1 on a usage or file-open error. */
int main(int argc, char *argv[]){
    if(argc<2){
        /* Diagnostics go to stderr so they never mix with the token stream. */
        fprintf(stderr,"Usage: %s <input_file_name>\n",argv[0]);
        return 1;
    }
    FILE *fp = fopen(argv[1], "r");
    if(fp == NULL){
        fprintf(stderr,"Error opening input file.\n");
        /* Also report which file failed and why (from errno). */
        perror(argv[1]);
        return 1;
    }
    yyin = fp;   /* make the scanner read from the opened file */
    yylex();
    fclose(fp);
    return 0;
}
The issue I have is with certain inputs such as `90.s3` and `232a3`. Instead of getting `UNRECOGNIZED_CHARACTER` as in the last grammar rule, flex splits them apart.
232a3
becomes
INTEGER 232
IDENTIFIER a3
and 90.s3
becomes
INTEGER 90
UNRECOGNIZED_CHARACTER .
IDENTIFIER s3
How should I go about stopping this from happening?
> How should I go about stopping this from happening?
There are several alternatives. Among them, you could use patterns with trailing context. For example, this `INTEGER` pattern recognizes an integer token only where it is immediately followed by the end of the file or a character that is neither a decimal point nor an upper- or lowercase letter:
[-+]?(([1-9][0-9]*)|0)/([^.A-Za-z]|<<EOF>>)
Or you could play games with start conditions.
But your best option would probably be to add a pattern to match the unwanted tokens and flag the error. Example:
[a-zA-Z_][a-zA-Z0-9]* { /* IDENTIFIER */ }
([1-9][0-9]*)|0 { /* INTEGER */ }
[0-9]+[.][0-9]* { /* FLOAT version 1 */ }
[.][0-9]+ { /* FLOAT version 2 */ }
/* Used only if it affords a longer match than any of the above: */
[0-9.][0-9]*[a-zA-Z_.][0-9a-zA-Z_.]* { /* ERROR */ }
Note that this example omits the leading `+` and `-` from the patterns for numbers. They will be matched instead by your existing patterns for the `+` and `-` operators, which is exactly what they are in C.
Note also that you would need to go to more effort -- possibly more than one error pattern -- if you were supporting the `.` as a member-selection operator as in C.