Search code examples
c linux compiler-construction flex-lexer lex

How does flex recognize substrings?


I am new to flex and am working on lexical analysis for the C language. I want to output all keywords, identifiers, literals, operators and separators. This is my program, lexer.l, which does not work.

%{
#include<stdio.h>
/* Line number printed in front of every token; incremented by the \n rule. */
int currentLine=1;
%}
%%
    /* Each rule prints the current line, a tab, then <lexeme-or-name,category>. */
    /* The two preprocessor rules print the fixed words include / define, not yytext. */
#include<.*> printf("%d\t<%s,%s>\n",currentLine,"include","PreProcessor");
#define[^\n]+  printf("%d\t<%s,%s>\n",currentLine,"define","PreProcessor");
= {printf("%d\t<%s,%s>\n",currentLine,yytext,"AssignmentOperator");}
int|short|signed|unsigned|long|double|float|char|void|enum|union|struct|auto|const|register|static|volatile|extern|typedef|if|else|while|do|for|switch|case|continue|break|default|sizeof|goto|return   {printf("%d\t<%s,%s>\n",currentLine,yytext,"Keyword");}
    /* Blanks and tabs are discarded; a newline only advances the line counter. */
[\t ]   ;
\n currentLine++;
(\"[^\"]*\")    {printf("%d\t<%s,%s>\n",currentLine,yytext,"String Literal");}
\( printf("%d\t<%s,%s>\n",currentLine,yytext,"parenthesisOpen");
\) printf("%d\t<%s,%s>\n",currentLine,yytext,"parenthesisClose");
\{ printf("%d\t<%s,%s>\n",currentLine,yytext,"blockOpen");
\} printf("%d\t<%s,%s>\n",currentLine,yytext,"blockClose");
    /* NOTE(review): <= is reported as ArithmeticOperator; <, > and >= have no rule of their own. */
"+"|"-"|"/"|"*"|"<="    {printf("%d\t<%s,%s>\n",currentLine,yytext,"ArithmeticOperator");}
(\&\&)|(\|\|)|! printf("%d\t<%s,%s>\n",currentLine,yytext,"LogicalOperator");
&|\||~ printf("%d\t<%s,%s>\n",currentLine,yytext,"BitwiseOperator");
    /* NOTE(review): [^\n] has no repetition, so this matches the two slashes plus exactly one more character. */
\/\/[^\n] printf("%d\t<%s,%s>\n",currentLine,yytext,"SingleLineComment");
    /* NOTE(review): the dot never matches a newline in flex, so only comments that open and close on one line are recognised. */
(\/\*.*\*\/)    printf("%d\t<%s,%s>\n",currentLine,yytext,"MultiLineComment");
;   printf("%d\t<%s,%s>\n",currentLine,yytext,"Separator");
    /* NOTE(review): this pattern matches everything up to the end of the line. flex always takes the */
    /* longest match, so this rule beats every shorter token rule unless the token ends its line. */
    /* There is also no rule for identifiers or numeric literals. */
.* printf("%s\tany match\n",yytext);
%%

/*
 * End-of-input hook invoked by the flex scanner.
 * Always returns 1, which flex interprets as "there is no further input".
 */
int yywrap(){
    const int no_more_input = 1;

    return no_more_input;
}

/*
 * Scanner driver: takes the path of the file to tokenize as its only
 * command-line argument, prints a two-line header and runs yylex() on it.
 *
 * Returns 0 after scanning, 1 when the argument count is wrong and 2 when
 * the file cannot be opened.
 */
int main(int argc, char *argv[])
{
    const char *path;

    /* Exactly one argument (besides the program name) is accepted. */
    if (argc != 2) {
        fputs("Invalid arguments !\n Usage: lexgen <filename>\n", stdout);
        return 1;
    }

    path = argv[1];
    yyin = fopen(path, "r");
    if (!yyin) {
        fputs("File not found !\n", stdout);
        return 2;
    }

    fputs("Lexical Analyser for C :-\n", stdout);
    fputs("Line\tToken\n", stdout);

    yylex();
    fclose(yyin);

    return 0;
}

input file:

#include<stdio.h>
#define PI 3.14
int a=5;
double
< + - *
<= >= ! ~
"hskldjh";

This is another program, tmp.l, which does work (it tokenizes int a=5; correctly, whereas lexer.l just lumps that whole line into "any match"):

%{
#include<stdio.h>
#include<string.h>
/* err:  room for 20 error messages of at most 49 characters each.
   name: symbol table with room for 20 identifiers of at most 19 characters each. */
char err[20][50],name[20][20];
/* lno: current line number; cnt: entries used in name;
   ecnt: entries used in err and elno; elno: line number recorded for each error. */
int lno=1,cnt=0,ecnt=0,elno[20];
/* Adds an identifier to name unless an equal entry is already stored. */
void st_add(char *);
%}

%%
    /* Token rules print: line number, lexeme, token class. */
    /* There is no catch-all rule, so text that matches nothing is echoed unchanged by the flex default rule. */
[0-9]+   {printf("%d %s Number\n",lno,yytext);}
    /* NOTE(review): inside the brackets, plus-minus-slash forms a range from plus to slash, */
    /* so comma, minus and dot are matched as well as the four intended operators. */
[+-/*]   {printf("%d %s Operator\n",lno,yytext);}
=   {printf("%d %s Assignment\n",lno,yytext);}
    /* Keyword and data-type rules precede the identifier rule; on equal-length matches flex takes the earlier rule. */
main|return|include|if|else|switch|cin|cout|using|namespace|std {printf("%d %s Keyword\n",lno,yytext);}
int|double|char|float {printf("%d %s Data type\n",lno,yytext);}
[\t ]      ;
\n   {lno++;}
    /* Comments are discarded. The block-comment patterns reject any star or slash inside the body. */
(\/\/.*) ;
(\/\*[^*/]*\*\/) ;
    /* NOTE(review): the three error rules in this file store into err and elno without checking ecnt against 20. */
(\/\*[^*/]*)  {elno[ecnt]=lno;char str[100]="Unterminated comment";strcpy(err[ecnt],str);ecnt++;}
printf|scanf  {printf("%d %s Library function\n",lno,yytext);}
    /* Identifiers must start with a lower-case letter; each one is also passed to st_add. */
[a-z]+[a-zA-Z0-9]* {printf("%d %s Identifier\n",lno,yytext);st_add(yytext);}
([a-zA-Z0-9]+\.h) {printf("%d %s Header\n",lno,yytext);}
\(   {printf("%d %s Open bracket\n",lno,yytext);}
\)   {printf("%d %s Close bracket\n",lno,yytext);}
\<<   {printf("%d %s insertion\n",lno,yytext);}
\>>   {printf("%d %s extraction\n",lno,yytext);}
\{   {printf("%d %s Block start\n",lno,yytext);}
\}   {printf("%d %s Block end\n",lno,yytext);}
#   {printf("%d %s Preprocessor\n",lno,yytext);}
;   {printf("%d %s Terminator\n",lno,yytext);}
(\"[^\"]*\")  {printf("%d %s String literal\n",lno,yytext);} 
    /* An opening quote that reaches the end of the line is recorded as an error; the consumed newline is counted here. */
(\"[^\"\n]*\n)  {elno[ecnt]=lno;char str[100]="Unterminated quote";strcpy(err[ecnt],str);ecnt++;lno++;}

    /* NOTE(review): the range A-z also covers the ASCII punctuation that sits between Z and a. */
[0-9]+[a-zA-z]*  {elno[ecnt]=lno;char str[100]="Unrecognized token";strcpy(err[ecnt],str);ecnt++;}  
%%
/*
 * Record an identifier in the symbol table `name`, skipping duplicates.
 *
 * s: NUL-terminated lexeme (the identifier rule passes yytext).
 *
 * Each slot of `name` has a fixed width, so a longer identifier is stored
 * truncated and is compared on that stored prefix only; two long identifiers
 * that share the prefix therefore share one entry.  When every slot is in
 * use the identifier is dropped with a warning on stderr instead of being
 * written past the end of the table.
 */
void st_add(char s[20])
{
    const size_t capacity = sizeof name / sizeof name[0];
    const size_t width = sizeof name[0];
    size_t i;

    if (s == NULL) {
        return;
    }

    /* Comparing at most width-1 characters matches truncated entries too. */
    for (i = 0; i < (size_t)cnt; i++) {
        if (strncmp(name[i], s, width - 1) == 0) {
            return;
        }
    }

    if ((size_t)cnt >= capacity) {
        fprintf(stderr, "Symbol table full, ignoring %s\n", s);
        return;
    }

    /* snprintf always NUL-terminates and never writes more than width bytes. */
    snprintf(name[cnt], width, "%s", s);
    cnt++;
}

/*
 * Scanner driver: asks for a file name on standard input, runs yylex() over
 * that file, then prints the collected errors and the symbol table.
 *
 * Returns 0 after scanning, 1 when no file name could be read and 2 when
 * the file cannot be opened.
 */
int main(void)
{
    char file[256];
    int i;

    printf("Enter file name:");
    /* The field width leaves room for the terminating NUL in file. */
    if (scanf("%255s", file) != 1) {
        fprintf(stderr, "No file name given\n");
        return 1;
    }

    yyin = fopen(file, "r");
    if (yyin == NULL) {
        /* With a NULL yyin flex would silently read standard input instead. */
        perror(file);
        return 2;
    }

    printf("Line No. Lexeme  Token\n");
    yylex();
    fclose(yyin);

    printf("Number of errors: %d\n", ecnt);
    for (i = 0; i < ecnt; i++) {
        printf("Line no.: %2d  %s\n", elno[i], err[i]);
    }

    printf("\nSymbol Table\n");
    for (i = 0; i < cnt; i++) {
        printf("%s\n", name[i]);
    }
    return 0;
}

/*
 * Called by the flex scanner when it reaches end of input.
 * Always returns 1, telling flex that there is nothing more to scan.
 */
int yywrap()
{
    enum { INPUT_EXHAUSTED = 1 };

    return INPUT_EXHAUSTED;
}

tmp.l output:

Line No. Lexeme  Token
1 # Preprocessor
1 include Keyword
<1 stdio.h Header
>2 # Preprocessor
2 define Identifier
PI2 3 Number
2 . Operator
2 14 Number
3 int Data type
3 a Identifier
3 = Assignment
3 5 Number
3 ; Terminator
4 double Data type
<5 + Operator
5 - Operator
5 * Operator
<6 = Assignment
>6 = Assignment
!~7 "hskldjh" String literal
7 ; Terminator
Number of errors: 0

Symbol Table
define
a

lexer.l output:

Lexical Analyser for C :-
Line    Token
1   <include,PreProcessor>
2   <define,PreProcessor>
int a=5;    any match
4   <double,Keyword>
< + - * any match
<= >= ! ~   any match
"hskldjh";  any match

lexer.l doesn't even match the rules for relational and arithmetic operators. If the input is just int, it prints <int,Keyword>, but if the input is int a=5; the line is not tokenized at all, whereas tmp.l handles it perfectly well. Is there a specific order in which I should write rules in flex?


Solution

  • (F)lex always uses the rule which has the longest match. The rule .* matches input up to the end of the line, which will be a longer match than any other rule unless the token is at the end of the line.