NOTICE: This is my same question as earlier, but I simplified it to the required structure. If it is still not looking good, please let me know what is wrong with the formatting for future reference.
I have a project for class where I have to make a lexical analyzer for TIPs. I am having an error in counting the newlines to display on the left side of the screen. Currently, it has a regular expression that increments the line_counter variable whenever \n or \r are used. My problem is that it does not pick up on new line symbols in a string. The program will display it how it is supposed to with the newlines, however it will not increment the counter.
DESIRED OUTPUT:
WHAT I HAVE DONE SO FAR:
ERRORS: I am not receiving any errors.
The below is a snippet of the relevant regular expressions that I am referencing in this question. Also, at the bottom of the question are the actual and expected outputs.
snippet from rules file
/* STRING LITERAL REGEX */
[']([^'\\]|\\(.|\n))*['] { if(yyleng <= 80)
{
return TOK_STRINGLIT;
}
else
{
return TOK_UNKNOWN;
}
}
/* REGEX TO COUNT NEW LINES */
[\r\n] { line_number++; }
Expected output
line: 16, lexeme: |'This string
has
newlines
inside of it'|, length: 43, token: 4003
line: 20, lexeme: |&|, length: 1, token: 6000
ERROR: unknown token
Actual output
line: 16, lexeme: |'This string
has
newlines
inside of it'|, length: 43, token: 4003
line: 17, lexeme: |&|, length: 1, token: 6000
ERROR: unknown token
REPRODUCIBLE FILES
LEXER.H
//*****************************************************************************
// CSE 4713 / 6713 Project - List of tokens for TIPS
//*****************************************************************************
#ifndef LEXER_H
#define LEXER_H
// List of token codes
// Each category of token gets its own numeric block (1000s keywords,
// 2000s punctuation, 3000s operators, ...) so the driver can print the
// raw code and a grader can identify the category at a glance.
// Keywords
#define TOK_BEGIN 1000
#define TOK_BREAK 1001
#define TOK_CONTINUE 1002
#define TOK_DOWNTO 1003
#define TOK_ELSE 1004
#define TOK_END 1005
#define TOK_FOR 1006
#define TOK_IF 1007
#define TOK_LET 1008
#define TOK_PROGRAM 1009
#define TOK_READ 1010
// NOTE(review): 1011 is unused here — presumably a keyword was removed
// from the assignment spec; confirm the gap is intentional.
#define TOK_THEN 1012
#define TOK_TO 1013
#define TOK_VAR 1014
#define TOK_WHILE 1015
#define TOK_WRITE 1016
// Datatype Specifiers
#define TOK_INTEGER 1100
#define TOK_REAL 1101
// Punctuation
#define TOK_SEMICOLON 2000
#define TOK_COLON 2001
#define TOK_OPENPAREN 2002
#define TOK_CLOSEPAREN 2003
#define TOK_OPENBRACE 2004
#define TOK_CLOSEBRACE 2005
// Operators
#define TOK_PLUS 3000
#define TOK_MINUS 3001
#define TOK_MULTIPLY 3002
#define TOK_DIVIDE 3003
#define TOK_ASSIGN 3004
#define TOK_EQUALTO 3005
#define TOK_LESSTHAN 3006
#define TOK_GREATERTHAN 3007
#define TOK_NOTEQUALTO 3008
#define TOK_MOD 3009
#define TOK_NOT 3010
#define TOK_OR 3011
#define TOK_AND 3012
// Useful abstractions
#define TOK_IDENT 4000 // identifier
#define TOK_INTLIT 4001 // integer literal
#define TOK_FLOATLIT 4002 // floating point literal
#define TOK_STRINGLIT 4003 // string literal
#define TOK_EOF 5000 // end of file
#define TOK_EOF_SL 5001 // end of file while parsing a string literal
#define TOK_UNKNOWN 6000 // unknown lexeme
#endif
DRIVER.CPP
//*****************************************************************************
// CSE 4713 / 6713 Project Part 1 - Lexical Analyzer Driver
// Fall 2020
//*****************************************************************************
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include "lexer.h"
// Instantiate global variables
extern "C"
{
extern FILE *yyin; // input stream
extern FILE *yyout; // output stream
extern int yyleng; // length of current lexeme
extern char *yytext; // text of current lexeme
extern int yylex(); // the generated lexical analyzer
extern int line_number; // current line number of the input
}
// Do the analysis
// Driver entry point: select the input stream (a file named on the command
// line, or stdin when no argument is given), then pump tokens out of the
// flex-generated scanner until TOK_EOF, echoing each lexeme with the line
// number, lexeme text, length, and numeric token code.
int main( int argc, char* argv[] ) {
// Set the input stream: file argument wins, otherwise read from stdin.
if (argc > 1) {
printf("INFO: Using the file %s for input\n", argv[1]);
yyin = fopen(argv[1], "r");
if (!yyin) {
printf(" ERROR: input file not found\n");
return (-1);
}
}
else {
printf("INFO: Using stdin for input, use EOF to end input\n");
printf(" Windows EOF is Ctrl+z, Linux EOF is Ctrl+d\n");
yyin = stdin;
}
// Set the output stream
yyout = stdout;
// Token pump: one iteration per token; yylex() reports TOK_EOF when done.
for (int token = yylex(); token != TOK_EOF; token = yylex())
{
// Report what the scanner found on this call.
fprintf(yyout, "line: %d, lexeme: |%s|, length: %d, token: %d\n",
line_number, yytext, yyleng, token);
// Flag the two error tokens so they stand out in the transcript.
if( token == TOK_UNKNOWN )
fprintf(yyout," ERROR: unknown token\n");
if( token == TOK_EOF_SL )
fprintf(yyout," ERROR: end of file while in a string literal\n");
}
return 0;
}
RULES.L
/*******************************************************************
Starting point your rules.l file for TIPS
Name: Stephanie Schisler NetID: sas880
Course: CSE 4713 Assignment: Part 1
Programming Environment: WSL C++
Purpose of File: Contains the rules for the project.
*******************************************************************/
%option noyywrap
%{
#include "lexer.h"
// global variable to hold current line number being read
int line_number = 1;
%}
%%
/* Keywords */
BEGIN { return TOK_BEGIN; }
BREAK { return TOK_BREAK; }
CONTINUE { return TOK_CONTINUE; }
DOWNTO { return TOK_DOWNTO; }
ELSE { return TOK_ELSE; }
END { return TOK_END; }
FOR { return TOK_FOR; }
IF { return TOK_IF; }
LET { return TOK_LET; }
PROGRAM { return TOK_PROGRAM; }
READ { return TOK_READ; }
THEN { return TOK_THEN; }
TO { return TOK_TO; }
VAR { return TOK_VAR; }
WHILE { return TOK_WHILE; }
WRITE { return TOK_WRITE; }
/* Datatype Specifiers */
INTEGER { return TOK_INTEGER; }
REAL { return TOK_REAL; }
/* Punctuation */
\; { return TOK_SEMICOLON; }
\: { return TOK_COLON; }
\( { return TOK_OPENPAREN; }
\) { return TOK_CLOSEPAREN; }
\{ { return TOK_OPENBRACE; }
\} { return TOK_CLOSEBRACE; }
/* Operators */
\+ { return TOK_PLUS; }
- { return TOK_MINUS; }
\* { return TOK_MULTIPLY; }
\/ { return TOK_DIVIDE; }
\:= { return TOK_ASSIGN; }
\= { return TOK_EQUALTO; }
\< { return TOK_LESSTHAN; }
\> { return TOK_GREATERTHAN; }
\<> { return TOK_NOTEQUALTO; }
MOD { return TOK_MOD; }
NOT { return TOK_NOT; }
OR { return TOK_OR; }
AND { return TOK_AND; }
/* Abstractions */
[A-Z][0-9A-Z]{0,7} { return TOK_IDENT; }
[0-9]+ { return TOK_INTLIT; }
[0-9]+[.]?[0-9]+ { return TOK_FLOATLIT; }
[']([^'\\]|\\(.|\n))*['] { if(yyleng <= 80)
{
return TOK_STRINGLIT;
}
else
{
return TOK_UNKNOWN;
}
}
"\[[^"\\]|\\(.|\n)]*|'\[[^'\\]|\\(.|\n)]* { return TOK_EOF_SL; }
/* Count new lines */
[\r\n] { line_number++; }
/* Eat any whitespace */
[\t ]*
/* Found an unknown character */
. { return TOK_UNKNOWN; }
/* Recognize end of file */
<<EOF>> { return TOK_EOF; }
INPUT FILE
ABCDEFGH
ABCDEFGHIJ
AB123 123AB A123ZZ SUM IFFINESS
AB_123
ab_123
123
3219012894910
12.132
.123
0.132
-123 -12.324
%%%%%%%
^
'This is a string'
'This string has tabs inside of it.'
'This string
has
newlines
inside of it'
&
CORRECT OUTPUT
ABCDEFGH
ABCDEFGHIJ
AB123 123AB A123ZZ SUM IFFINESS
AB_123
ab_123
123
3219012894910
12.132
.123
0.132
-123 -12.324
%%%%%%%
^
'This is a string'
'This string has tabs inside of it.'
'This string
has
newlines
inside of it'
&
MAKEFILE
###############################################################################
# CSE 4713 / 6713 Project Part 1 - Lexical Analyzer (flex)
#
# 'make' build executable file
# 'make clean' removes all intermediate (lex.yy.c and *.o) and executable files
#
# This makefile purposely avoids macros to make the rules more clear.
# For more information about makefiles:
# http://www.cs.colby.edu/maxwell/courses/tutorials/maketutor/
# http://www.cs.swarthmore.edu/~newhall/unixhelp/howto_makefiles.html
# http://www.gnu.org/software/make/manual/make.html
#
###############################################################################
lex.exe: lex.yy.o driver.o
g++ -g -o lex.exe lex.yy.o driver.o
driver.o: driver.cpp lexer.h
g++ -g -o driver.o -c driver.cpp
lex.yy.o: lex.yy.c lexer.h
gcc -g -o lex.yy.o -c lex.yy.c
lex.yy.c: rules.l lexer.h
flex -o lex.yy.c rules.l
clean:
$(RM) *.o lex.yy.c lex.exe
Your line counter is not being incremented by newline characters inside strings because nothing in your action for the string pattern changes the line counter.
(F)lex lexical analyzers divide the input into tokens, based on the patterns you provide, and for each token the associated action is executed. Patterns are not matched inside other patterns: that would lead to chaos. (`tiffany` does not contain an `if` token, for example; it's a single indivisible identifier.)
The easiest way to get an accurate line count out of a lexical analyser built with flex is to include the option
%option yylineno
in your prologue (the part of the flex input file before the first %%
). Once you do that, flex will do everything for you, and yylineno
will always contain the line number count. (It contains the line number count at the end of the token, which is important: if you want to know which line a multiline token started at, you need to do a (very little) bit more.)
It's possible that you have been told not to use yylineno
. (Personally, I think assignment restrictions like that are misguided, but I don't always see eye-to-eye with instructors.) If that is the case, you'll need to do what flex would have done for you automatically, which is rescan any token which might contain a newline character to count the number of newlines it contains, if any:
[']([^'\\]|\\(.|\n))*['] { for (const char* p = yytext; *p; ++p) {
if (*p == '\n') ++line_number;
}
/* Rest of the string action */
...
You can make your rescan more efficient (also applies to the flex-generated lexical analyser's rescan, since it will add code to do exactly what I did above) by using a start condition to handle the internals of a string literal. For example, you could change that code to:
%x STRING_LITERAL
%%
['] { yymore(); BEGIN(STRING_LITERAL); }
<STRING_LITERAL>{
[^'\\\n]+ { yymore(); }
\\?\r?\n { ++line_number; yymore(); }
\\. { yymore(); }
['] { BEGIN(INITIAL); /* Return to the normal scan */
/* At this point, yytext and yyleng refer to the
* entire token, including the ' marks. So you can
* now do exactly what you did in your string literal
* action. But see below for some comments.
*/
...
}
}
/* Other lexical rules continue here */
Take a look at the flex manual section on start conditions for a longer description of how start conditions work.
Note that all of the actions inside the string token, except the last one which actually accepts the token, contain a call to yymore()
. yymore()
is a special (f)lex action which tells the analyser that the token is not yet complete, and that the next pattern will match another part of the current token.
A few side notes on your code:
Your end-of-line pattern [\r\n]
matches either \r
or \n
. Windows line endings are actually the two-character sequence \r\n
, so you will end up incrementing line_number
twice if you encounter one. (However, you are not too likely to encounter one, unless you open the input file in binary mode, because the standard library should remove the \r
when it reads the line.) The correct pattern to match either a Windows or a Unix line-ending sequence is
\r?\n
which will match the entire line-ending. You will sometimes see more elaborate patterns, and if your instructor cares about the Museum of Historical Artifacts Never to be Seen Again, they might ask you to cope with conventions that haven't been in use in this century, like the pre-OS-X Apple convention of just using \r
. My advice is to resist taking that on. Just count the \n
s, whether or not they were preceded by \r
. That will be correct on any system you are likely to come across.
Your test for excessively long strings is imprecise (I think we've mentioned this fact in another question). First, it counts the quote marks in the length of the string literal, which is probably not correct. Second, it counts escape sequences by the length of the escape sequence itself, rather than the single character the escape sequence is translated into. That will result in counting a
as a single character, but the semantically equivalent \x61
as four characters, which is likely to cause perfectly valid less-than-80-character-long literals to be rejected by your lexical scanner. If you use the start-condition solution to counting lines suggested above, you can also correct the computed length of the literal by keeping a separate count of excess characters. (You cannot change yyleng
inside the token because yymore()
depends on being able to maintain it and yytext
. See the manual link above.)