Search code examples
c regex lex

Lex program to count number of lines, characters, digits, and key words


I have been working on developing a lex scanner; however, when I feed it my input file, it produces the wrong output. Here is my source code:

%{
/*
 * Definitions for a flex scanner that tallies lines, characters, integers,
 * keywords and comments found in its input.
 */
#include <stdio.h>
#include <stdlib.h>   /* atof, used by the float rule */
#include <string.h>   /* strlen, used by most rule actions */

/* Counters updated by the rule actions and printed by main(). */
int NumberOfLines=0;
int NumberOfChar=0;
int NumberOfIntegers=0;
int KWCount=0;
int NumberOfComments=0;
%}

/* DIGIT is one or more decimal digits.  It must not be able to match the
   empty string: the integer rule uses it bare, and the float rule relies on
   it to require at least one digit before the decimal point. */
DIGIT   [0-9]+
/* ID is a lowercase letter followed by lowercase letters or digits. */
ID  [a-z][a-z0-9]*
/* COMMENT is an exclusive start condition: only rules tagged with it are
   active while it is selected. */
%x COMMENT
%option noyywrap
%%

    /* Rules section.  Each rule is a pattern followed by a C action. */

    /* Start of line, optional tabs, then a slash-star opener: switch to the
       COMMENT start condition.  Nothing is counted here. */
^[\t]*"/*" {BEGIN COMMENT;}
    /* A comment opened and closed on one line, followed only by optional tabs
       and the newline: count one comment.  The newline is consumed by this
       rule, and the action does not touch NumberOfLines or NumberOfChar. */
^[\t]*"/*".*"*/"[\t]*\n {NumberOfComments++;}

    /* Inside COMMENT: a star-slash closer followed by optional tabs and a
       newline returns to the initial state and counts one comment. */
<COMMENT>"*/"[\t]*\n {BEGIN 0; NumberOfComments++;}
    /* Inside COMMENT: a closer followed by anything else returns to the
       initial state without counting. */
<COMMENT>"*/" {BEGIN 0;}
    /* Inside COMMENT: a bare newline, or any single character immediately
       followed by a newline, each add one to NumberOfComments.  No rule in
       this state matches an ordinary character that is not directly followed
       by a newline, so flex's default action applies to those. */
<COMMENT>\n {NumberOfComments++;}
<COMMENT>.\n {NumberOfComments++;}

    /* Catch-all rules.  Each action increments NumberOfChar and then adds
       strlen(yytext) as well, so every character matched here raises
       NumberOfChar by two.  Both rules are listed before the token rules
       below; flex resolves equal-length matches in favour of the earlier
       rule, so any one-character token is taken by the dot rule. */
\n {NumberOfLines++, NumberOfChar++; NumberOfChar +=strlen(yytext);}
. {NumberOfChar++; NumberOfChar +=strlen(yytext);}



    /* Integer: count it and add its length to NumberOfChar. */
{DIGIT}     {NumberOfIntegers++; NumberOfChar +=strlen(yytext); }


    /* Number with a decimal point: print it and add its length to
       NumberOfChar.  NumberOfIntegers is not changed. */
{DIGIT}+"."{DIGIT}* {
    printf("A flot: %s (%g) \n", yytext, atof(yytext));
    NumberOfChar +=strlen(yytext); 
    }

    /* Keyword: print it, count it, and add its length to NumberOfChar. */
if|else|while|return    {
    printf("A keyword: %s\n", yytext); KWCount++;
    NumberOfChar +=strlen(yytext); 
    }

    /* Identifier: print it and add its length to NumberOfChar. */
{ID}        {
    printf("An identifier: %s\n", yytext);
    NumberOfChar +=strlen(yytext); 
    }
    /* Brace-delimited text that stays on one line: only its length is added
       to NumberOfChar; NumberOfComments is not changed. */
"{"[^}\n]*"}"   {
    /* eat up one-line comments */
    NumberOfChar +=strlen(yytext);
    }

%%
/*
 * Entry point.  Scans the file named by the first command-line argument, or
 * standard input when no argument is given, then prints every counter.
 *
 * Returns 0 on success and 1 when the named file cannot be opened.
 */
int main(int argc, char **argv){
    if (argc > 1) {
        yyin = fopen(argv[1], "r");
        if (yyin == NULL) {
            /* A NULL yyin makes the flex scanner read stdin instead, which
               would hide the failure; report it and stop. */
            perror(argv[1]);
            return 1;
        }
    } else {
        yyin = stdin;
    }

    yylex();

    /* Release the input file once scanning is finished. */
    if (yyin != NULL && yyin != stdin)
        fclose(yyin);

    printf("Character count: %d\n", NumberOfChar);
    printf("Number count: %d\n", NumberOfIntegers);
    printf("Keyword count: %d\n", KWCount);
    printf("Line count: %d\n", NumberOfLines);
    printf("Comment count: %d\n", NumberOfComments);
    return 0;
}

Whenever I run the scanner on my input file, it gives me the wrong output. For instance, the output for the file should be:

Output:

Number of Keywords: 3

Number of Characters: 196

Number of Lines: 17

Number of Digits: 3

However the output it is currently producing is:

Output:

Number of keywords: 0

Number of Characters: 3

Number of Lines: 7

Number of Digits: 0   

I suspect it has to do with my regular expressions. Any help would be appreciated, as I am still learning regex!

Here is my input file contents:

/*comment 1*/
/*comment
  comment 2 
  */
  /*comment 3*
   */if this is a line
{comment 4}
int i = 789; 
int j = 689;
if i == 172 then
 {comment 5}
else
{comment 6}
{comment 7}
/*8 comments
 *
 */ 
end

Solution

  • Here's some mostly working code, closely based on your code.

    %{
    /*
     * Definitions for a flex scanner that tallies lines, characters,
     * integers, floats, keywords and identifiers found in its input.
     */
    #include <stdio.h>
    #include <stdlib.h>   /* atof, used by the float rule */
    #include <string.h>   /* strlen, used by most rule actions */
    
    /* Counters updated by the rule actions and printed by main(). */
    int NumberOfLines=0;
    int NumberOfChar=0;
    int NumberOfIntegers=0;
    int KWCount = 0;   /* keywords */
    int IDCount = 0;   /* identifiers */
    int RCCount = 0;   /* raw characters: every rule adds what it matched */
    int OCCount = 0;   /* characters matched by the brace-comment rule */
    int DTCount = 0;   /* characters matched by the dot rule */
    int FLCount = 0;   /* floating-point numbers */
    %}
    
    %option noyywrap
    %option noinput
    %option nounput
    
    /* DIGIT is one or more decimal digits.  It must not be able to match the
       empty string: the integer rule uses it bare, and the float rule relies
       on it to require at least one digit before the decimal point. */
    DIGIT   [0-9]+
    /* ID is a lowercase letter followed by lowercase letters or digits. */
    ID  [a-z][a-z0-9]*
    
    %%
    
        /* Rules section.  Every action adds the matched length to RCCount,
           so RCCount ends up as the total number of characters read. */

        /* Newline: counts a line and a character. */
    \n {NumberOfLines++; NumberOfChar++; RCCount += strlen(yytext); }
        /* Any other single character: counted in NumberOfChar and DTCount
           and echoed in quotes.  This rule is listed before the token rules
           below; flex resolves equal-length matches in favour of the earlier
           rule, so any one-character token is taken here. */
    . {NumberOfChar++; DTCount++; RCCount++; printf(" '%c'", yytext[0]); }
    
        /* Integer: count it. */
    {DIGIT}     {NumberOfIntegers++; RCCount += strlen(yytext); }
    
        /* Number with a decimal point: print it and count it in FLCount. */
    {DIGIT}+"."{DIGIT}* {
        printf("\nA float: %s (%g) \n", yytext, atof(yytext)); 
        RCCount += strlen(yytext);
        FLCount++;
        }
    
        /* Keyword: print it and count it in KWCount. */
    if|else|while|return    {
        printf("\nA keyword: %s\n", yytext); 
        KWCount++;
        RCCount += strlen(yytext);
        }
    
        /* Identifier: print it and count it in IDCount. */
    {ID}        {
        printf("\nAn identifier: %s\n", yytext); 
        IDCount++;
        RCCount += strlen(yytext);
        }
        /* Brace-delimited text that stays on one line: OCCount accumulates
           the number of characters matched, not the number of matches. */
    "{"[^}\n]*"}"   {
        RCCount += strlen(yytext);
        OCCount += strlen(yytext);
        }
    
    %%
    /*
     * Entry point.  Scans the file named by the first command-line argument,
     * or standard input when no argument is given, then prints every counter.
     *
     * Returns 0 on success and 1 when the named file cannot be opened.
     */
    int main(int argc, char **argv){
        if (argc > 1) {
            yyin = fopen(argv[1], "r");
            if (yyin == NULL) {
                /* A NULL yyin makes the flex scanner read stdin instead,
                   which would hide the failure; report it and stop. */
                perror(argv[1]);
                return 1;
            }
        } else {
            yyin = stdin;
        }

        yylex();

        /* Release the input file once scanning is finished. */
        if (yyin != NULL && yyin != stdin)
            fclose(yyin);

        printf("Character count: %d\n", NumberOfChar);
        printf("Number count:    %d\n", NumberOfIntegers);
        printf("Keyword count:   %d\n", KWCount);
        printf("Line count:      %d\n", NumberOfLines);
        printf("ID count:        %d\n", IDCount);
        printf("Dot count:       %d\n", DTCount);
        printf("Raw count:       %d\n", RCCount);
        printf("Float count:     %d\n", FLCount);
        printf("Other count:     %d\n", OCCount);
        printf("\n");
        return 0;
    }
    

    When run on the data file:

    /*commEnt 1*/
    /*COMMENT
      commEnt 2 
      */
      /*commEnt 3*
       */if this is a linE
    {commEnt 4}
    int i = 789; 
    int j = 689;
    if i == 172 thEn
     {commEnt 5}
    ElsE
    {commEnt 6}
    {commEnt 7}
    float 12.34
    /*8 commEnts
     *
     else
     return
     while
     the
     going
     is
     good
     */ 
    end
    

    I get the output:

     '/' '*'
    An identifier: comm
     'E'
    An identifier: nt
     ' ' '1' '*' '/' '/' '*' 'C' 'O' 'M' 'M' 'E' 'N' 'T' ' ' ' '
    An identifier: comm
     'E'
    An identifier: nt
     ' ' '2' ' ' ' ' ' ' '*' '/' ' ' ' ' '/' '*'
    An identifier: comm
     'E'
    An identifier: nt
     ' ' '3' '*' ' ' ' ' ' ' '*' '/'
    A keyword: if
     ' '
    An identifier: this
     ' '
    An identifier: is
     ' ' 'a' ' '
    An identifier: lin
     'E'
    An identifier: int
     ' ' 'i' ' ' '=' ' ' ';' ' '
    An identifier: int
     ' ' 'j' ' ' '=' ' ' ';'
    A keyword: if
     ' ' 'i' ' ' '=' '=' ' ' ' '
    An identifier: th
     'E' 'n' ' ' 'E'
    An identifier: ls
     'E'
    An identifier: float
     ' '
    A float: 12.34 (12.34) 
     '/' '*' '8' ' '
    An identifier: comm
     'E'
    An identifier: nts
     ' ' '*' ' '
    A keyword: else
     ' '
    A keyword: return
     ' '
    A keyword: while
     ' '
    An identifier: the
     ' '
    An identifier: going
     ' '
    An identifier: is
     ' '
    An identifier: good
     ' ' '*' '/' ' '
    An identifier: end
    Character count: 115
    Number count:    3
    Keyword count:   5
    Line count:      26
    ID count:        21
    Dot count:       89
    Raw count:       258
    Float count:     1
    Other count:     44
    

    The output from wc is:

    $ wc data.2
          26      49     258 data.2
    $
    

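    The 'raw count' of characters matches the character count from wc; the line count matches too. The number of integers, floats, keywords and identifiers all looks correct, given that upper case letters are counted in the 'dot characters'. You can work out whether there are other problems; I think that the count of integers is wrong, but I'm not sure why. (A likely cause: flex prefers the longest match and breaks ties in favour of the rule listed first, so a single digit such as the '1' in the first comment, or a single-letter identifier such as 'i', is claimed by the earlier `.` rule rather than by `{DIGIT}` or `{ID}`; the digits inside `{...}` are swallowed whole by the brace-comment rule.)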