Search code examples
parsinghaskellparsecmegaparsec

Indentation using Megaparsec


I would like to parse a basic indented language using Megaparsec. Originally I was using Parsec which I managed to get working correctly with indentation but now I'm having quite some trouble.

I've been following a tutorial here and here's the code I have to parse a language ignoring indentation.

module Parser where

import           Data.Functor                  ((<$>), (<$))
import           Control.Applicative           (Applicative(..))
import qualified Control.Monad                 as M
import Control.Monad (void)
import           Data.Functor.Identity
import           Data.Text                     (Text)
import qualified Data.Text                     as Text

import Data.Void

import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Megaparsec.Perm
import Text.Megaparsec.Expr
import qualified Text.Megaparsec.Char.Lexer as L

import Text.Pretty.Simple
import Data.Either.Unwrap

--import Lexer
import Syntax

type Parser = Parsec Void String

lineComment :: Parser ()
lineComment = L.skipLineComment "#"

scn :: Parser ()
scn = L.space space1 lineComment empty

sc :: Parser () -- ‘sc’ stands for “space consumer”
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

symbol :: String -> Parser String
symbol = L.symbol sc

integer :: Parser Integer
integer = lexeme L.decimal


semi :: Parser String
semi = symbol ";"

rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)

rws :: [String] -- list of reserved words
rws = ["if","then","else","while","do","skip","true","false","not","and","or"]

identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x


parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")


whileParser :: Parser Stmt
whileParser = between sc eof stmt

stmt :: Parser Stmt
stmt = f <$> sepBy1 stmt' semi
  where
    -- if there's only one stmt return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l

stmt' :: Parser Stmt
stmt' = ifStmt
  <|> whileStmt
  <|> skipStmt
  <|> assignStmt
  <|> parens stmt

ifStmt :: Parser Stmt
ifStmt = do
    rword "if"
    cond  <- bExpr
    rword "then"
    stmt1 <- stmt
    rword "else"
    stmt2 <- stmt
    return (If cond stmt1 stmt2)

whileStmt :: Parser Stmt
whileStmt = do
  rword "while"
  cond <- bExpr
  rword "do"
  stmt1 <- stmt
  return (While cond stmt1)

assignStmt :: Parser Stmt
assignStmt = do
  var  <- identifier
  void (symbol ":=")
  expr <- aExpr
  return (Assign var expr)

skipStmt :: Parser Stmt
skipStmt = Skip <$ rword "skip"

aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators

bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators

aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]

bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]

aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer

bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr

rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)

relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)

parsePrint :: String -> IO()
parsePrint s = do
    parseTest stmt' s

Running this parses correctly.

parsePrint $ unlines
[ "while (true) do if(false) then x := 5 else y := 20"
]

This is the code for parsing indentation from the second tutorial here.

{-# LANGUAGE TupleSections #-}

module Main where

import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L

type Parser = Parsec Void String

lineComment :: Parser ()
lineComment = L.skipLineComment "#"

scn :: Parser ()
scn = L.space space1 lineComment empty

sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

pItem :: Parser String
pItem = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'

pComplexItem :: Parser (String, [String])
pComplexItem = L.indentBlock scn p
  where
    p = do
      header <- pItem
      return (L.IndentMany Nothing (return . (header, )) pLineFold)

pLineFold :: Parser String
pLineFold = L.lineFold scn $ \sc' ->
  let ps = takeWhile1P Nothing f `sepBy1` try sc'
      f x = isAlphaNum x || x == '-'
  in unwords <$> ps <* sc

pItemList :: Parser (String, [(String, [String])])
pItemList = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      header <- pItem
      return (L.IndentSome Nothing (return . (header, )) pComplexItem)

parser :: Parser (String, [(String, [String])])
parser = pItemList <* eof

main :: IO ()
main = return ()

I would like as an example for this to parse correctly.

parsePrint $ unlines
[ "while (true) do" 
, "    if(false) then x := 5 else y := 20"
]

How could I parse indentation correctly? Also are there any other places with tutorials/documentation on using Megaparsec?


Solution

  • After spending a lot of time on this over the last couple of weeks I managed to work it out. It was a matter of moving from using strings to using my own "Expr" data type.
    For anybody else who would like to start writing an indented language this code could be a good start!

    Parser

    {-# LANGUAGE TupleSections #-}
    
    module IndentTest where
    
    import Control.Applicative (empty)
    import Control.Monad (void)
    import Data.Void
    import Data.Char (isAlphaNum)
    import Text.Megaparsec
    import Text.Megaparsec.Char
    import qualified Text.Megaparsec.Char.Lexer as L
    import Text.Megaparsec.Expr
    
    import Block
    
    type Parser = Parsec Void String
    
    -- Tokens
    
    
    lineComment :: Parser ()
    lineComment = L.skipLineComment "#"
    
    
    scn :: Parser ()
    scn = L.space space1 lineComment empty
    
    
    sc :: Parser ()
    sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
      where
        f x = x == ' ' || x == '\t'
    
    
    symbol :: String -> Parser String
    symbol = L.symbol sc
    
    
    rword :: String -> Parser ()
    rword w = lexeme (string w *> notFollowedBy alphaNumChar)
    
    
    rws :: [String] -- list of reserved words
    rws = ["module", "println", "import",  "let", "if","then","else","while","do","skip","true","false","not","and","or"]
    
    
    word :: Parser String
    word = (lexeme . try) (p >>= check)
      where
        p       = (:) <$> alphaNumChar <*> many alphaNumChar
        check x = if x `elem` rws
                    then fail $ "keyword " ++ show x ++ " cannot be an word"
                    else return x
    
    lexeme :: Parser a -> Parser a
    lexeme = L.lexeme sc
    
    
    integer :: Parser Integer
    integer = lexeme L.decimal
    
    
    parens :: Parser a -> Parser a
    parens = between (symbol "(") (symbol ")")
    
    
    aTerm :: Parser AExpr
    aTerm = parens aExpr
      <|> Var      <$> identifier
      <|> IntConst <$> integer
    
    
    aOperators :: [[Operator Parser AExpr]]
    aOperators =
      [ [Prefix (Neg <$ symbol "-") ]
      , [ InfixL (ABinary Multiply <$ symbol "*")
        , InfixL (ABinary Divide   <$ symbol "/") ]
      , [ InfixL (ABinary Add      <$ symbol "+")
        , InfixL (ABinary Subtract <$ symbol "-") ]
      ]
    
    
    aExpr :: Parser AExpr
    aExpr = makeExprParser aTerm aOperators
    
    
    assignArith :: Parser Expr
    assignArith = do
      var  <- identifier
      symbol ":"
      vType <- valType
      symbol "="
      e <- aExpr
      return $ AssignArith vType var e
    
    
    bTerm :: Parser BExpr
    bTerm =  parens bExpr
      <|> (BoolConst True  <$ rword "true")
      <|> (BoolConst False <$ rword "false")
      <|> rExpr
    
    
    bOperators :: [[Operator Parser BExpr]]
    bOperators =
      [ [Prefix (Not <$ rword "not") ]
      , [InfixL (BBinary And <$ rword "and")
        , InfixL (BBinary Or <$ rword "or") ]
      ]
    
    
    bExpr :: Parser BExpr
    bExpr = makeExprParser bTerm bOperators
    
    
    rExpr :: Parser BExpr
    rExpr = do
      a1 <- aExpr
      op <- relation
      a2 <- aExpr
      return (RBinary op a1 a2)
    
    
    relation :: Parser RBinOp
    relation = (symbol ">" *> pure Greater)
      <|> (symbol "<" *> pure Less)
    
    
    identifier :: Parser String
    identifier = (lexeme . try) (p >>= check)
      where
        p       = (:) <$> letterChar <*> many alphaNumChar
        check x = if x `elem` rws
                    then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                    else return x
    
    
    stringLiteral :: Parser Expr
    stringLiteral = do
      value <- char '"' >> manyTill L.charLiteral (char '"')
      symbol ";"
      return $ StringLiteral value
    
    
    assignString :: Parser Expr
    assignString = do
      var  <- identifier
      symbol ":"
      vType <- valType
      symbol "="
      e <- stringLiteral
      return (AssignString vType var e)
    
    
    arrayDef :: Parser Expr
    arrayDef = do
      name <- identifier
      symbol ":"
    
      symbol "["
      arrType <- word
      symbol "]"
    
      symbol "="
      return $ ArrayDef arrType name
    
    arrayValues :: Parser Expr
    arrayValues = do
      symbol "["
      values <- many identifier
      symbol "]"
      return $ ArrayValues values
    
    arrayAssign :: Parser Expr
    arrayAssign = do
      def <- arrayDef
      values <- arrayValues
      return $ ArrayAssignment def values
    
    arrayElementSelect :: Parser Expr
    arrayElementSelect = do
      symbol "!!"
      elementNum <- word
      return $ ArrayElementSelect elementNum
    
    
    moduleParser :: Parser Expr
    moduleParser = L.nonIndented scn (L.indentBlock scn p)
      where
        p = do
          rword "module"
          name <- identifier
          return (L.IndentSome Nothing (return . (Module name)) expr')
    
    
    valType :: Parser Expr
    valType = do
        value <- identifier
        return $ Type value
    
    
    argumentType :: Parser Expr
    argumentType = do
        value <- identifier
        return $ ArgumentType value
    
    
    returnType :: Parser Expr
    returnType = do
        value <- identifier
        return $ ReturnType value
    
    
    argument :: Parser Expr
    argument = do
      value <- identifier
      return $ Argument value
    
    
    -- Function parser
    functionParser :: Parser Expr
    functionParser = L.indentBlock scn p
      where
        p = do
          name <- identifier
          symbol ":"
          argTypes <- some argumentType
          symbol "->"
          rType <- IndentTest.returnType
          nameDup <- L.lineFold scn $ \sp' ->
            (identifier) `sepBy1` try sp' <* scn
          args <- many argument
          symbol "="
          if(name == "main") then
              return (L.IndentMany Nothing (return . (MainFunction name argTypes args rType)) expr')
          else
              return (L.IndentMany Nothing (return . (Function name argTypes args rType)) expr')
    
    
    
    functionCallParser :: Parser Expr
    functionCallParser = do
      name <- identifier
      args <- parens $ many argument
      return $ FunctionCall name args
    
    
    printParser :: Parser Expr
    printParser = do
      rword "println"
      bodyArr <- identifier
      symbol ";"
      return $ Print bodyArr
    
    
    valueToken :: Parser String
    valueToken = lexeme (takeWhile1P Nothing f) <?> "list item"
      where
        f x = isAlphaNum x || x == '-'
    
    
    ifStmt :: Parser Expr
    ifStmt = L.indentBlock scn p
       where
         p = do
           rword "if"
           cond  <- bExpr
           return (L.IndentMany Nothing (return . (If cond)) expr')
    
    elseStmt :: Parser Expr
    elseStmt = L.indentBlock scn p
       where
         p = do
           rword "else"
           return (L.IndentMany Nothing (return . (Else)) expr')
    
    whereStmt :: Parser Expr
    whereStmt = do
      rword "where"
      symbol "{"
      exprs <- many expr
      symbol "}"
      return $ (Where exprs)
    
    
    expr :: Parser Expr
    expr = f <$> sepBy1 expr' (symbol ";")
      where
        -- if there's only one expr return it without using ‘Seq’
        f l = if length l == 1 then head l else Seq l
    
    
    expr' :: Parser Expr
    expr' = try moduleParser
      <|> try functionParser
      <|> try ifStmt
      <|> try elseStmt
      <|> try arrayAssign
      <|> arrayElementSelect
      <|> try assignArith
      <|> try functionCallParser
      <|> try assignString
      <|> try printParser
      <|> try whereStmt
      <|> try stringLiteral
    
    
    parser :: Parser Expr
    parser = expr'
    
    
    parseFromFile file = runParser expr file <$> readFile file
    
    
    parseString input =
      case parse expr' "" input of
        Left  e -> show e
        Right x -> show x
    
    
    parsePrint :: String -> IO()
    parsePrint s = parseTest' parser s
    

    Block/Expr - The AST consists of this

    module Block where
    
    import Data.List
    import Text.Show.Functions
    import Data.Char
    import Data.Maybe
    
    -- Boolean expressions
    data BExpr
      = BoolConst Bool
      | Not BExpr
      | BBinary BBinOp BExpr BExpr
      | RBinary RBinOp AExpr AExpr
    
    instance Show BExpr where
        show (BoolConst b) = lowerString $ show b
        show (Not n) = show n
        show (BBinary bbinop bExpr1 bExpr2) = show bExpr1 ++ " " ++ show bbinop ++ " " ++ show bExpr2
        show (RBinary rbinop aExpr1 aExpr2) = show aExpr1 ++ " " ++ show rbinop ++ " " ++ show aExpr2
    
    
    -- Boolean ops
    data BBinOp
      = And
      | Or
    
    instance Show BBinOp where
        show (And) = "&&"
        show (Or) = "||"
    
    -- R binary ops
    data RBinOp
      = Greater
      | Less
    
    instance Show RBinOp where
        show (Greater) = ">"
        show (Less) = "<"
    
    -- Arithmetic expressions
    data AExpr
      = Var String
      | IntConst Integer
      | Neg AExpr
      | ABinary ABinOp AExpr AExpr
      | Parenthesis AExpr
    
    instance Show AExpr where
        show (Var v) = v
        show (IntConst i) = show i
        show (Neg aExpr) = "-" ++ show aExpr
        show (ABinary aBinOp aExpr1 aExpr2) = show aExpr1 ++ " " ++ show aBinOp ++ " " ++ show aExpr2
        show (Parenthesis aExpr) = "(" ++ show aExpr ++ ")"
    
    -- Arithmetic ops
    data ABinOp
      = OpeningParenthesis
      | ClosingParenthesis
      | Add
      | Subtract
      | Multiply
      | Divide
    
    instance Show ABinOp where
        show (Add) = "+"
        show (Subtract) = "-"
        show (Multiply) = "*"
        show (Divide) = "/"
        show (OpeningParenthesis) = "("
        show (ClosingParenthesis) = ")"
    
    -- Statements
    data Expr
      = Seq [Expr]
      | Module String [Expr]
      | Import String String
      | MainFunction {name ::String, argTypes:: [Expr], args::[Expr], returnType::Expr, body::[Expr]}
      | Function String [Expr] [Expr] Expr [Expr]
      | FunctionCall String [Expr]
      | Type String
      | ValueType String
      | Argument String
      | ArgumentType String
      | ReturnType String
      | AssignArith Expr String AExpr
      | AssignString Expr String Expr
      | If BExpr [Expr]
      | Else [Expr]
      | While BExpr [Expr]
      | Print String
      | Return Expr
      | ArrayValues [String]
      | ArrayDef String String
      | ArrayAssignment Expr Expr
      | ArrayElementSelect String
      | Lambda String String
      | Where [Expr]
      | StringLiteral String
      | Skip
    
    instance Show Expr where
        show (Module name bodyArray) =
            -- Get the main function tree
    
            "public class " ++ name ++ "{\n" ++
                "public static void main(String[] args){\n" ++
                    name ++ " " ++ lowerString name ++ "= new " ++ name ++ "();\n" ++
                    intercalate "\n" (map (\mStatement -> if(isFunctionCall mStatement) then (lowerString name ++ "." ++ show mStatement) else show mStatement) (body ((filter (isMainFunction) bodyArray)!!0))) ++
                "}\n" ++
                getFunctionString bodyArray ++
            "}\n"
    
        show (Import directory moduleName) = "import " ++ directory ++ moduleName
        show (Function name argTypes args returnType body) = "public " ++ show returnType ++ " " ++ name ++ "("++ intercalate ", " (zipWith (\x y -> x ++ " " ++ y) (map show argTypes) (map show args)) ++"){\n" ++ intercalate "\n" (map show body) ++ "}"
        show (MainFunction name argTypes args returnType body) =
            intercalate "\n " $ map show body
        show (FunctionCall name exprs) = name ++ "(" ++ (intercalate ", " (map show exprs)) ++ ");"
        show (Type b) = b
        show (Argument b) = b
        show (ArgumentType b) = b
        show (ReturnType b) = b
        show (AssignArith vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
        show (AssignString vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
        show (If condition statement) = "if(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
        show (Else statement) = " else {\n" ++ intercalate "\n" (map show statement) ++ "}"
        show (While condition statement) = "while(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
        show (Skip) = "[skip]"
        show (Seq s) = "[seq]"
        show (Return expr) = "return " ++ show expr ++ ";"
        show (Print exprs) = "System.out.println(" ++ exprs ++ ");" --"System.out.println(" ++ intercalate " " (map show exprs) ++ ");"
        show (ArrayDef arrType name) = arrType ++ "[] " ++ name ++ "="
        show (ArrayValues exprs) = "{" ++ intercalate ", " exprs ++ "};"
        show (ArrayAssignment arr values) = show arr ++ show values
        show (ArrayElementSelect i) = "[" ++ i ++ "];"
        show (Lambda valName collectionName) = ""
        show (Where exprs) = intercalate "\n" (map show exprs)
        show (StringLiteral value) = "\"" ++ value ++ "\""
        show (_) = "<unknown>"
    
    lowerString str = [ toLower loweredString | loweredString <- str]
    
    extractMain :: Expr -> Maybe String
    extractMain (MainFunction m _ _ _ _) = Just m
    extractMain _ = Nothing
    
    extractFunctionCall :: Expr -> Maybe String
    extractFunctionCall (FunctionCall m _) = Just m
    extractFunctionCall _ = Nothing
    
    isMainFunction :: Expr -> Bool
    isMainFunction e = isJust $ extractMain e
    
    isFunctionCall :: Expr -> Bool
    isFunctionCall e = isJust $ extractFunctionCall e
    
    {--
    getInnerMainFunctionString :: [Expr] -> String -> String
    getInnerMainFunctionString e instanceName  = do
        if(isMainFunction (e!!0)) then
          show (e!!0)
        else
          getInnerMainFunctionString (drop 1 e) instanceName
    --}
    getFunctionString :: [Expr] -> String
    getFunctionString e = do
        if(isMainFunction (e!!0)) then
          ""
        else
          "" ++ show (e!!0) ++ getFunctionString (drop 1 e)
    

    Code Example

    module IndentationTest
        testFunction : int -> void
        testFunction x =
            if(x < 50)
                println x;
                nextX :int = x + 1 * 2 - 3 / 2 + 5
                testFunction (nextX)
            else
                last :int = 1000
                println last;
    
        main : String -> IO
        main args =
            x :int = 3
            y :int = 10
            z :int = 15
            arrTest:[int] = [x y z]
            println arrTest;
            testFunction (x)
            stringTest :String = "Helloworld";
    

    This will successfully parse the example code. Just pass it into the parsePrint function.