Search code examples
parsing, scala, parser-combinators

Scala parser combinators: parse by lines


Using the Scala parser combinators library, is it possible to parse by lines rather than by characters? I would like my custom parsers to have the signature

(in: Reader[String]) => ParseResult[T]

so that in.first would be a String holding one line of text, and in.rest would be the remaining lines.

I assume I need to subclass Parsers and define the Elem type as String, then implement a Reader[String] that splits the input into lines. Is this the right approach? Are there other solutions?


Solution

  • The following code does that:

    import scala.util.parsing.combinator.Parsers
    import scala.util.parsing.input.{Position, Reader}
    
    /** Demo entry point: runs CVParser over a fixed three-line sample input. */
    object Main {
      def main(args: Array[String]): Unit = {
        val sample = "one\ntwo\nthree"
        // The parse result is not needed here; the sample parsers print each line they consume.
        CVParser(sample)
      }
    }
    
    /** Demonstrates line-oriented parsing with parser combinators: every parser element is one
      * whole line of the input instead of a single character.
      */
    object CVParser {
      /** Parses `in` line by line with the sample grammar.
        *
        * @param in the text to parse; lines may be separated by "\n" or "\r\n"
        * @return the `ParseResult` produced by the grammar (typed as `Any` because the result
        *         type belongs to the private parser implementation)
        */
      def apply(in: String): Any = impl.parse(in)

      private object impl extends Parsers {
        // One element of the input stream is a full line of text.
        override type Elem = String

        /** Splits `in` into lines and runs the three-line sample grammar over them. */
        def parse(in: String): ParseResult[String~String~String] = {
          // Split on "\n" or "\r\n" independently of the platform: System.lineSeparator is
          // "\r\n" on Windows, which would leave "\n"-separated input as a single line.
          val reader = new ByLineReader(in.split("\\r?\\n").toList, 1)
          grammar(reader)
        }

        // Sample grammar: exactly three consecutive lines.
        private val grammar = SampleParser ~ SampleParser ~ SampleParser

        /** Reader whose elements are lines.
          *
          * @param lines  the lines not yet consumed
          * @param lineNo 1-based number of the line returned by `first`
          */
        private class ByLineReader(lines: List[String], lineNo: Int) extends Reader[String] {
          // An exhausted reader yields the empty string rather than throwing.
          override def first: String = lines.headOption getOrElse ""

          // Advance by one line; at end of input the reader returns itself.
          override def rest: Reader[String] = lines match {
            case _ :: remaining => new ByLineReader(remaining, lineNo + 1)
            case Nil            => this
          }

          override def pos: Position = new Position {
            def line: Int = lineNo
            // Position columns are 1-based; a line-level element starts at the first column.
            def column: Int = 1
            def lineContents: String = first
          }

          // End of input is reached only when no lines remain (not when one line is left).
          override def atEnd: Boolean = lines.isEmpty
        }

        /** Sample parser: prints the current line, then succeeds with it and consumes it. */
        private object SampleParser extends Parser[String] {
          def apply(in: Input): ParseResult[String] = {
            println(in.first + "\n---------")
            Success(in.first, in.rest)
          }
        }
      }
    }
    

    Outputs:

    one
    ---------
    two
    ---------
    three
    ---------
    

    New parsers that accept a line of text can now be written and combined using the usual parser combinators, such as ~, ~>, <~, |, rep, etc.