Search code examples
f#fparsec

Use FParsec to parse a self-describing input


I'm using FParsec to parse an input that describes its own format. For example, consider this input:

int,str,int:4,'hello',3

The first part of the input (before the colon) describes the format of the second part of the input. In this case, the format is int, str, int, which means that the actual data consists of three comma-separated values of the given types, so the result should be 4, "hello", 3.

What is the best way to parse something like this with FParsec?

I've pasted my best effort below, but I'm not happy with it. Is there a better way to do this that is cleaner, less stateful, and less reliant on the parse monad? I think this depends on smarter management of UserState, but I don't know how to do it. Thanks.

open FParsec

/// User state carried through the parse: the format names read from the
/// header, and the position of the next format to apply to the data.
type State =
    { Formats : string array
      Index : int }
    /// Initial state: no formats recorded yet, index at the start.
    static member Default = { Formats = Array.empty; Index = 0 }

/// A single value read from the data section of the input.
type Value =
    /// Produced for the "int" format.
    | Integer of int32
    /// Produced for the "str" format.
    | String of string

/// Parses the comma-separated format header ("int" / "str" entries) and
/// stores the resulting names in the user state's Formats field.
/// Yields unit; the formats are only available through the user state.
let parseFormat : Parser<_, State> =
    let formatName = pstring "int" <|> pstring "str"
    let comma = skipString ","
    sepBy formatName comma
    |>> Array.ofList
    >>= fun formats ->
        updateUserState (fun state -> { state with Formats = formats })

/// Returns the value parser for the given format name:
/// "int" maps pint32 into Integer, "str" reads the text between two
/// single quotes into String. Any other name raises via failwith.
let parseValue format =
    let quote = skipString "'"
    let notQuote c = c <> '\''
    match format with
    | "str" -> between quote quote (manySatisfy notQuote) |>> String
    | "int" -> pint32 |>> Integer
    | _ -> failwith "Unexpected"

/// Parses the next data value using the format found at the current Index
/// of the user state, and advances Index by one.
///
/// When Index does not address an entry of Formats (for example the data
/// contains more values than the header declared, or the header is empty),
/// this produces an ordinary parser failure instead of letting the array
/// access throw IndexOutOfRangeException out of the parser run.
let parseValueByState : Parser<Value, State> =
    getUserState >>= fun (state : State) ->
        if state.Index >= 0 && state.Index < state.Formats.Length then
            let format = state.Formats.[state.Index]
            // Advance the index first, then parse with the selected format.
            setUserState { state with Index = state.Index + 1 } >>. parseValue format
        else
            fail "More values than declared formats"

/// Parses the comma-separated data section, one value per declared format.
///
/// sepBy on its own stops quietly when the values run out, so an input
/// such as "int,str,int:4" would succeed with a single value. After the
/// list is read, the user state is checked and the parser fails unless
/// every declared format has been consumed.
let parseData =
    // Fails when fewer values were parsed than formats were declared.
    let allFormatsConsumed =
        getUserState >>= fun (state : State) ->
            if state.Index < state.Formats.Length then
                fail (sprintf "Expected %d value(s) but found only %d" state.Formats.Length state.Index)
            else
                preturn ()
    sepBy parseValueByState (skipString ",") .>> allFormatsConsumed

/// Complete input parser: format header, a ":" separator, then the data.
/// The result is whatever parseData yields; the header result is dropped.
let parse =
    let separator = skipString ":"
    (parseFormat >>. separator) >>. parseData

/// Runs the parser on a fixed sample input, starting from State.Default,
/// prints the parser result with "%A" and exits with code 0.
[<EntryPoint>]
let main argv =
    "int,str,int:4,'hello',3"
    |> runParserOnString parse State.Default ""
    |> printfn "%A"
    0

Solution

  • @bytebuster beat me to it, but I'll still post my solution. The technique is similar to @bytebuster's.

    Thanks for an interesting question.

    In compilers, I believe the preferred technique is to parse the text into an AST and then run a type-checker on it. For this example, a potentially simpler technique is to have the parser for the type definitions return a list of parsers for the values. These parsers are then applied to the rest of the string.

    open FParsec
    
    /// A single value read from the data part of the input.
    type Value =
      /// Produced by the parser selected for "int".
      | Integer of int32
      /// Produced by the parser selected for "str".
      | String of string
    
    /// Shorthand for a parser that yields a Value and carries no user state.
    type ValueParser = Parser<Value, unit>
    
    /// Value parser for integers: wraps the result of pint32 in Integer.
    let parseIntValue : ValueParser =
      pint32 |>> fun n -> Integer n
    
    /// Value parser for strings: reads the characters between two single
    /// quotes and wraps them in String. The quoted part is labelled
    /// "string" for error messages.
    let parseStringValue : ValueParser =
      let quote = skipChar '\''
      let contents = manySatisfy (fun ch -> ch <> '\'')
      (between quote quote contents <?> "string") |>> String
    
    /// Parses one type name and returns the value parser to use for it:
    /// "int" selects parseIntValue, "str" selects parseStringValue.
    let parseValueParser : Parser<ValueParser, unit> =
      let intFormat = skipString "int" >>% parseIntValue
      let strFormat = skipString "str" >>% parseStringValue
      intFormat <|> strFormat
    
    /// Parses the comma-separated list of type names (at least one, via
    /// sepBy1) and returns the corresponding value parsers in order.
    let parseValueParsers : Parser<ValueParser list, unit> =
        let comma = skipChar ','
        sepBy1 parseValueParser comma
    
    // Applies each parser in 'ps' in order, requiring 'sep' between
    // consecutive parsers (not before the first), and collects the results.
    // An empty 'ps' yields an empty list without consuming input.
    let sepByList (ps : Parser<'T, unit> list) (sep : Parser<unit, unit>) : Parser<'T list, unit> =
      let cons v vs = v :: vs
      // Every parser after the first is preceded by the separator.
      let rec remaining parsers =
        match parsers with
        | []           -> preturn []
        | p :: others  -> (sep >>. p) >>= fun v -> remaining others |>> cons v
      match ps with
      | []            -> preturn []
      | first :: rest -> first >>= fun v -> remaining rest |>> cons v
    
    /// Parses a whole self-describing line: the type list, a ':', then one
    /// comma-separated value per listed type, followed by end of input.
    let parseLine : Parser<Value list, unit> =
      let header = parseValueParsers .>> skipChar ':'
      let values vps = sepByList vps (skipChar ',')
      (header >>= values) .>> eof
    
    /// Runs parseLine on a fixed sample input, prints the parser result
    /// with "%A" and exits with code 0.
    [<EntryPoint>]
    let main argv =
        "int,str,int:4,'hello',3"
        |> run parseLine
        |> printfn "%A"
    
        0
    

    Parsing int,str,int:4,'hello',3 yields Success: [Integer 4; String "hello"; Integer 3].

    Parsing int,str,str:4,'hello',3 (incorrect) yields:

    Failure:
    Error in Ln: 1 Col: 23
    int,str,str:4,'hello',3
                          ^
    Expecting: string