
Create k-fold cross-validation test and training sets with Deedle


Let's say I have a bunch of orders for each month; an order can also span several months. I make some predictions and want to validate them with k-fold cross-validation, so my goal is to create K test and training sets.

I group the rows by order id, split the group indexes into test and training sets, and collect the rows based on those indexes.

The solution I came up with is slow because Series.filter runs a Seq.contains lookup for every order id. Does somebody know a more efficient way to do this in F#?

Simplified example:

#r "nuget: Deedle"
#r "nuget: FSharp.Stats"

open Deedle

type Order = 
  { OrderId:string; Month:int; Amount:int }

let OrderRecds = 
  [ { OrderId = "I1"; Month = 1; Amount = 100}
    { OrderId = "I2"; Month = 1; Amount = 200}
    { OrderId = "I3"; Month = 1; Amount = 300}
    { OrderId = "I4"; Month = 1; Amount = 400}
    { OrderId = "I5"; Month = 1; Amount = 500}
    { OrderId = "I6"; Month = 1; Amount = 600}
    { OrderId = "I1"; Month = 2; Amount = 100}
    { OrderId = "I2"; Month = 2; Amount = 200}
    { OrderId = "I3"; Month = 2; Amount = 300}
    { OrderId = "I4"; Month = 2; Amount = 400}
    { OrderId = "I5"; Month = 2; Amount = 500}
    { OrderId = "I6"; Month = 2; Amount = 600}
    { OrderId = "I1"; Month = 3; Amount = 100}
    { OrderId = "I2"; Month = 3; Amount = 200}
    { OrderId = "I3"; Month = 3; Amount = 300}
    { OrderId = "I4"; Month = 3; Amount = 400}
    { OrderId = "I5"; Month = 3; Amount = 500}
    { OrderId = "I6"; Month = 3; Amount = 600}
  ]

let df_order = OrderRecds |> Frame.ofRecords

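// Group the rows by OrderId, nest each group into its own sub-frame, and re-index the groups ordinally (0 .. n-1)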
let order_ids_series = 
    df_order 
    |> Frame.groupRowsByString "OrderId" 
    |> Frame.nest
    |> Series.indexOrdinally

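// Shuffle the group indexes and split them into NUM_FOLDS chunks; each chunk is one test fold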
let NUM_FOLDS = 3
let n = order_ids_series |> Series.countKeys
let chunkSize = int (ceil (float n / float NUM_FOLDS))
let chunkIndices =
    [|0 .. n-1|]
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Seq.chunkBySize chunkSize

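// Fold i trains on every chunk except chunk i; chunk i itself is the test fold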
let train_indexes =
    { 0 .. NUM_FOLDS - 1 }
    |> Seq.map (fun i -> chunkIndices |> Seq.removeAt i |> Seq.concat)

let test_indexes = chunkIndices

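// Slow part: each fold filters the nested series with a linear Seq.contains per key, then unnests back into a flat frame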
let train_data = 
    train_indexes 
    |> Seq.map (fun indexes -> order_ids_series |> Series.filter (fun k _ -> Seq.contains k indexes) ) 
    |> Seq.map Frame.unnest 
    |> Seq.map (Frame.mapRowKeys snd)
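
To sanity-check the folds, something like the following (a quick sketch against the snippet above; RowCount is Deedle's row count on a frame) prints the size of each training frame:

train_data |> Seq.iteri (fun i df -> printfn "Fold %d: %d rows" i df.RowCount)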

Solution

  • One concern I had with another answer was that Frame.unnest or Frame.mergeAll takes a lot of time. This solution instead builds a map of each fold's order ids and filters the already-grouped frame with Frame.filterRows. On my dataset it is roughly 60 times faster.

    let df_order_grouped = 
        df_order 
        |> Frame.groupRowsByString "OrderId"
    
    let order_ids_series = 
        df_order_grouped
        |> Frame.nest
    
    let order_ids = order_ids_series.Keys
    
    let NUM_FOLDS = 3
    let n = order_ids_series |> Series.countKeys
    let chunkSize = int (ceil (float n / float NUM_FOLDS))
    let chunkIndices =
        order_ids
        |> Array.ofSeq
        |> FSharp.Stats.Array.shuffleFisherYates
        |> Seq.chunkBySize chunkSize
    
    let train_indexes =
        { 0 .. NUM_FOLDS - 1 }
        |> Seq.map (fun i -> chunkIndices |> Seq.removeAt i |> Seq.concat)
    
    let test_indexes = chunkIndices
    
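    // A Map gives a cheap membership test (ContainsKey) instead of scanning a sequence with Seq.contains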
    let toMap (indexes: seq<string>) = 
        indexes 
        |> Seq.map (fun i -> i,true) 
        |> Map.ofSeq
    
    let filter (indexes:Map<string,bool>) (df:Frame<string*int,string>) = 
        df 
        |> Frame.filterRows (fun k _ -> indexes.ContainsKey (fst k))
    
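    // Filter the already-grouped frame directly; no Frame.unnest or Frame.mergeAll needed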
    let train_data:seq<Frame<string*int,string>> =
        train_indexes
        |> Seq.map (fun indexes -> filter (indexes |> toMap) df_order_grouped)
    
    train_data |> Seq.iter (fun df -> df.Print())
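
    The matching test folds can be built with the same helper; a minimal sketch, assuming test fold i is meant to pair with training fold i (both come from the same shuffled chunkIndices):

    let test_data:seq<Frame<string*int,string>> =
        test_indexes
        |> Seq.map (fun indexes -> filter (indexes |> toMap) df_order_grouped)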