Let's say I have a bunch of orders for each month. An order can also extend over several months. I make some predictions and I want to validate them with k-fold cross-validation. My goal is to create K test and training sets.
I group my order ids, separate the indexes into test and training sets, and collect rows based on those indexes.
The solution I came up with is slow because for every order id it uses Series.filter with Seq.contains. Does somebody know a more efficient way to do it in F#?
Simplified example:
#r "nuget: Deedle"
#r "nuget: FSharp.Stats"
open Deedle
type Order =
    { OrderId: string; Month: int; Amount: int }

let OrderRecds =
    [ { OrderId = "I1"; Month = 1; Amount = 100 }
      { OrderId = "I2"; Month = 1; Amount = 200 }
      { OrderId = "I3"; Month = 1; Amount = 300 }
      { OrderId = "I4"; Month = 1; Amount = 400 }
      { OrderId = "I5"; Month = 1; Amount = 500 }
      { OrderId = "I6"; Month = 1; Amount = 600 }
      { OrderId = "I1"; Month = 2; Amount = 100 }
      { OrderId = "I2"; Month = 2; Amount = 200 }
      { OrderId = "I3"; Month = 2; Amount = 300 }
      { OrderId = "I4"; Month = 2; Amount = 400 }
      { OrderId = "I5"; Month = 2; Amount = 500 }
      { OrderId = "I6"; Month = 2; Amount = 600 }
      { OrderId = "I1"; Month = 3; Amount = 100 }
      { OrderId = "I2"; Month = 3; Amount = 200 }
      { OrderId = "I3"; Month = 3; Amount = 300 }
      { OrderId = "I4"; Month = 3; Amount = 400 }
      { OrderId = "I5"; Month = 3; Amount = 500 }
      { OrderId = "I6"; Month = 3; Amount = 600 } ]
let df_order = OrderRecds |> Frame.ofRecords
// Group rows by order id and nest into a series of per-order frames,
// re-indexed 0 .. n-1 so folds can be drawn from integer indexes.
let order_ids_series =
    df_order
    |> Frame.groupRowsByString "OrderId"
    |> Frame.nest
    |> Series.indexOrdinally

let NUM_FOLDS = 3
let n = order_ids_series |> Series.countKeys
let chunkSize = int (ceil (float n / float NUM_FOLDS))

// Shuffle the indexes and split them into NUM_FOLDS chunks.
let chunkIndices =
    [| 0 .. n - 1 |]
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Seq.chunkBySize chunkSize

// For fold i, the training set is every chunk except chunk i;
// the held-out chunk is the test set.
let train_indexes =
    { 0 .. NUM_FOLDS - 1 }
    |> Seq.map (fun i -> chunkIndices |> Seq.removeAt i |> Seq.concat)
let test_indexes = chunkIndices

// Slow part: Series.filter visits every element of the series for every
// fold, and Seq.contains is itself a linear scan over the fold's indexes.
let train_data =
    train_indexes
    |> Seq.map (fun indexes -> order_ids_series |> Series.filter (fun k _ -> Seq.contains k indexes))
    |> Seq.map Frame.unnest
    |> Seq.map (Frame.mapRowKeys snd)
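One incremental fix, sketched here under the same bindings as above (train_data_set is a name introduced for illustration, not from the original): build an F# Set per fold so each membership test is a logarithmic lookup instead of a linear Seq.contains scan. The Frame.unnest cost remains, which the solution below avoids entirely.
// Sketch: same filtering as above, but with Set membership instead of
// Seq.contains. Still pays for Frame.unnest afterwards.
let train_data_set =
    train_indexes
    |> Seq.map (fun indexes ->
        let idxSet = Set.ofSeq indexes
        order_ids_series |> Series.filter (fun k _ -> idxSet.Contains k))
    |> Seq.map Frame.unnest
    |> Seq.map (Frame.mapRowKeys snd)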
One concern I had with another answer was that Frame.unnest or Frame.mergeAll takes a lot of time. This solution instead creates a map of the indexes and filters the grouped frame with Frame.filterRows, so no unnesting or merging is needed. On my dataset it is roughly 60 times faster.
let df_order_grouped =
    df_order
    |> Frame.groupRowsByString "OrderId"

// Nest only to enumerate the distinct order ids; the grouped frame itself
// is what gets filtered later.
let order_ids_series =
    df_order_grouped
    |> Frame.nest
let order_ids = order_ids_series.Keys

let NUM_FOLDS = 3
let n = order_ids_series |> Series.countKeys
let chunkSize = int (ceil (float n / float NUM_FOLDS))

// Shuffle the order ids themselves and split them into folds.
let chunkIndices =
    order_ids
    |> Array.ofSeq
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Seq.chunkBySize chunkSize

let train_indexes =
    { 0 .. NUM_FOLDS - 1 }
    |> Seq.map (fun i -> chunkIndices |> Seq.removeAt i |> Seq.concat)
let test_indexes = chunkIndices

// Turn a fold's ids into a Map so each membership test is a cheap lookup.
let toMap (indexes: seq<string>) =
    indexes
    |> Seq.map (fun i -> i, true)
    |> Map.ofSeq

// Filter the grouped frame directly on the first part of its row key
// (the order id); no Frame.unnest or Frame.mergeAll needed afterwards.
let filter (indexes: Map<string, bool>) (df: Frame<string * int, string>) =
    df
    |> Frame.filterRows (fun k _ -> indexes.ContainsKey (fst k))

let train_data: seq<Frame<string * int, string>> =
    train_indexes
    |> Seq.map (fun indexes -> filter (indexes |> toMap) df_order_grouped)

train_data |> Seq.iter (fun df -> df.Print())
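For completeness, the matching test sets come from test_indexes in exactly the same way (test_data is my naming, not from the original post):
let test_data: seq<Frame<string * int, string>> =
    test_indexes
    |> Seq.map (fun indexes -> filter (indexes |> toMap) df_order_grouped)
Note that a Set<string> would serve the same purpose as the Map<string,bool> here, since the bool value is never read; only the key lookup matters.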