I have the data "li" and I want to run the FP-Growth algorithm on it, but I don't know how to pass it to Spark:
# Make fake data: 10 transactions, each holding between 5 and 20 item labels.
set.seed(123)  # fixed seed so the example is reproducible
# Letters are drawn with replacement, so make.unique() renames repeated draws
# ("n", "n.1", ...) to keep every item within a transaction distinct.
li <- lapply(seq_len(10), function(i) {
  make.unique(letters[sample(1:26, sample(5:20, 1), replace = TRUE)])
})
require(sparklyr)
sc <- spark_connect(master = "local",version = "3.0.1")
df <- copy_to(sc, **....??????what should be here??????...** )
fp_growth_model <- ml_fpgrowth(df)
There is a similar answer here, but it doesn't work for me; I get the following error:
sc <- spark_connect(master = "local", version = "2.3")
tb <- tibble::tibble(items=c("a b c", "a b", "c f g", "b c"))
df <- copy_to(sc, tb) %>%
mutate(items = split(items, "\\\\s+"))
Error in mutate(., items = split(items, "\\\\s+")) :
could not find function "mutate"
/// plyr::mutate
df <- copy_to(sc, tb) %>%
plyr::mutate(items = split(items, "\\\\s+"))
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table tb already exists (pass overwrite = TRUE to overwrite)
/// SparkR::mutate
df <- copy_to(sc, tb) %>%
SparkR::mutate(items = split(items, "\\\\s+"))
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table tb already exists (pass overwrite = TRUE to overwrite)
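The code example from the mentioned answer works. You get two different errors: the first because mutate was not loaded (only sparklyr was attached, and mutate comes from dplyr), and the second because the table tb had already been copied into Spark by the first attempt, where copy_to succeeded before mutate failed.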
Try running the following code from a new session:
# Minimal FP-Growth example with sparklyr; run it from a fresh R session.
# tidyverse is attached so that mutate() and %>% are available.
library(tidyverse)
library(sparklyr)
# Connect to a local Spark instance.
sc <- spark_connect(master = "local")
# Each row is one transaction, with its items stored as a single
# space-separated string.
tb <- tibble::tibble(items=c("a b c", "a b", "c f g", "b c"))
# Copy the tibble to Spark, then turn each string into a list of items,
# splitting on whitespace.
# NOTE(review): split() here is presumably passed through to Spark rather than
# being base R's split(), which would explain the doubled escaping - confirm.
df <- copy_to(sc, tb) %>%
mutate(items = split(items, "\\\\s+"))
# Fit the FP-Growth model on the list column.
fp_growth_model <- ml_fpgrowth(df)
# Inspect the association rules and the frequent item sets of the fitted model.
ml_association_rules(fp_growth_model)
ml_freq_itemsets(fp_growth_model)
To execute FP-growth on your dataset li, you need to change its format.
The function ml_fpgrowth requires a Spark DataFrame with a column of lists containing the sequences. You cannot transfer an R data frame with list columns directly to Spark. Instead, first create a Spark DataFrame with each sequence stored as a single string, and then generate the lists with the mutate and split functions.
Here is the code applied to your data.
> tb_li <- tibble(items=sapply(li, function(x) paste(x, collapse=" ")))
> tb_li
# A tibble: 10 x 1
items
<chr>
1 o s n c j r v k e t n.1 v.1 y z e.1 s.1 y.1 y.2 i
2 c h z g j i s d n q k g.1 u l o j.1 m
3 i i.1 j w u g u.1 f y b e
4 l m r a y y.1 f u o i o.1 z
5 p t f k h v v.1 g p.1 q v.2 r q.1 b d m
6 v s y t v.1 y.1 n y.2 w
7 h p l y n c n.1
8 g c w v z o u e h s j r j.1 l b j.2 v.1
9 l t n q n.1 v c h n.2 s o x q.1 w k g o.1 w.1 z
10 n g j e f p x u w k
Transfer data to Spark and generate the lists:
> df_li <- copy_to(sc, tb_li, overwrite = TRUE) %>%
+ mutate(items = split(items, "\\\\s+"))
> df_li
# Source: spark<?> [?? x 1]
items
<list>
1 <list [19]>
2 <list [17]>
3 <list [11]>
4 <list [12]>
5 <list [16]>
6 <list [9]>
7 <list [7]>
8 <list [17]>
9 <list [19]>
10 <list [10]>
The data is now ready to be used by the model, as in the example above.
> fp_growth_model_li <- ml_fpgrowth(df_li)
> ml_association_rules(fp_growth_model_li)
# Source: spark<?> [?? x 4]
antecedent consequent confidence lift
<list> <list> <dbl> <dbl>
1 <list [4]> <list [1]> 1 2
2 <list [3]> <list [1]> 1 2
3 <list [3]> <list [1]> 1 2
4 <list [3]> <list [1]> 1 2
5 <list [5]> <list [1]> 1 2
6 <list [5]> <list [1]> 1 2
7 <list [3]> <list [1]> 1 2
8 <list [3]> <list [1]> 1 2
9 <list [3]> <list [1]> 1 2
10 <list [3]> <list [1]> 1 2
# ... with more rows
> ml_freq_itemsets(fp_growth_model_li)
# Source: spark<?> [?? x 2]
items freq
<list> <dbl>
1 <list [1]> 3
2 <list [2]> 3
3 <list [3]> 3
4 <list [2]> 3
5 <list [1]> 5
6 <list [2]> 3
7 <list [3]> 3
8 <list [3]> 3
9 <list [4]> 3
10 <list [2]> 4
# ... with more rows