First off, `drake` is just magical. I love the workflow of designing the dependency graph, and then executing it in one fell swoop.
However, I ran into a roadblock.
My workflow is simulating over large parameter grids, and then summarizing different slices of that grid. I'd like to create a plot for every such slice. If I understand this correctly, I should use some form of `cross` -> `combine` -> `map` to achieve this.
Here is what I have:
#' Simulate one batch of normally distributed draws
#'
#' @param mean Mean of the normal distribution to sample from.
#' @param sd Standard deviation of the normal distribution to sample from.
#' @return A one-column tibble (column `r`) holding 1000 random draws.
sim_data <- function(mean, sd) {
  draws <- rnorm(n = 1000, mean = mean, sd = sd)
  tibble(r = draws)
}
#' Plot a histogram of simulated draws and save it as a PDF
#'
#' @param lg Data frame with the columns `r` (plotted on the x axis) and
#'   `sd` (mapped to the fill colour).
#' @param title Label used in the plot title ("x = <title>") and as the stem
#'   of the output file name ("<title>.pdf").
#' @return The ggplot object, so the caller can keep it as well as the file.
plot_dis <- function(lg, title) {
  p <- ggplot(lg) +
    geom_histogram(aes(x = r, fill = sd), binwidth = 0.25) +
    labs(title = str_glue("x = {title}"))
  # Save explicitly instead of chaining ggsave() onto the plot with `+`:
  # ggsave() is not a plot layer, and without `plot =` it writes whatever
  # last_plot() happens to be rather than the plot built here.
  ggsave(str_glue("{title}.pdf"), plot = p) # side-effect
  p
}
# Workflow plan: simulate every (x, sd) combination, bind the simulations
# that share an x, then draw one plot per x.
plan <- drake_plan(
  # One simulated data set per combination of x and sd.
  data = target(
    sim_data(mean = x, sd = sd),
    transform = cross(x = c(10, 20, 30), sd = c(1, 2))
  ), # awesome
  # One combined data set per value of x.
  s_x = target(
    bind_rows(data, .id = "sd"),
    transform = combine(data, .by = x)
  ), # great
  # One plot per combined data set.
  plot = target(
    plot_dis(s_x, x),
    transform = map(s_x)
  ) # how to add a `file_out` to this target?
)
So my `plot` target has a side effect of saving the plot.
Is there a better way to do this? Like a proper `file_out()` for the `plot` target?
Thank you.
Great question. Thinking about this actually helps me iron out some issues with `drake` + `keras`.
`file_out()`s
You're almost there: all you need is some tidy evaluation (`!!`) to make sure each file name is a literal string in the plan.
library(drake)
# Same plan as in the question, with the output file of each plot declared
# through file_out(). `!!` evaluates sprintf() while the plan is built, so
# every generated command contains a literal file name.
drake_plan(
  data = target(
    sim_data(mean = x, sd = sd),
    transform = cross(x = c(10, 20, 30), sd = c(1, 2))
  ),
  s_x = target(
    bind_rows(data, .id = "sd"),
    transform = combine(data, .by = x)
  ),
  plot = target(
    # NOTE: plot_dis() has to write exactly the path named in file_out().
    # The plot_dis() from the question appends ".pdf" to its second
    # argument, so it must be adapted to use the path as given.
    plot_dis(s_x, file_out(!!sprintf("%s.pdf", x))),
    transform = map(s_x)
  )
)
#> # A tibble: 12 x 2
#> target command
#> <chr> <expr>
#> 1 data_10_1 sim_data(mean = 10, sd = 1)
#> 2 data_20_1 sim_data(mean = 20, sd = 1)
#> 3 data_30_1 sim_data(mean = 30, sd = 1)
#> 4 data_10_2 sim_data(mean = 10, sd = 2)
#> 5 data_20_2 sim_data(mean = 20, sd = 2)
#> 6 data_30_2 sim_data(mean = 30, sd = 2)
#> 7 s_x_10 bind_rows(data_10_1, data_10_2, .id = "sd")
#> 8 s_x_20 bind_rows(data_20_1, data_20_2, .id = "sd")
#> 9 s_x_30 bind_rows(data_30_1, data_30_2, .id = "sd")
#> 10 plot_s_x_10 plot_dis(s_x_10, file_out("10.pdf"))
#> 11 plot_s_x_20 plot_dis(s_x_20, file_out("20.pdf"))
#> 12 plot_s_x_30 plot_dis(s_x_30, file_out("30.pdf"))
Created on 2019-03-26 by the reprex package (v0.2.1)
And with a little more metaprogramming, you can use entire target names instead.
library(drake)
# Variant that names each output file after the upstream target:
# deparse(substitute(s_x)) turns the substituted target symbol into a
# string, and `!!` evaluates the sprintf() call while the plan is built.
drake_plan(
  data = target(
    sim_data(mean = x, sd = sd),
    transform = cross(x = c(10, 20, 30), sd = c(1, 2))
  ),
  s_x = target(
    bind_rows(data, .id = "sd"),
    transform = combine(data, .by = x)
  ),
  plot = target(
    # NOTE: plot_dis() has to write exactly the path named in file_out().
    # The plot_dis() from the question appends ".pdf" to its second
    # argument, so it must be adapted to use the path as given.
    plot_dis(s_x, file_out(!!sprintf("%s.pdf", deparse(substitute(s_x))))),
    transform = map(s_x)
  )
)
#> # A tibble: 12 x 2
#> target command
#> <chr> <expr>
#> 1 data_10_1 sim_data(mean = 10, sd = 1)
#> 2 data_20_1 sim_data(mean = 20, sd = 1)
#> 3 data_30_1 sim_data(mean = 30, sd = 1)
#> 4 data_10_2 sim_data(mean = 10, sd = 2)
#> 5 data_20_2 sim_data(mean = 20, sd = 2)
#> 6 data_30_2 sim_data(mean = 30, sd = 2)
#> 7 s_x_10 bind_rows(data_10_1, data_10_2, .id = "sd")
#> 8 s_x_20 bind_rows(data_20_1, data_20_2, .id = "sd")
#> 9 s_x_30 bind_rows(data_30_1, data_30_2, .id = "sd")
#> 10 plot_s_x_10 plot_dis(s_x_10, file_out("s_x_10.pdf"))
#> 11 plot_s_x_20 plot_dis(s_x_20, file_out("s_x_20.pdf"))
#> 12 plot_s_x_30 plot_dis(s_x_30, file_out("s_x_30.pdf"))
Created on 2019-03-26 by the reprex package (v0.2.1)
`ggplot2` objects play nicely with `drake`'s cache, so a target can simply return the plot instead of saving a file.
library(drake)
library(tidyverse)
#' Simulate one batch of normally distributed draws
#'
#' @param mean Mean of the normal distribution to sample from.
#' @param sd Standard deviation of the normal distribution to sample from.
#' @return A one-column tibble (column `r`) holding 1000 random draws.
sim_data <- function(mean, sd) {
  draws <- rnorm(n = 1000, mean = mean, sd = sd)
  tibble(r = draws)
}
#' Histogram of simulated draws, titled with the caller's argument expression
#'
#' @param lg Data frame with the columns `r` (plotted on the x axis) and
#'   `sd` (mapped to the fill colour).
#' @return A ggplot object; this function writes nothing to disk.
plot_dis <- function(lg) {
  # Capture the unevaluated argument as text (e.g. a target name) for the
  # plot title.
  plot_title <- deparse(substitute(lg))
  histogram <- geom_histogram(aes(x = r, fill = sd), binwidth = 0.25)
  ggplot(lg) + histogram + labs(title = plot_title)
}
# Workflow plan without file outputs: each plot target returns its ggplot
# object as the target value.
plan <- drake_plan(
  # One simulated data set per combination of x and sd.
  data = target(
    sim_data(mean = x, sd = sd),
    transform = cross(x = c(10, 20, 30), sd = c(1, 2))
  ),
  # One combined data set per value of x.
  s_x = target(
    bind_rows(data, .id = "sd"),
    transform = combine(data, .by = x)
  ),
  # One plot per combined data set.
  plot = target(
    plot_dis(s_x),
    transform = map(s_x)
  )
)
# Run the plan; drake reports each target as it is built.
make(plan)
#> target data_10_1
#> target data_10_2
#> target data_20_1
#> target data_20_2
#> target data_30_2
#> target data_30_1
#> target s_x_10
#> target s_x_20
#> target s_x_30
#> target plot_s_x_10
#> target plot_s_x_20
#> target plot_s_x_30
# Read the stored value of one plot target back from the cache.
readd(plot_s_x_10) # see also loadd()
Created on 2019-03-26 by the reprex package (v0.2.1)