Improve Performance for Facet Grid Plot on Big Data

I have several time series and need to draw a scatter plot for every pairwise combination of them. I already posted the code here, and with your help I figured out how to lay the whole thing out nicely using facet_grid() from the ggplot2 package.

The problem now is performance. The example below is fairly small; set n <- 50000 to reach the lower end of the data volume I need to handle. I think the most time-consuming part is generating the FACET data frame with all combinations, and in particular all the repetition this involves. The plot call itself also takes a very long time because of the huge number of rows I am passing in. nrow(FACET) is length(df) * length(df) * n, which is 5 million in my real case with n = 50000 and length(df) = 10.

library(tidyverse)
# Fix the RNG state so the simulated series are reproducible.
set.seed(214)

# Number of observations per series.
n <- 1000

# Six simulated series. Each one is the previous series plus uniform noise
# whose scale grows from 0.1 to 0.5; later columns refer to earlier ones.
df <- tibble(
  v1 = runif(n),
  v2 = v1 + 0.1 * runif(n),
  v3 = v2 + 0.2 * runif(n),
  v4 = v3 + 0.3 * runif(n),
  v5 = v4 + 0.4 * runif(n),
  v6 = v5 + 0.5 * runif(n)
)

# All ordered pairs (w1, w2) of column positions in `df`, including the
# diagonal. Each pair becomes one panel of the facet grid.
# seq_along() is used instead of 1:length(df) so an empty `df` yields no
# pairs rather than the positions c(1, 0).
C <- crossing(w1 = seq_along(df), w2 = seq_along(df))

# One long-format tibble per pair: the x values (a1), the y values (a2) and
# the names of the two source columns, which are used as facet labels.
# `[[` extracts each column as a plain vector, and names(df)[i] looks the
# name up directly, so no intermediate one-column tibble is created.
FACET_LIST <- lapply(seq_len(nrow(C)), function(i) {
  tibble(
    a1 = df[[C$w1[i]]],
    a2 = df[[C$w2[i]]],
    name1 = names(df)[C$w1[i]],
    name2 = names(df)[C$w2[i]]
  )
})

# bind_rows() stacks the whole list in a single pass; it is the dplyr
# replacement for do.call(rbind.data.frame, ...) and avoids the base
# data-frame method's per-piece overhead.
FACET <- bind_rows(FACET_LIST)

# Factors keep the facets in column order (order of first appearance)
# instead of alphabetical order.
FACET$name1 <- as_factor(FACET$name1)
FACET$name2 <- as_factor(FACET$name2)

# One row per facet panel: every (name1, name2) combination of the column
# names of `df`, with name1 varying slowest.
dat_text <- tibble(
  name1 = rep(names(df), each = ncol(df)),
  name2 = rep(names(df), times = ncol(df))
)

# Scatter plot of every column pair with a linear fit, one panel per
# (name1, name2) combination. Both layers share the same data and x/y
# mapping, so these are declared once in ggplot().
p <- ggplot(data = FACET, mapping = aes(a1, a2)) +
  geom_point(size = 0.5) +
  stat_smooth(method = "lm") +
  facet_grid(rows = vars(name1), cols = vars(name2)) +
  coord_fixed()
p

Is there a more efficient way to pass the required information to the facet_grid() plot? Or is there any other way to speed up my code?

Solution

So I've run a number of tests with n = 50000:

# Baseline: all points in a single panel, no facets and no smoother.
base <- system.time({
  p <- ggplot(data = FACET, mapping = aes(a1, a2)) +
    geom_point(size = 0.5)
  print(p)
})

# Points only, split into the name1 x name2 facet grid.
facet <- system.time({
  p <- ggplot(data = FACET, mapping = aes(a1, a2)) +
    geom_point(size = 0.5) +
    facet_grid(rows = vars(name1), cols = vars(name2)) +
    coord_fixed()
  print(p)
})

# Points plus linear fits in a single panel. The smoother is grouped by
# the name1/name2 interaction so that it estimates the same number of
# lines as the facetted variant does.
smooth <- system.time({
  p <- ggplot(data = FACET, mapping = aes(a1, a2)) +
    geom_point(size = 0.5) +
    stat_smooth(aes(group = interaction(name1, name2)), method = "lm")
  print(p)
})

# The full plot from the question: points, linear fits and the facet grid.
# `p` is assigned in the calling environment and is reused by the
# ggplot_build(p) timing further down.
smooth_facet <- system.time({
  p <- ggplot(data = FACET, mapping = aes(a1, a2)) +
    geom_point(size = 0.5) +
    stat_smooth(method = "lm") +
    facet_grid(rows = vars(name1), cols = vars(name2)) +
    coord_fixed()
  print(p)
})

# Stage 1 of printing, timed on its own: build the plot object `p` into
# its built representation `pp`.
building <- system.time(pp <- ggplot_build(p))

# Stage 2 of printing, timed on its own: turn the already-built plot `pp`
# into a grob table that grid can draw.
# ggplot_gtable(pp) is used rather than ggplotGrob(pp$plot): ggplotGrob()
# starts from the unbuilt plot and runs the build step again, which would
# count the `building` cost a second time in this measurement.
interpreting <- system.time({
  ppp <- ggplot_gtable(pp)
})

library(grid)
# Stage 3 of printing, timed on its own: start a fresh page on the
# graphics device and draw the prepared grob table `ppp`.
drawing <- system.time({
  grid.newpage()
  grid.draw(ppp)
})

# Same plot as the full facetted version, but the points are drawn with
# shape = "." instead of the default point shape.
alternative <- system.time({
  g <- ggplot(data = FACET, mapping = aes(a1, a2)) +
    geom_point(size = 0.5, shape = ".") +
    stat_smooth(method = "lm") +
    facet_grid(rows = vars(name1), cols = vars(name2)) +
    coord_fixed()
  print(g)
})

These were the results:

# Stack the system.time() results row-wise, one row per experiment.
rbind(base, facet, smooth, smooth_facet, building, interpreting, drawing, alternative)

             user.self sys.self elapsed user.child sys.child
base              8.34    30.96   39.44         NA        NA
facet             8.56    30.48   39.12         NA        NA
smooth           10.00    31.14   41.18         NA        NA
smooth_facet     10.14    31.50   41.73         NA        NA
building          2.59     0.42    3.03         NA        NA
interpreting      5.08     0.61    5.76         NA        NA
drawing           5.13    30.23   35.39         NA        NA
alternative       7.58     8.23   15.86         NA        NA

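This suggests to me that ggplot's own code is not what is slow: building and interpreting together take under 10 seconds, whereas drawing alone accounts for about 35 of the roughly 40 seconds. The bottleneck is either the drawing code or simply the fact that you have to draw so many points.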

However, it seems you could cut the time by more than half by using shape = "." in the geom_point() call instead of the default round points (as in the 'alternative' test). You are likely overplotting points anyway. Here is how that looks: