Search code examples
rggplot2dplyrsample

Why does filtering to a sample of a dataset inside ggplot() return an incorrect sample?


The context is that I have many time series for many ids and many bands, and I have included a sample of nine ids and two bands. Here we can see that I can easily plot the time series for all the ids:

library(tidyverse)
df <- structure(list(id = c(1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L), date = structure(c(1488884400, 1490612474, 1507460497, 1502276146, 1514372627, 1512644789, 1500980863, 1503572707, 1513940711, 1496660730, 1495796861, 1512644789, 1488884400, 1504436115, 1502276146, 1495796118, 1494068453, 1504868786, 1513940711, 1511780307, 1511348810, 1503572707, 1497524848, 1507028336, 1491476744, 1503572707, 1492340161, 1501844755, 1505300762, 1503140790, 1509620381, 1488884400, 1487156167, 1510052273, 1491476744, 1494068453, 1513940711, 1489748810, 1498388749, 1509620381, 1500980120, 1511780307, 1502708860, 1489748810, 1501412778, 1504436115, 1495796861, 1493204748, 1510484382, 1487156167, 1508324436, 1500548201, 1513940711, 1505732183, 1490612474, 1496660730, 1511348810, 1514372627, 1494068453, 1510052273, 1500548201, 1513076347, 1508756553, 1510484382, 1504436858, 1504004193, 1494932749, 1508324436, 1512644789, 1504868786, 1507460497, 1504004193, 1503140790, 1500980120, 1512212632, 1491476744, 1513940711, 1508756553, 1504436115, 1490612474, 1495796861, 1509188631, 1508756553, 1486292805, 1504004193, 1498388749, 1495796861, 1486292805, 1513940711, 1499684790), class = c("POSIXct", "POSIXt"), tzone = "UTC"), band = c("fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5"), value = c(0.496538754230172, 0.503271496428091, 0.97387311299285, 0.580658673638122, 0.55924511798107, 0.832069876834949, 0.669456383223215, 1.12835570514478, 0.650077806710299, 0.380956367547047, 0.315803532869213, 0.792491389890908, 0.542150595815071, 1.03016500582205, 0.761751198659722, 0.367933240661702, 0.478285303617102, 1.68901870452092, 0.740965064159661, 1.09028738312622, 0.822334909416119, 0.758342181009204, 0.404208383270466, 0.892795714415756, 0.452540219822814, 1.15220190981348, 0.522093412373678, 0.953592910857701, 1.27850667816495, 1.10756222303339, 0.722797148902218, 0.465842402588039, 0.524130056243481, 0.724757971315511, 0.401849347220063, 0.455169211763473, 0.736683498842155, 0.530595901306756, 0.598435246507131, 0.855911625573028, 0.459872179640563, 0.851473466057886, 0.600348304937791, 0.484896112230185, 0.491357621589034, 1.21884821937325, 0.408355867626313, 0.541537217668289, 1.20173675518489, 0.61126928681528, 1.02122136799224, 0.489289990779144, 0.829092258901136, 0.88152853467569, 0.528559966420024, 0.544164467022259, 1.15093592993106, 0.876559089290843, 0.582149928218707, 1.26592404446571, 0.479960992971744, 0.840894959543198, 1.00459298341354, 0.98285777345435, 0.754965044767638, 1.14971147250154, 0.678568628236206, 1.38981008816777, 0.989354634818581, 1.25116433808614, 1.2142398253614, 1.03201975237089, 0.928602154928637, 0.642961745200205, 0.842888403466734, 0.649606669375906, 0.724490820076092, 1.68294181717141, 1.83216850101507, 0.69741924948021, 0.268972923828825, 1.16584414990533, 1.20604228862346, 0.586060027904748, 1.16356144256577, 0.52670838257608, 0.382147314320451, 0.668308513834733, 0.78509264848017, 0.733357618207109)), row.names = c(NA, -90L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("id", "band"), drop = TRUE, indices = list(0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(id = c(1001L, 1001L, 1002L, 1002L, 1004L, 1004L, 1005L, 1005L, 1007L, 1007L, 1009L, 1009L, 1010L, 1010L, 1011L, 1011L, 1013L, 1013L), band = c("fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5")), row.names = c(NA, -18L), class = "data.frame", vars = c("id", "band"), drop = TRUE, indices = list(0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(merge_id = c(1001L, 1001L, 1002L, 1002L, 1004L, 1004L, 1005L, 1005L, 1007L, 1007L, 1009L, 1009L, 1010L, 1010L, 1011L, 1011L, 1013L, 1013L), band = c("fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5")), row.names = c(NA, -18L), class = "data.frame", vars = c("merge_id", "band"), drop = TRUE)))

ggplot(df, aes(x = date, y = value, colour = band)) +
  geom_point() + 
  geom_line() +
  facet_wrap(~id)

However, this becomes unwieldy and the plots become too small when there are too many id, so I want to visually inspect a random subset. I would expect the following to return just three of the ids, but instead we get four ids and we don't even get all the bands for every id. I choose seed 1234 here but you get a different result if you keep rerunning with different seeds, with different arrangements of band-id combinations.

set.seed(1234)
ggplot(
  data = df %>% filter(id %in% sample(unique(df$id), 3)), # filtering to subset of 3 ids
  mapping = aes(x = date, y = value, colour = band)
) +
  geom_point() +
  geom_line() +
  facet_wrap(~id)

Note that it works if I do the sampling outside of the ggplot() call. (This would be the desired result)

set.seed(1234)
some_ids <- sample(unique(df$id), 3) # moved sample() outside of ggplot()
ggplot(
  data = df %>% filter(id %in% some_ids),
  mapping = aes(x = date, y = value, colour = band)
) +
  geom_point() +
  geom_line() +
  facet_wrap(~id)

Why is this happening? I cannot see the difference in logic between the two options. It is definitely related to sample and not the unique(df$id) part, since you can replace that with c(1001, 1002, 1004, 1005, 1007, 1009, 1010, 1011, 1013) and still get the problem. I am also cognizant that it may be something to do with my specific data, since I did try making an analogous reprex with a built-in dataset, but I cannot imagine what that would be since this is already a pretty limited subset.

EDIT: for example, I cannot reproduce this error if I use this even more stripped down dataset. I am befuddled because I cannot tell any difference between this dataset and the one in my dput except for the actual values.

df2 <- tibble(
  id = rep(1:9, each = 5, times = 2),
  date = rep(seq(as.POSIXct("2018-01-01 00:00:00"), by = "month", length.out = 5), times = 18),
  band = rep(c("b1", "b2"), each = 45),
  value = c(rnorm(45, 0), rnorm(45, 1))
)

Solution

  • TLDR: The filter expression gets evaluated multiple times, so you should not use a non-deterministic expression.

    Not sure if this is good enough for an answer, but if you try to run your example with different seeds, you'll notice that the number of charts changes with each seed. This suggests that the number of ids we are filtering the data frame changes with each seed, suggesting that sample is actually called multiple times. We can confirm this by creating a function that takes the place of sample:

    sample_out <- function(data, n) {
      print("running sample_out ")
      return (sample(data, n))
    }
    

    and then using it in place of sample:

    ggplot(
      data = df %>% filter(id %in% sample_out(unique(df$id), 3)), 
      mapping = aes(x = date, y = value, colour = band)
    )
    

    you'll see that sample_out actually gets called multiple times. In my session, this gets called 18 times with the data above, regardless of seed. Experimenting with different data frame sizes, it seems like sample will get called (row_count / 5) times. This implies that filter somehow evaluates its argument multiple times. A complete answer would explain why this happens with filter but this is where I get a bit lost. I believe that the relevant source is here:

    https://github.com/tidyverse/dplyr/blob/master/R/tbl-df.r#L55

    filter.tbl_df <- function(.data, ..., .preserve = TRUE) {
      // elided
      out <- filter_impl(.data, quo)
    

    filter_impl basically calls a C++ implementation, and I think the key line is:

    https://github.com/tidyverse/dplyr/blob/master/src/filter.cpp#L408

    template <typename SlicedTibble>
    SEXP filter_template(const SlicedTibble& gdf, const NamedQuosure& quo) {
      // elided
      Proxy call_proxy(quo.expr(), gdf, quo.env()) ;
      // elided
      int ngroups = gdf.ngroups() ;    
      // elided    
      for (int i = 0; i < ngroups; i++, ++git) {
        // elided
        LogicalVector g_test = check_result_lgl_type(call_proxy.get(indices));
        // elided
      }
      // elided
    }
    

    Note that for each group of the tibble, call_proxy.get is executed. I am assuming that we see sample_out get called 18 times because there are 18 groups in the corresponding tibble.

    Anyway, this could probably be answered quickly and authoritatively by posting to the relevant dplyr community contact. In my adventure learning about dyplr, I was unable to find a warning about this, so it could be that I am missing something. dplyr's documentation discusses that it's evaluation is a little different than may be used to: https://dplyr.tidyverse.org/articles/programming.html.

    Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don’t follow the usual R rules of evaluation. Instead, they capture the expression that you typed and evaluate it in a custom way.