The context is that I have many time series for many id
s and many band
s, and I have included a sample of nine id
s and two band
s. Here we can see that I can easily plot the time series for all the id
s:
library(tidyverse)
df <- structure(list(id = c(1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1001L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1002L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1004L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1005L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1009L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1010L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1011L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L, 1013L), date = structure(c(1488884400, 1490612474, 1507460497, 1502276146, 1514372627, 1512644789, 1500980863, 1503572707, 1513940711, 1496660730, 1495796861, 1512644789, 1488884400, 1504436115, 1502276146, 1495796118, 1494068453, 1504868786, 1513940711, 1511780307, 1511348810, 1503572707, 1497524848, 1507028336, 1491476744, 1503572707, 1492340161, 1501844755, 1505300762, 1503140790, 1509620381, 1488884400, 1487156167, 1510052273, 1491476744, 1494068453, 1513940711, 1489748810, 1498388749, 1509620381, 1500980120, 1511780307, 1502708860, 1489748810, 1501412778, 1504436115, 1495796861, 1493204748, 1510484382, 1487156167, 1508324436, 1500548201, 1513940711, 1505732183, 1490612474, 1496660730, 1511348810, 1514372627, 1494068453, 1510052273, 1500548201, 1513076347, 1508756553, 1510484382, 1504436858, 1504004193, 1494932749, 1508324436, 1512644789, 1504868786, 1507460497, 1504004193, 1503140790, 1500980120, 1512212632, 1491476744, 1513940711, 1508756553, 1504436115, 1490612474, 1495796861, 1509188631, 1508756553, 1486292805, 1504004193, 1498388749, 1495796861, 1486292805, 1513940711, 1499684790), class = c("POSIXct", "POSIXt"), tzone = "UTC"), band = c("fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5", "fit1", "fit1", "fit1", "fit1", "fit1", "fit5", "fit5", "fit5", "fit5", "fit5"), value = c(0.496538754230172, 0.503271496428091, 0.97387311299285, 0.580658673638122, 0.55924511798107, 0.832069876834949, 0.669456383223215, 1.12835570514478, 0.650077806710299, 0.380956367547047, 0.315803532869213, 0.792491389890908, 0.542150595815071, 1.03016500582205, 0.761751198659722, 0.367933240661702, 0.478285303617102, 1.68901870452092, 0.740965064159661, 1.09028738312622, 0.822334909416119, 0.758342181009204, 0.404208383270466, 0.892795714415756, 0.452540219822814, 1.15220190981348, 0.522093412373678, 0.953592910857701, 1.27850667816495, 1.10756222303339, 0.722797148902218, 0.465842402588039, 0.524130056243481, 0.724757971315511, 0.401849347220063, 0.455169211763473, 0.736683498842155, 0.530595901306756, 0.598435246507131, 0.855911625573028, 0.459872179640563, 0.851473466057886, 0.600348304937791, 0.484896112230185, 0.491357621589034, 1.21884821937325, 0.408355867626313, 0.541537217668289, 1.20173675518489, 0.61126928681528, 1.02122136799224, 0.489289990779144, 0.829092258901136, 0.88152853467569, 0.528559966420024, 0.544164467022259, 1.15093592993106, 0.876559089290843, 0.582149928218707, 1.26592404446571, 0.479960992971744, 0.840894959543198, 1.00459298341354, 0.98285777345435, 0.754965044767638, 1.14971147250154, 0.678568628236206, 1.38981008816777, 0.989354634818581, 1.25116433808614, 1.2142398253614, 1.03201975237089, 0.928602154928637, 0.642961745200205, 0.842888403466734, 0.649606669375906, 0.724490820076092, 1.68294181717141, 1.83216850101507, 0.69741924948021, 0.268972923828825, 1.16584414990533, 1.20604228862346, 0.586060027904748, 1.16356144256577, 0.52670838257608, 0.382147314320451, 0.668308513834733, 0.78509264848017, 0.733357618207109)), row.names = c(NA, -90L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("id", "band"), drop = TRUE, indices = list(0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(id = c(1001L, 1001L, 1002L, 1002L, 1004L, 1004L, 1005L, 1005L, 1007L, 1007L, 1009L, 1009L, 1010L, 1010L, 1011L, 1011L, 1013L, 1013L), band = c("fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5")), row.names = c(NA, -18L), class = "data.frame", vars = c("id", "band"), drop = TRUE, indices = list(0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(merge_id = c(1001L, 1001L, 1002L, 1002L, 1004L, 1004L, 1005L, 1005L, 1007L, 1007L, 1009L, 1009L, 1010L, 1010L, 1011L, 1011L, 1013L, 1013L), band = c("fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5", "fit1", "fit5")), row.names = c(NA, -18L), class = "data.frame", vars = c("merge_id", "band"), drop = TRUE)))
ggplot(df, aes(x = date, y = value, colour = band)) +
geom_point() +
geom_line() +
facet_wrap(~id)
However, this becomes unwieldy and the plots become too small when there are too many id
, so I want to visually inspect a random subset. I would expect the following to return just three of the id
s, but instead we get four id
s and we don't even get all the band
s for every id
. I choose seed 1234 here but you get a different result if you keep rerunning with different seeds, with different arrangements of band-id combinations.
set.seed(1234)
ggplot(
data = df %>% filter(id %in% sample(unique(df$id), 3)), # filtering to subset of 3 ids
mapping = aes(x = date, y = value, colour = band)
) +
geom_point() +
geom_line() +
facet_wrap(~id)
Note that it works if I do the sampling outside of the ggplot()
call. (This would be the desired result)
set.seed(1234)
some_ids <- sample(unique(df$id), 3) # moved sample() outside of ggplot()
ggplot(
data = df %>% filter(id %in% some_ids),
mapping = aes(x = date, y = value, colour = band)
) +
geom_point() +
geom_line() +
facet_wrap(~id)
Why is this happening? I cannot see the difference in logic between the two options. It is definitely related to sample
and not the unique(df$id)
part, since you can replace that with c(1001, 1002, 1004, 1005, 1007, 1009, 1010, 1011, 1013)
and still get the problem. I am also cognizant that it may be something to do with my specific data, since I did try making an analogous reprex with a built-in dataset, but I cannot imagine what that would be since this is already a pretty limited subset.
EDIT: for example, I cannot reproduce this error if I use this even more stripped down dataset. I am befuddled because I cannot tell any difference between this dataset and the one in my dput
except for the actual values.
df2 <- tibble(
id = rep(1:9, each = 5, times = 2),
date = rep(seq(as.POSIXct("2018-01-01 00:00:00"), by = "month", length.out = 5), times = 18),
band = rep(c("b1", "b2"), each = 45),
value = c(rnorm(45, 0), rnorm(45, 1))
)
TLDR: The filter expression gets evaluated multiple times, so you should not use a non-deterministic expression.
Not sure if this is good enough for an answer, but if you try to run your example with different seeds, you'll notice that the number of charts changes with each seed. This suggests that the number of ids we are filtering the data frame changes with each seed, suggesting that sample
is actually called multiple times. We can confirm this by creating a function that takes the place of sample
:
sample_out <- function(data, n) {
print("running sample_out ")
return (sample(data, n))
}
and then using it in place of sample
:
ggplot(
data = df %>% filter(id %in% sample_out(unique(df$id), 3)),
mapping = aes(x = date, y = value, colour = band)
)
you'll see that sample_out
actually gets called multiple times. In my session, this gets called 18 times with the data above, regardless of seed. Experimenting with different data frame sizes, it seems like sample
will get called (row_count / 5) times. This implies that filter
somehow evaluates its argument multiple times. A complete answer would explain why this happens with filter
but this is where I get a bit lost. I believe that the relevant source is here:
https://github.com/tidyverse/dplyr/blob/master/R/tbl-df.r#L55
filter.tbl_df <- function(.data, ..., .preserve = TRUE) {
// elided
out <- filter_impl(.data, quo)
filter_impl
basically calls a C++ implementation, and I think the key line is:
https://github.com/tidyverse/dplyr/blob/master/src/filter.cpp#L408
template <typename SlicedTibble>
SEXP filter_template(const SlicedTibble& gdf, const NamedQuosure& quo) {
// elided
Proxy call_proxy(quo.expr(), gdf, quo.env()) ;
// elided
int ngroups = gdf.ngroups() ;
// elided
for (int i = 0; i < ngroups; i++, ++git) {
// elided
LogicalVector g_test = check_result_lgl_type(call_proxy.get(indices));
// elided
}
// elided
}
Note that for each group of the tibble, call_proxy.get
is executed. I am assuming that we see sample_out
get called 18 times because there are 18 groups in the corresponding tibble.
Anyway, this could probably be answered quickly and authoritatively by posting to the relevant dplyr community contact. In my adventure learning about dyplr, I was unable to find a warning about this, so it could be that I am missing something. dplyr
's documentation discusses that it's evaluation is a little different than may be used to: https://dplyr.tidyverse.org/articles/programming.html.
Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don’t follow the usual R rules of evaluation. Instead, they capture the expression that you typed and evaluate it in a custom way.