Search code examples
rdatetimeduplicatesintervals

How to compact block-duplicates of variables from a date-ordered dataset into a wide-table in R?


I have the following data

structure(list(station = c("61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002"), pollutant = c(17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L
), tag = c("002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002"
), concentration = c(NA, 0.41, 0.41, 0.41, 0.41, 0.41, 0.41, 
0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.39, 0.39, 0.39, 0.39, 
0.39, 0.39, 0.39, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.33, 
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.4, 0.4, 0.4, 0.4, 0.4, 
0.4, 0.4, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, NA, 0.38, 
0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 
0.38, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.38, 0.38, 0.38, 
0.38, 0.38, 0.38, 0.38, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.38, 0.38, 0.38, 0.38, 
0.38, 0.38, 0.38), date = structure(c(1514764800, 1514851200, 
1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 1515369600, 
1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 1515888000, 
1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 1516406400, 
1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 1516924800, 
1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 1517443200, 
1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 1517961600, 
1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 1518480000, 
1518566400, 1518652800, 1518739200, 1518825600, 1518912000, 1514764800, 
1514851200, 1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 
1515369600, 1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 
1515888000, 1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 
1516406400, 1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 
1516924800, 1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 
1517443200, 1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 
1517961600, 1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 
1518480000, 1518566400, 1518652800, 1518739200, 1518825600, 1518912000
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-98L), class = c("tbl_df", "tbl", "data.frame"))

I would like to transform it as

tag   station start_date end_date   `17201` `17204`
<chr> <chr>   <date>     <date>       <dbl>   <dbl>
002   61R002  2018-01-02 2018-01-07    0.41    0.38
002   61R002  2018-01-08 2018-01-14    0.42    0.38
002   61R002  2018-01-15 2018-01-21    0.39    0.37
002   61R002  2018-01-22 2018-01-28    0.46    0.38
002   61R002  2018-01-29 2018-02-04    0.33    0.31
002   61R002  2018-02-05 2018-02-11    0.4     0.33
002   61R002  2018-02-12 2018-02-18    0.38    0.38

i.e. into non-overlapping date intervals per station.

How can I achieve this (with dplyr & pipe operator, for example) ?

Note that station & pollutant variables can take more values and interval between start_date & end_date are not fixed.

A previous question (and answer) allowed me to reach

station pollutant tag   concentration start_date          end_date           
<chr>       <int> <chr>         <dbl> <dttm>              <dttm>             
61R002      17201 002            0.41 2018-01-02 00:00:00 2018-01-07 00:00:00
61R002      17201 002            0.42 2018-01-08 00:00:00 2018-01-14 00:00:00
61R002      17201 002            0.39 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002      17201 002            0.46 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002      17201 002            0.33 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002      17201 002            0.4  2018-02-05 00:00:00 2018-02-11 00:00:00
61R002      17201 002            0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
61R002      17204 002            0.38 2018-01-02 00:00:00 2018-01-14 00:00:00
61R002      17204 002            0.37 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002      17204 002            0.38 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002      17204 002            0.31 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002      17204 002            0.33 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002      17204 002            0.38 2018-02-12 00:00:00 2018-02-18 00:00:00

Many thanks.


Solution

  • The solution is inspired by combining this answer and this answer.

    data_new <- data %>%
      arrange(station, pollutant, date) %>%
      group_by(tag, station, pollutant, grp = rleid(concentration)) %>%
      summarise(concentration = first(concentration), start_date = min(date), end_date = max(date), .groups = 'drop') %>%
      select(-grp) %>%
      mutate(date_range = interval(start_date, end_date))
      xls <- NULL
    for (station in unique(data$station))
    {
      tmp <- data_new %>%
        filter(station == !!station)
    
      all_dates <- tmp %>%
        select(start_date, end_date, concentration) %>%
        pivot_longer(!concentration, names_to = "date_type", values_to="date") %>%
        arrange(date) %>%
        select(date) %>%
        distinct()
    
      tmp2 <- tmp %>%
        rowwise() %>%
        mutate(bounded_dates = list(filter(all_dates, all_dates$date %within% date_range) %>% pull(date)),
               bounded_intervals = list(int_diff(bounded_dates))) %>%
        select(tag, station, pollutant, concentration, bounded_intervals) %>%
        unnest(bounded_intervals) %>%
        filter(bounded_intervals != as.interval(days(1), start = int_start(bounded_intervals))) %>%
        mutate(start_date = as_date(int_start(bounded_intervals)),
               end_date = as_date(int_end(bounded_intervals))) %>%
        pivot_wider(id_cols = -bounded_intervals, names_from = "pollutant", values_from = "concentration")
    
      xls <- bind_rows(xls, tmp2)
    }
    

    gives me the desired output

    tag   station start_date end_date   `17201` `17204`
    <chr> <chr>   <date>     <date>       <dbl>   <dbl>
    002   61R002  2018-01-02 2018-01-07    0.41    0.38
    002   61R002  2018-01-08 2018-01-14    0.42    0.38
    002   61R002  2018-01-15 2018-01-21    0.39    0.37
    002   61R002  2018-01-22 2018-01-28    0.46    0.38
    002   61R002  2018-01-29 2018-02-04    0.33    0.31
    002   61R002  2018-02-05 2018-02-11    0.4     0.33
    002   61R002  2018-02-12 2018-02-18    0.38    0.38