Search code examples
rtidyverselubridate

How to summarize a tibble by time periods?


EDIT: I have updated the input, added the expected output.

I have a table that contains time-dates and a grouping criterion NEL_Hotspots. I am trying to summarise the table according to these rules:

Observations grouped by NEL_Hotspots and then all observations that fall within the same day (24h), AND have Wind_direc within +- 10.

This is a small subset of a larger table:

structure(list(Serial_number = c(10, 8, 9, 20, 21, 23, 3, 5, 
7, 11, 13, 20, 24), Date_time = c("3/31/05 1:57", "3/31/05 4:12", 
"3/31/05 18:12", "4/1/05 2:12", "4/1/05 3:12", "4/3/05 16:12", 
"3/28/05 9:57", "3/30/05 13:42", "3/31/05 1:57", "4/10/05 10:57", 
"4/10/05 18:57", "4/10/05 20:13", "4/10/05 21:30"), Wind_direc = c(50, 
60, 70, 60, 70, 70, 60, 140, 50, 270, 300, 310, 290), NEL_Hotspots = c(0, 
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), Dust_Intens = c("weak", 
"weak", "weak", "weak", "medium", "weak", "weak", "medium", "weak", 
"weak", "medium", "medium", "high"), Area_km2 = c(290, 241, 225, 
240, 340, 320, 176, 143, 211, 72, 171, 167, 121)), .Names = c("Serial_number", 
"Date_time", "Wind_direc", "NEL_Hotspots", "Dust_Intens", "Area_km2"
), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-13L), spec = structure(list(cols = structure(list(Serial_number = structure(list(), class = c("collector_double", 
"collector")), Date_time = structure(list(), class = c("collector_character", 
"collector")), Wind_direc = structure(list(), class = c("collector_double", 
"collector")), NEL_Hotspots = structure(list(), class = c("collector_double", 
"collector")), Dust_Intens = structure(list(), class = c("collector_character", 
"collector")), Area_km2 = structure(list(), class = c("collector_double", 
"collector"))), .Names = c("Serial_number", "Date_time", "Wind_direc", 
"NEL_Hotspots", "Dust_Intens", "Area_km2")), default = structure(list(), class = c("collector_guess", 
"collector")), skip = 1), .Names = c("cols", "default", "skip"
), class = "col_spec"))

Once the data is loaded I used df <- df %>% mutate(full_date = ymd_hms(Date_time)) from lubridate to create the column full_date.

The expected output is:

structure(list(`First Date_time` = c("3/31/05 1:57", "3/31/05 18:12", 
"4/1/05 2:12", "4/3/05 16:12", "3/28/05 9:57", "3/30/05 13:42", 
"3/31/05 1:57", "4/10/05 10:57", "4/10/05 18:57"), `Last Date_time` = c("3/31/05 4:12", 
"3/31/05 18:12", "4/1/05 3:12", "4/3/05 16:12", "3/28/05 9:57", 
"3/30/05 13:42", "3/31/05 1:57", "4/10/05 10:57", "4/10/05 21:30"
), Wind_direc_avg = c(55, 70, 60, 70, 60, 140, 50, 270, 300), 
    wind_direc_min = c(50, 70, 60, 70, 60, 140, 50, 270, 290), 
    wind_direc_max = c(60, 70, 70, 70, 60, 140, 50, 270, 310), 
    NEL_Hotspots = c(0, 0, 0, 0, 1, 1, 1, 1, 1), Dust_Intens = c("weak,weak", 
    "weak", "weak,medium", "weak", "weak", "medium", "weak", 
    "weak", "medium, medium, high"), Area_km2_avg = c(265.5, 
    225, 290, 320, 176, 143, 211, 72, 153), Area_km2_stdv = c(34.64, 
    0, 70.71, 0, 0, 0, 0, 0, 27.78), events_count = c(2, 1, 2, 
    1, 1, 1, 1, 1, 3), serial_numbers = c("10, 8", "9", "20, 21", 
    "23", "3", "5", "7", "11", "13, 20, 24")), .Names = c("First Date_time", 
"Last Date_time", "Wind_direc_avg", "wind_direc_min", "wind_direc_max", 
"NEL_Hotspots", "Dust_Intens", "Area_km2_avg", "Area_km2_stdv", 
"events_count", "serial_numbers"), class = c("spec_tbl_df", "tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
    cols = structure(list(`First Date_time` = structure(list(), class = c("collector_character", 
    "collector")), `Last Date_time` = structure(list(), class = c("collector_character", 
    "collector")), Wind_direc_avg = structure(list(), class = c("collector_double", 
    "collector")), wind_direc_min = structure(list(), class = c("collector_double", 
    "collector")), wind_direc_max = structure(list(), class = c("collector_double", 
    "collector")), NEL_Hotspots = structure(list(), class = c("collector_double", 
    "collector")), Dust_Intens = structure(list(), class = c("collector_character", 
    "collector")), Area_km2_avg = structure(list(), class = c("collector_double", 
    "collector")), Area_km2_stdv = structure(list(), class = c("collector_double", 
    "collector")), events_count = structure(list(), class = c("collector_double", 
    "collector")), serial_numbers = structure(list(), class = c("collector_character", 
    "collector"))), .Names = c("First Date_time", "Last Date_time", 
    "Wind_direc_avg", "wind_direc_min", "wind_direc_max", "NEL_Hotspots", 
    "Dust_Intens", "Area_km2_avg", "Area_km2_stdv", "events_count", 
    "serial_numbers")), default = structure(list(), class = c("collector_guess", 
    "collector")), skip = 1), .Names = c("cols", "default", "skip"
), class = "col_spec"))

I would appreciate any help!


Solution

  • Try to create groups based on your condition. A new group is created when -

    • Date changes
    • Every +10 value change in Wind

    For each group calculate all the statistics that you want in summarise

    library(dplyr)
    
    df %>%
      mutate(Date_time = lubridate::mdy_hm(Date_time), 
             date = as.Date(Date_time)) %>%
      group_by(date) %>%
      group_by(val = lag(ceiling((Wind_direc - first(Wind_direc))/10), 
                         default = 0), .add = TRUE) %>%
      summarise(first_date_time = first(Date_time), 
                last_date_time = last(Date_time), 
                Wind_direc_avg = mean(Wind_direc), 
                Wind_direc_min = min(Wind_direc), 
                Wind_direc_max = max(Wind_direc), 
                NEL_Hotspots = sum(NEL_Hotspots), 
                Dust_Intens = toString(Dust_Intens), 
                Area_km2_avg = mean(Area_km2))