EDIT: I have updated the input, added the expected output.
I have a table that contains time-dates and a grouping criterion NEL_Hotspots
.
I am trying to summarise the table according to these rules:
Observations grouped by NEL_Hotspots
and then all observations that fall within the same day (24h), AND have Wind_direc
within +- 10.
This is a small subset of a larger table:
structure(list(Serial_number = c(10, 8, 9, 20, 21, 23, 3, 5,
7, 11, 13, 20, 24), Date_time = c("3/31/05 1:57", "3/31/05 4:12",
"3/31/05 18:12", "4/1/05 2:12", "4/1/05 3:12", "4/3/05 16:12",
"3/28/05 9:57", "3/30/05 13:42", "3/31/05 1:57", "4/10/05 10:57",
"4/10/05 18:57", "4/10/05 20:13", "4/10/05 21:30"), Wind_direc = c(50,
60, 70, 60, 70, 70, 60, 140, 50, 270, 300, 310, 290), NEL_Hotspots = c(0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), Dust_Intens = c("weak",
"weak", "weak", "weak", "medium", "weak", "weak", "medium", "weak",
"weak", "medium", "medium", "high"), Area_km2 = c(290, 241, 225,
240, 340, 320, 176, 143, 211, 72, 171, 167, 121)), .Names = c("Serial_number",
"Date_time", "Wind_direc", "NEL_Hotspots", "Dust_Intens", "Area_km2"
), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-13L), spec = structure(list(cols = structure(list(Serial_number = structure(list(), class = c("collector_double",
"collector")), Date_time = structure(list(), class = c("collector_character",
"collector")), Wind_direc = structure(list(), class = c("collector_double",
"collector")), NEL_Hotspots = structure(list(), class = c("collector_double",
"collector")), Dust_Intens = structure(list(), class = c("collector_character",
"collector")), Area_km2 = structure(list(), class = c("collector_double",
"collector"))), .Names = c("Serial_number", "Date_time", "Wind_direc",
"NEL_Hotspots", "Dust_Intens", "Area_km2")), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), .Names = c("cols", "default", "skip"
), class = "col_spec"))
Once the data is loaded I used df <- df %>% mutate(full_date = ymd_hms(Date_time))
from lubridate
to create the column full_date
.
The expected output is:
structure(list(`First Date_time` = c("3/31/05 1:57", "3/31/05 18:12",
"4/1/05 2:12", "4/3/05 16:12", "3/28/05 9:57", "3/30/05 13:42",
"3/31/05 1:57", "4/10/05 10:57", "4/10/05 18:57"), `Last Date_time` = c("3/31/05 4:12",
"3/31/05 18:12", "4/1/05 3:12", "4/3/05 16:12", "3/28/05 9:57",
"3/30/05 13:42", "3/31/05 1:57", "4/10/05 10:57", "4/10/05 21:30"
), Wind_direc_avg = c(55, 70, 60, 70, 60, 140, 50, 270, 300),
wind_direc_min = c(50, 70, 60, 70, 60, 140, 50, 270, 290),
wind_direc_max = c(60, 70, 70, 70, 60, 140, 50, 270, 310),
NEL_Hotspots = c(0, 0, 0, 0, 1, 1, 1, 1, 1), Dust_Intens = c("weak,weak",
"weak", "weak,medium", "weak", "weak", "medium", "weak",
"weak", "medium, medium, high"), Area_km2_avg = c(265.5,
225, 290, 320, 176, 143, 211, 72, 153), Area_km2_stdv = c(34.64,
0, 70.71, 0, 0, 0, 0, 0, 27.78), events_count = c(2, 1, 2,
1, 1, 1, 1, 1, 3), serial_numbers = c("10, 8", "9", "20, 21",
"23", "3", "5", "7", "11", "13, 20, 24")), .Names = c("First Date_time",
"Last Date_time", "Wind_direc_avg", "wind_direc_min", "wind_direc_max",
"NEL_Hotspots", "Dust_Intens", "Area_km2_avg", "Area_km2_stdv",
"events_count", "serial_numbers"), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = structure(list(`First Date_time` = structure(list(), class = c("collector_character",
"collector")), `Last Date_time` = structure(list(), class = c("collector_character",
"collector")), Wind_direc_avg = structure(list(), class = c("collector_double",
"collector")), wind_direc_min = structure(list(), class = c("collector_double",
"collector")), wind_direc_max = structure(list(), class = c("collector_double",
"collector")), NEL_Hotspots = structure(list(), class = c("collector_double",
"collector")), Dust_Intens = structure(list(), class = c("collector_character",
"collector")), Area_km2_avg = structure(list(), class = c("collector_double",
"collector")), Area_km2_stdv = structure(list(), class = c("collector_double",
"collector")), events_count = structure(list(), class = c("collector_double",
"collector")), serial_numbers = structure(list(), class = c("collector_character",
"collector"))), .Names = c("First Date_time", "Last Date_time",
"Wind_direc_avg", "wind_direc_min", "wind_direc_max", "NEL_Hotspots",
"Dust_Intens", "Area_km2_avg", "Area_km2_stdv", "events_count",
"serial_numbers")), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), .Names = c("cols", "default", "skip"
), class = "col_spec"))
I would appreciate any help!
Try to create groups based on your condition. A new group is created when -
For each group calculate all the statistics that you want in summarise
library(dplyr)
df %>%
mutate(Date_time = lubridate::mdy_hm(Date_time),
date = as.Date(Date_time)) %>%
group_by(date) %>%
group_by(val = lag(ceiling((Wind_direc - first(Wind_direc))/10),
default = 0), .add = TRUE) %>%
summarise(first_date_time = first(Date_time),
last_date_time = last(Date_time),
Wind_direc_avg = mean(Wind_direc),
Wind_direc_min = min(Wind_direc),
Wind_direc_max = max(Wind_direc),
NEL_Hotspots = sum(NEL_Hotspots),
Dust_Intens = toString(Dust_Intens),
Area_km2_avg = mean(Area_km2))