Search code examples
rdplyrtidyrfillcoalesce

How to use fill function from tidyr R package with groups


This question is a follow up to my other question, How to get tidy fill function to use groups when filling data rows.

I want to get one row per TIMESTAMP value. I want to fill in real data, by column, whether it's character, float, or whatever. If both TIMESTAMP rows have NA's, then I need NA returned. I want all columns returned.

A tiny portion of my data:

dupped_records <- structure(list(Station_Number = c("3999", "3999", "3999", "3999"
    ), Station_Name = c("Eagle 3NW Beta", "Eagle 3NW Beta", "Eagle 3NW Beta","Eagle 3NW 
    Beta"), NWSLI = c("EEMN1", "EEMN1", "EEMN1", "EEMN1"), Station_Legacy_Name = c("Rogers 
    Farm Tower", "Rogers Farm Tower", "Rogers Farm Tower", "Rogers Farm Tower"), Lat = 
    c(40.846994, 40.846994, 40.846994, 40.846994), Lon = c(-96.467199, -96.467199, 
    -96.467199, -96.467199), Elev_m = c("368.7", "368.7", "368.7", "368.7"), TIMESTAMP = 
    c("2021-07-28 20:05:00", "2021-07-28 20:05:00", "2021-07-28 20:06:00", "2021-07-28 
    20:06:00"), RECORD = c(18L, 18L, 19L, 19L), Ta_2m_Avg = c(34.3, NA, 34.26, NA), TaMax_2m = c(34.39, 
NA, 34.35, NA), TaMaxTime_2m = c("2021-07-28 20:04:15", NA, "2021-07-28 20:05:05", 
NA), TaMin_2m = c(34.25, NA, 34.16, NA), TaMinTime_2m = c("2021-07-28 20:04:45", 
NA, "2021-07-28 20:05:50", NA), RH_2m_Avg = c(57.35, NA, 56.97, 
NA), RHMax_2m = c(58.68, NA, 57.99, NA), RHMaxTime_2m = c("2021-07-28 20:04:30", 
NA, "2021-07-28 20:05:40", NA), RHMin_2m = c(55.86, NA, 55.41, 
NA), RHMinTime_2m = c("2021-07-28 20:04:45", NA, "2021-07-28 20:05:20", 
NA), Dp_2m_Avg = c(24.68, NA, 24.53, NA), DpMax_2m = c(25.06, 
NA, 24.77, NA), DpMaxTime_2m = c("2021-07-28 20:04:25", NA, "2021-07-28 20:05:40", 
NA), DpMin_2m = c(24.19, NA, 24.06, NA), DpMinTime_2m = c("2021-07-28 20:04:45", 
NA, "2021-07-28 20:05:20", NA), HeatIndex_2m_Avg = c(41.9, NA, 
41.64, NA), HeatIndexMax_2m = c(42.51, NA, 41.95, NA), HeatIndexMaxTime_2m = c("2021-07-28 20:04:25", 
NA, "2021-07-28 20:05:45", NA), WindChill_2m_Avg = c(37.65, NA, 
37.77, NA), WindChillMin_2m = c(37.47, NA, 37.56, NA), WindChillMinTime_2m = c("2021-07-28 20:04:45", 
NA, "2021-07-28 20:05:20", NA), WndAveSpd_3m = c(2.746, NA, 3.796, 
NA), WndVecMagAve_3m = c(2.594, NA, 3.607, NA), WndAveDir_3m = c(166.5, 
NA, 152.6, NA), WndAveDirSD_3m = c(19.03, NA, 18.05, NA), WndMaxSpd5s_3m = c(5.165, 
NA, 5.498, NA), WndMaxSpd5sTime_3m = c("2021-07-28 20:05:00", 
NA, "2021-07-28 20:05:05", NA), WndMax_5sec_Dir_3m = c(158.3, 
NA, 162.8, NA), PresAvg_1pnt5m = c(972.3953, NA, 972.3508, NA
), PresMax_1pnt5m = c(972.4536, NA, 972.42, NA), PresMaxTime_1pnt5m = c("2021-07-28 20:04:25", 
NA, "2021-07-28 20:05:10", NA), PresMin_1pnt5m = c(972.3868, 
NA, 972.2872, NA), PresMinTime_1pnt5m = c("2021-07-28 20:04:35", 
NA, "2021-07-28 20:05:50", NA), Solar_2m_Avg = c(771.1, NA, 776.1, 
NA), Rain_1m_Tot = c(0, NA, 0, NA), Ts_bare_10cm_Avg = c(32.98, 
NA, 33.01, NA), TsMax_bare_10cm = c(33, NA, 33.02, NA), TsMaxTime_bare_10cm = c("2021-07-28 20:05:00", 
NA, "2021-07-28 20:06:00", NA), TsMin_bare_10cm = c(32.97, NA, 
33, NA), TsMinTime_bare_10cm = c("2021-07-28 20:04:10", NA, "2021-07-28 20:05:05", 
NA), BattVolts_Min = c(12.77, NA, 12.77, NA), LithBatt_Min = c(3.56, 
NA, 3.56, NA), MaintMode = c(0L, NA, 0L, NA), WS_ms_S_WVT = c(NA, 
3.082, NA, 3.614), SlrW_Avg = c(NA, 854, NA, 859), SlrW_Max = c(NA, 
858, NA, 862), SlrW_TMx = c(NA, "2021-07-28 20:04:05", NA, "2021-07-28 20:06:00"
), SlrW = c(NA, 856, NA, 862), SlrW_Std = c(NA, 2.334, NA, 2.376
), WS_Dir = c(NA, 164L, NA, 128L), WS_ms_Avg = c(NA, 3.082, NA, 
3.614), WS_ms_Max = c(NA, 5.17, NA, 5.71), WS_ms_TMx = c(NA, 
"2021-07-28 20:05:00", NA, "2021-07-28 20:05:05"), WS_Diag = c(NA, 
0L, NA, 0L), SmplsF = c(NA, 1L, NA, 1L), Diag1F = c(NA, 0L, NA, 
0L), Diag2F = c(NA, 0L, NA, 0L), Diag4F = c(NA, 0L, NA, 0L), 
    Diag8F = c(NA, 0L, NA, 0L), Diag9F = c(NA, 0L, NA, 0L), Diag10F = c(NA, 
    0L, NA, 0L), NNDF = c(NA, 0L, NA, 0L), WindDir_D1_WVT = c(NA, 
    170.6, NA, 150.3), WindDir_SD1_WVT = c(NA, 16.43, NA, 14.95
    ), source = c("E:\\Data_Collected_withDups/3999/3999_1min.csv-20230216_0122", 
    "E:\\Data_Collected_withDups/3999/3999_1min_exp.csv-20230216_0122", 
    "E:\\Data_Collected_withDups/3999/3999_1min.csv-20230216_0122", 
    "E:\\Data_Collected_withDups/3999/3999_1min_exp.csv-20230216_0122"
    )), class = "data.frame", row.names = 1886274:1886277)

The result I want is:

no_dupped_records <- structure(list(Station_Number = c(3999L, 3999L), Station_Name = c("Eagle 3NW Beta", 
"Eagle 3NW Beta"), NWSLI = c("EEMN1", "EEMN1"), Station_Legacy_Name = c("Rogers Farm Tower", 
"Rogers Farm Tower"), Lat = c(40.846994, 40.846994), Lon = c(-96.467199, 
-96.467199), Elev_m = c(368.7, 368.7), TIMESTAMP = c("2021-07-28 20:05:00", 
"2021-07-28 20:06:00"), RECORD = 18:19, Ta_2m_Avg = c(34.3, 34.26
), TaMax_2m = c(34.39, 34.35), TaMaxTime_2m = c("2021-07-28 20:04:15", 
"2021-07-28 20:05:05"), TaMin_2m = c(34.25, 34.16), TaMinTime_2m = c("2021-07-28 20:04:45", 
"2021-07-28 20:05:50"), RH_2m_Avg = c(57.35, 56.97), RHMax_2m = c(58.68, 
57.99), RHMaxTime_2m = c("2021-07-28 20:04:30", "2021-07-28 20:05:40"
), RHMin_2m = c(55.86, 55.41), RHMinTime_2m = c("2021-07-28 20:04:45", 
"2021-07-28 20:05:20"), Dp_2m_Avg = c(24.68, 24.53), DpMax_2m = c(25.06, 
24.77), DpMaxTime_2m = c("2021-07-28 20:04:25", "2021-07-28 20:05:40"
), DpMin_2m = c(24.19, 24.06), DpMinTime_2m = c("2021-07-28 20:04:45", 
"2021-07-28 20:05:20"), HeatIndex_2m_Avg = c(41.9, 41.64), HeatIndexMax_2m = c(42.51, 
41.95), HeatIndexMaxTime_2m = c("2021-07-28 20:04:25", "2021-07-28 20:05:45"
), WindChill_2m_Avg = c(37.65, 37.77), WindChillMin_2m = c(37.47, 
37.56), WindChillMinTime_2m = c("2021-07-28 20:04:45", "2021-07-28 20:05:20"
), WndAveSpd_3m = c(2.746, 3.796), WndVecMagAve_3m = c(2.594, 
3.607), WndAveDir_3m = c(166.5, 152.6), WndAveDirSD_3m = c(19.03, 
18.05), WndMaxSpd5s_3m = c(5.165, 5.498), WndMaxSpd5sTime_3m = c("2021-07-28 20:05:00", 
"2021-07-28 20:05:05"), WndMax_5sec_Dir_3m = c(158.3, 162.8), 
    PresAvg_1pnt5m = c(972.3953, 972.3508), PresMax_1pnt5m = c(972.4536, 
    972.42), PresMaxTime_1pnt5m = c("2021-07-28 20:04:25", "2021-07-28 20:05:10"
    ), PresMin_1pnt5m = c(972.3868, 972.2872), PresMinTime_1pnt5m = c("2021-07-28 20:04:35", 
    "2021-07-28 20:05:50"), Solar_2m_Avg = c(771.1, 776.1), Rain_1m_Tot = c(0L, 
    0L), Ts_bare_10cm_Avg = c(32.98, 33.01), TsMax_bare_10cm = c(33, 
    33.02), TsMaxTime_bare_10cm = c("2021-07-28 20:05:00", "2021-07-28 20:06:00"
    ), TsMin_bare_10cm = c(32.97, 33), TsMinTime_bare_10cm = c("2021-07-28 20:04:10", 
    "2021-07-28 20:05:05"), BattVolts_Min = c(12.77, 12.77), 
    LithBatt_Min = c(3.56, 3.56), MaintMode = c(0L, 0L), WS_ms_S_WVT = c(3.082, 
    3.614), SlrW_Avg = c(854L, 859L), SlrW_Max = c(858L, 862L
    ), SlrW_TMx = c("2021-07-28 20:04:05", "2021-07-28 20:06:00"
    ), SlrW = c(856L, 862L), SlrW_Std = c(2.334, 2.376), WS_Dir = c(164L, 
    128L), WS_ms_Avg = c(3.082, 3.614), WS_ms_Max = c(5.17, 5.71
    ), WS_ms_TMx = c("2021-07-28 20:05:00", "2021-07-28 20:05:05"
    ), WS_Diag = c(0L, 0L), SmplsF = c(1L, 1L), Diag1F = c(0L, 
    0L), Diag2F = c(0L, 0L), Diag4F = c(0L, 0L), Diag8F = c(0L, 
    0L), Diag9F = c(0L, 0L), Diag10F = c(0L, 0L), NNDF = c(0L, 
    0L), WindDir_D1_WVT = c(170.6, 150.3), WindDir_SD1_WVT = c(16.43, 
    14.95), source = c("E:\\Data_Collected_withDups/3999/3999_1min.csv-20230216_0122", 
    "E:\\Data_Collected_withDups/3999/3999_1min.csv-20230216_0122"
    )), class = "data.frame", row.names = c(NA, -2L))

It looks like I need to use mutate somehow to get what I want, based on the answers to my previous question. But how? I need all columns returned.

Here are two statements I thought might work. I get the error that everything must be used within a selecting function.

dupped_records |> group_by(TIMESTAMP) |> mutate_all(fill(everything(), .direction = 'up'))

This statement does nothing. Why?

dupped_records |> group_by(TIMESTAMP) |> mutate(across(fill(everything(), .direction = 'up')))

Anyway, I appreciate your help, as always. A quick EDIT: Is there a way to do this with dplyr::coalesce?


Solution

  • library(tidyverse)
    
    dupped_records |>
      mutate(TIMESTAMP = ymd_hms(TIMESTAMP)) |>
      summarise(across(Station_Number:RECORD, first), across(Ta_2m_Avg:WindDir_SD1_WVT, na.omit), .by = "TIMESTAMP")
    
                TIMESTAMP Station_Number   Station_Name NWSLI
    1 2021-07-28 20:05:00           3999 Eagle 3NW Beta EEMN1
    2 2021-07-28 20:06:00           3999 Eagle 3NW Beta EEMN1
          Station_Legacy_Name      Lat      Lon Elev_m RECORD Ta_2m_Avg TaMax_2m
    1 Rogers \n    Farm Tower 40.84699 -96.4672  368.7     18     34.30    34.39
    2       Rogers Farm Tower 40.84699 -96.4672  368.7     19     34.26    34.35
             TaMaxTime_2m TaMin_2m        TaMinTime_2m RH_2m_Avg RHMax_2m
    1 2021-07-28 20:04:15    34.25 2021-07-28 20:04:45     57.35    58.68
    2 2021-07-28 20:05:05    34.16 2021-07-28 20:05:50     56.97    57.99
             RHMaxTime_2m RHMin_2m        RHMinTime_2m Dp_2m_Avg DpMax_2m
    1 2021-07-28 20:04:30    55.86 2021-07-28 20:04:45     24.68    25.06
    2 2021-07-28 20:05:40    55.41 2021-07-28 20:05:20     24.53    24.77
             DpMaxTime_2m DpMin_2m        DpMinTime_2m HeatIndex_2m_Avg
    1 2021-07-28 20:04:25    24.19 2021-07-28 20:04:45            41.90
    2 2021-07-28 20:05:40    24.06 2021-07-28 20:05:20            41.64
      HeatIndexMax_2m HeatIndexMaxTime_2m WindChill_2m_Avg WindChillMin_2m
    1           42.51 2021-07-28 20:04:25            37.65           37.47
    2           41.95 2021-07-28 20:05:45            37.77           37.56
      WindChillMinTime_2m WndAveSpd_3m WndVecMagAve_3m WndAveDir_3m WndAveDirSD_3m
    1 2021-07-28 20:04:45        2.746           2.594        166.5          19.03
    2 2021-07-28 20:05:20        3.796           3.607        152.6          18.05
      WndMaxSpd5s_3m  WndMaxSpd5sTime_3m WndMax_5sec_Dir_3m PresAvg_1pnt5m
    1          5.165 2021-07-28 20:05:00              158.3       972.3953
    2          5.498 2021-07-28 20:05:05              162.8       972.3508
      PresMax_1pnt5m  PresMaxTime_1pnt5m PresMin_1pnt5m  PresMinTime_1pnt5m
    1       972.4536 2021-07-28 20:04:25       972.3868 2021-07-28 20:04:35
    2       972.4200 2021-07-28 20:05:10       972.2872 2021-07-28 20:05:50
      Solar_2m_Avg Rain_1m_Tot Ts_bare_10cm_Avg TsMax_bare_10cm TsMaxTime_bare_10cm
    1        771.1           0            32.98           33.00 2021-07-28 20:05:00
    2        776.1           0            33.01           33.02 2021-07-28 20:06:00
      TsMin_bare_10cm TsMinTime_bare_10cm BattVolts_Min LithBatt_Min MaintMode
    1           32.97 2021-07-28 20:04:10         12.77         3.56         0
    2           33.00 2021-07-28 20:05:05         12.77         3.56         0
      WS_ms_S_WVT SlrW_Avg SlrW_Max            SlrW_TMx SlrW SlrW_Std WS_Dir
    1       3.082      854      858 2021-07-28 20:04:05  856    2.334    164
    2       3.614      859      862 2021-07-28 20:06:00  862    2.376    128
      WS_ms_Avg WS_ms_Max           WS_ms_TMx WS_Diag SmplsF Diag1F Diag2F Diag4F
    1     3.082      5.17 2021-07-28 20:05:00       0      1      0      0      0
    2     3.614      5.71 2021-07-28 20:05:05       0      1      0      0      0
      Diag8F Diag9F Diag10F NNDF WindDir_D1_WVT WindDir_SD1_WVT
    1      0      0       0    0          170.6           16.43
    2      0      0       0    0          150.3           14.95