This is a slightly tricky dataset where the columns are laid out like this.
ID C.Date T.Date C(Area) T(Area) Level(closet)_1 Venti_1 Level(closet)_2 Venti_2
733 2013.06.18 2013.06.18 65.2 42.1 C6 0 C3 1
537 2015.10.01 2015.15.01 34.5 27.2 C3 0 T11 0
909 2016-01-14 2016-01-14 15.1 25.9 T4 1 T2 1
Rule
Step1 : Consider columns: ID, C.Date, C(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2
Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
733 1 2013.06.18 C 65.2 C6 0
733 2 2013.06.18 C 65.2 C3 1
Step2 : Consider columns: ID, T.Date, T(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2
Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
733 3 2013.06.18 T 42.1 NA NA
Notice both Step1 and Step2 references values in columns Level(closet)_1, Venti_1, Level(closet)_2, Venti_2
. The difference is in Step2, when there are values for T.Date
and T(Area)
the expectation is that either one of the Level(closet)
value will start with T*
, in the 1st ID 733 there were NONE. So the transformed dataset 3rd row has values NA for columns Level(closet), Venti
. The 2nd ID 537 again has both T.Date
and T(Area)
values, again based on the Step2 we look for Level(closet)
column values that start with T*
in this case Level(closet)_2
contains value T11
so for the wide-to-long transformed data for ID 523 will be
Step1 : Consider columns: ID, C.Date, C(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2 Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
537 1 2015.10.01 C 34.5 C3 0
Step2 : Consider columns: ID, T.Date, T(Area), Level(closet)_1, Venti_1, Level(closet)_2, Venti_2 Rearrange the data like this.
ID Index Date Ref.Level Area Level(closet) Venti
537 2 2015.15.01 T 27.2 T11 0
The final expected dataset would look like this below
ID Index Date Ref.Level Area Level(closet) Venti
733 1 2013.06.18 C 65.2 C6 0
733 2 2013.06.18 C 65.2 C3 1
733 3 2013.06.18 T 42.1 NA NA
537 1 2015.10.01 C 34.5 C3 0
537 2 2015.15.01 T 27.2 T11 0
909 1 2016-01-14 C 15.1 NA NA
909 2 2016-01-14 T 25.9 T4 1
909 3 2016-01-14 T 25.9 T2 1
Sorry this is a bit complicated. On the surface level this looks like taking few rows in the wide format and reshaping this to a long format but there is a nested ifelse to see if there are any values starting with T*
in the Level(closet)
columns. I am completely blank how to structure this in a long format like this. Any help or suggestions is much apricated. Thanks.
library(tidyverse)
df <- tibble::tribble(~`ID`, ~`C.Date`, ~`T.Date`, ~`C(Area)`, ~`T(Area)`, ~`Level(closet)_1`, ~`Venti_1`, ~`Level(closet)_2`, ~`Venti_2`,
"733", "2013.06.18", "2013.06.18", "65.2", "42.1", "C6", "0", "C3", "1",
"537", "2015.10.01", "2015.15.01", "34.5", "27.2", "C3", "0", "T11", "0",
"909", "2016-01-14", "2016-01-14", "15.1", "25.9", "T4", "1", "T2", "1"
)
The following code works, you can follow it easily, but probably not the efficient way of doing this but does the job.
library(tidyverse)
tibble::tribble(~`ID`, ~`C.Date`, ~`T.Date`, ~`C(Area)`, ~`T(Area)`, ~`Level(closet)_1`, ~`Venti_1`, ~`Level(closet)_2`, ~`Venti_2`,
"733", "2013.06.18", "2013.06.18", "65.2", "42.1", "C6", "0", "C3", "1",
"537", "2015.10.01", "2015.15.01", "34.5", "27.2", "C3", "0", "T11", "0",
"909", "2016-01-14", "2016-01-14", "15.1", "25.9", "T4", "1", "T2", "1"
) -> df
df
#> # A tibble: 3 x 9
#> ID C.Date T.Date `C(Area)` `T(Area)` `Level(closet)_… Venti_1
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 733 2013.… 2013.… 65.2 42.1 C6 0
#> 2 537 2015.… 2015.… 34.5 27.2 C3 0
#> 3 909 2016-… 2016-… 15.1 25.9 T4 1
#> # … with 2 more variables: `Level(closet)_2` <chr>, Venti_2 <chr>
df %>%
mutate(across(c(1,4,5,7,9), as.numeric)) %>%
janitor::clean_names()-> df1
df1
#> # A tibble: 3 x 9
#> id c_date t_date c_area t_area level_closet_1 venti_1 level_closet_2
#> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 733 2013.… 2013.… 65.2 42.1 C6 0 C3
#> 2 537 2015.… 2015.… 34.5 27.2 C3 0 T11
#> 3 909 2016-… 2016-… 15.1 25.9 T4 1 T2
#> # … with 1 more variable: venti_2 <dbl>
df1 %>%
select(id, c_date, c_area) -> df2
df2
#> # A tibble: 3 x 3
#> id c_date c_area
#> <dbl> <chr> <dbl>
#> 1 733 2013.06.18 65.2
#> 2 537 2015.10.01 34.5
#> 3 909 2016-01-14 15.1
df1 %>%
select(id, t_date, t_area) -> df3
df3
#> # A tibble: 3 x 3
#> id t_date t_area
#> <dbl> <chr> <dbl>
#> 1 733 2013.06.18 42.1
#> 2 537 2015.15.01 27.2
#> 3 909 2016-01-14 25.9
df1 %>%
select(id, level_closet_1, level_closet_2) %>%
pivot_longer(-1) %>%
left_join(df2) %>%
filter(str_detect(value, "C")) %>%
rename(date = c_date,
area = c_area)-> c_df
#> Joining, by = "id"
c_df
#> # A tibble: 3 x 5
#> id name value date area
#> <dbl> <chr> <chr> <chr> <dbl>
#> 1 733 level_closet_1 C6 2013.06.18 65.2
#> 2 733 level_closet_2 C3 2013.06.18 65.2
#> 3 537 level_closet_1 C3 2015.10.01 34.5
df1 %>%
select(id, level_closet_1, level_closet_2) %>%
pivot_longer(-1) %>%
left_join(df3) %>%
filter(str_detect(value, "T")) %>%
rename(date = t_date,
area = t_area) -> t_df
#> Joining, by = "id"
t_df
#> # A tibble: 3 x 5
#> id name value date area
#> <dbl> <chr> <chr> <chr> <dbl>
#> 1 537 level_closet_2 T11 2015.15.01 27.2
#> 2 909 level_closet_1 T4 2016-01-14 25.9
#> 3 909 level_closet_2 T2 2016-01-14 25.9
c_df %>%
bind_rows(t_df) -> ct_df
ct_df
#> # A tibble: 6 x 5
#> id name value date area
#> <dbl> <chr> <chr> <chr> <dbl>
#> 1 733 level_closet_1 C6 2013.06.18 65.2
#> 2 733 level_closet_2 C3 2013.06.18 65.2
#> 3 537 level_closet_1 C3 2015.10.01 34.5
#> 4 537 level_closet_2 T11 2015.15.01 27.2
#> 5 909 level_closet_1 T4 2016-01-14 25.9
#> 6 909 level_closet_2 T2 2016-01-14 25.9
df1 %>%
select(id, level_closet_1, venti_1) %>%
bind_rows(df1 %>%
select(id, level_closet_2, venti_2)) -> df_venti
t(apply(df_venti, 1, function(x) c(x[!is.na(x)], x[is.na(x)]))) -> df_venti[]
df_venti
#> # A tibble: 6 x 5
#> id level_closet_1 venti_1 level_closet_2 venti_2
#> <chr> <chr> <chr> <chr> <chr>
#> 1 733 C6 " 0" <NA> <NA>
#> 2 537 C3 " 0" <NA> <NA>
#> 3 909 T4 " 1" <NA> <NA>
#> 4 733 C3 " 1" <NA> <NA>
#> 5 537 T11 " 0" <NA> <NA>
#> 6 909 T2 " 1" <NA> <NA>
df_venti %>%
select(1:3) %>%
rename(value = level_closet_1,
venti = venti_1) %>%
mutate(venti = venti %>% as.numeric(),
id = id %>% as.numeric()) -> venti_df2
venti_df2
#> # A tibble: 6 x 3
#> id value venti
#> <dbl> <chr> <dbl>
#> 1 733 C6 0
#> 2 537 C3 0
#> 3 909 T4 1
#> 4 733 C3 1
#> 5 537 T11 0
#> 6 909 T2 1
ct_df %>%
left_join(venti_df2) -> df_with_venti
#> Joining, by = c("id", "value")
df_with_venti
#> # A tibble: 6 x 6
#> id name value date area venti
#> <dbl> <chr> <chr> <chr> <dbl> <dbl>
#> 1 733 level_closet_1 C6 2013.06.18 65.2 0
#> 2 733 level_closet_2 C3 2013.06.18 65.2 1
#> 3 537 level_closet_1 C3 2015.10.01 34.5 0
#> 4 537 level_closet_2 T11 2015.15.01 27.2 0
#> 5 909 level_closet_1 T4 2016-01-14 25.9 1
#> 6 909 level_closet_2 T2 2016-01-14 25.9 1
df_with_venti %>%
mutate(value = value %>% str_remove_all('[0-9]+')) %>%
mutate(mm = 1) %>%
complete(id, value, fill = list(mm = 0)) %>%
group_by(id, value) %>%
summarise(count = sum(mm)) %>%
filter(count == 0) -> missing_df
#> `summarise()` regrouping output by 'id' (override with `.groups` argument)
missing_df
#> # A tibble: 2 x 3
#> # Groups: id [2]
#> id value count
#> <dbl> <chr> <dbl>
#> 1 733 T 0
#> 2 909 C 0
missing_df %>%
filter(value == "C") %>%
pull(id) -> c_missing
c_missing
#> [1] 909
missing_df %>%
filter(value == "T") %>%
pull(id) -> t_missing
t_missing
#> [1] 733
df1 %>%
filter(id %in% c_missing) %>%
select(id, c_date, c_area) %>%
rename(date = c_date,
area = c_area) %>%
mutate(ref_level = "C",
value = NA,
venti = NA) -> c_fill_df
c_fill_df
#> # A tibble: 1 x 6
#> id date area ref_level value venti
#> <dbl> <chr> <dbl> <chr> <lgl> <lgl>
#> 1 909 2016-01-14 15.1 C NA NA
df1 %>%
filter(id %in% t_missing) %>%
select(id, t_date, t_area) %>%
rename(date = t_date,
area = t_area) %>%
mutate(ref_level = "T",
value = NA,
venti = NA) -> t_fill_df
t_fill_df
#> # A tibble: 1 x 6
#> id date area ref_level value venti
#> <dbl> <chr> <dbl> <chr> <lgl> <lgl>
#> 1 733 2013.06.18 42.1 T NA NA
df_with_venti %>%
select(id, date, area, value, venti) %>%
mutate(ref_level = value %>% str_remove_all('[0-9]+')) %>%
bind_rows(c_fill_df) %>%
bind_rows(t_fill_df) %>%
group_by(id) %>%
mutate(index = row_number()) %>%
arrange(id) %>%
select(id, index, date, ref_level, area, value, venti) %>%
rename(level_closet = value)
#> # A tibble: 8 x 7
#> # Groups: id [3]
#> id index date ref_level area level_closet venti
#> <dbl> <int> <chr> <chr> <dbl> <chr> <dbl>
#> 1 537 1 2015.10.01 C 34.5 C3 0
#> 2 537 2 2015.15.01 T 27.2 T11 0
#> 3 733 1 2013.06.18 C 65.2 C6 0
#> 4 733 2 2013.06.18 C 65.2 C3 1
#> 5 733 3 2013.06.18 T 42.1 <NA> NA
#> 6 909 1 2016-01-14 T 25.9 T4 1
#> 7 909 2 2016-01-14 T 25.9 T2 1
#> 8 909 3 2016-01-14 C 15.1 <NA> NA
Created on 2021-01-22 by the reprex package (v0.3.0)