This is a dput()sample of my data:
structure(list(ID = c("101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000",
"1279", "1392", "534", "835", "910", "748", "589", "675", "500",
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856",
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225",
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439",
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379",
"774", "442", "1113", "1503"), DATE = structure(c(1497315600,
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200,
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800,
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400,
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200,
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400,
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600,
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200,
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400,
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
NR = c("CH-1000", " CH-1279", "CH-1392",
"CH-0534", "CH-0835", " CH-0910", "CH-0748",
"CH-0589", "CH-0675", "CH-0500", "CH-1243",
"CH-0635", "CH-1181", "CH-0791", "CH-0755",
"CH-1069", "CH-0640", "CH-1229", "CH-1856",
"CH-0116", "CH-0767", "CH-1126", "CH-0863",
"CH-1141", "CH-1858", "CH-0899", "CH-0005",
"CH-0225", "CH-0175", "CH-1764", "CH-1017",
"CH-0497", "CH-0771", "CH-0041", "CH-0816",
"CH-1046", "CH-0439", "CH-0930", "CH-1350",
"CH-0641", "CH-1057", "CH-1021", "CH-0503",
"CH-0553", "CH-1738", "CH-1379", "CH-0774",
"CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000",
"101-1279", "101-1392", "101-534", "101-835", "101-910",
"101-748", "101-589", "101-675", "101-500", "101-1243", "101-635",
"101-1181", "101-791", "101-755", "101-1069", "101-640",
"101-1229", "101-1856", "101-116", "101-767", "101-1126",
"101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225",
"101-175", "101-1764", "101-1017", "101-497", "101-771",
"101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350",
"101-641", "101-1057", "101-1021", "101-503", "101-553",
"101-1738", "101-1379", "101-774", "101-442", "101-1113",
"101-1503"), INT1 = c(NA, NA, NA, 280035, 280040, NA,
280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040,
NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005,
240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040,
NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA,
NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA,
NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA,
NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA,
280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA,
NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040,
NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050,
220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
What I needed to do was
first, to recode some values from INT1:INT4 and INTX1:INTX4 and put them in new columns. To this aim, I used long_pivot as below:
longDATA <- DATA %>%
pivot_longer(cols = c('INT1':'INTX4'),
names_to = "INT", values_to = "Code")
Then I used the long list to mutate new variables as below:
longDATA1 <- longDATA %>% mutate(palm = case_when(Code == 210025 ~ 1))
longDATA2 <- longDATA1 %>% mutate(bio = case_when(Code == 210015 ~ '12.06.25',Code == 210020 ~ '12.07.25',Code == 275015 ~ '12.06.25',Code == 275020 ~ '12.07.25'))
longDATA3 <- longDATA2 %>% mutate(EPX = case_when(Code == 280005 ~ 1, Code == 280010 ~ 1))
Finally, I needed to return it to a wide format. I used the code below:
WideDATA <- longDATA3 %>% pivot_wider(names_from = INT, values_from = Code)
Below you can see the output but it does not show the problem as I have a huge dataset. I realized the number of rows has changed from 2480 (my initial data before doing a longer pivot)to 2633 (the Wide data). I realized that the added rows are created when in one of the EPX, bio, or palm columns a value has been recorded. for example, if there is a row where INT1 and INT2 are filled with values other than NA (e.g., 280010 and 280040), EPX is assigned value 1, based on the mutation that we did earlier. However, EPX=1 and INT1=280010 appear in one row and 280040 appears in another row. like the photo below. [1]: https://i.sstatic.net/jjEea.png
I have spent a week figuring this out but no luck. I would highly appreciate your advice.
structure(list(ID = c("101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000",
"1279", "1392", "534", "835", "910", "748", "589", "675", "500",
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856",
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225",
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439",
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379",
"774", "442", "1113", "1503"), DATE = structure(c(1497315600,
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200,
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800,
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400,
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200,
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400,
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600,
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200,
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400,
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
NR = c("CH-1000", " CH-1279", "CH-1392",
"CH-0534", "CH-0835", " CH-0910", "CH-0748",
"CH-0589", "CH-0675", "CH-0500", "CH-1243",
"CH-0635", "CH-1181", "CH-0791", "CH-0755",
"CH-1069", "CH-0640", "CH-1229", "CH-1856",
"CH-0116", "CH-0767", "CH-1126", "CH-0863",
"CH-1141", "CH-1858", "CH-0899", "CH-0005",
"CH-0225", "CH-0175", "CH-1764", "CH-1017",
"CH-0497", "CH-0771", "CH-0041", "CH-0816",
"CH-1046", "CH-0439", "CH-0930", "CH-1350",
"CH-0641", "CH-1057", "CH-1021", "CH-0503",
"CH-0553", "CH-1738", "CH-1379", "CH-0774",
"CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000",
"101-1279", "101-1392", "101-534", "101-835", "101-910",
"101-748", "101-589", "101-675", "101-500", "101-1243", "101-635",
"101-1181", "101-791", "101-755", "101-1069", "101-640",
"101-1229", "101-1856", "101-116", "101-767", "101-1126",
"101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225",
"101-175", "101-1764", "101-1017", "101-497", "101-771",
"101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350",
"101-641", "101-1057", "101-1021", "101-503", "101-553",
"101-1738", "101-1379", "101-774", "101-442", "101-1113",
"101-1503"), palm = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), bio= c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), EPx = c(NA,
NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), INT1 = c(NA, NA, NA, 280035, 280040, NA,
280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040,
NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005,
240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040,
NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA,
NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA,
NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA,
NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA,
280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA,
NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040,
NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050,
220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
The tricky thing with the pivoting here is that if one row produces an NA
(e.g. if INT1
does not match the code) and another produces a value (INT3X
for example), when pivoting R recognises these as two distinct category rows and keeps them both.
Easier method would be to mutate
and test if any values are present without pivoting:
library(tidyverse)
df |>
mutate(
palm = case_when(if_any(INT1:INTX4, ~ .x == 210025) ~ 1),
bio = case_when(
if_any(INT1:INTX4, ~ .x == 210015) ~ '12.06.25',
if_any(INT1:INTX4, ~ .x == 210020) ~ '12.07.25',
if_any(INT1:INTX4, ~ .x == 275015) ~ '12.06.25',
if_any(INT1:INTX4, ~ .x == 275020) ~ '12.07.25'
),
EPX = case_when(
if_any(INT1:INTX4, ~ .x == 280005) ~ 1,
if_any(INT1:INTX4, ~ .x == 280010) ~ 1
)
)
#> # A tibble: 50 × 17
#> ID IDA DATE NR PAT INT1 INT2 INT3 INT4 INTX1 INTX2 INTX3
#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <lgl> <lgl> <dbl> <dbl> <lgl>
#> 1 101 1000 2017-06… "CH-… 101-… NA NA NA NA NA NA NA
#> 2 101 1279 2019-03… " CH… 101-… NA NA NA NA 280005 NA NA
#> 3 101 1392 2019-03… "CH-… 101-… NA NA NA NA 220035 NA NA
#> 4 101 534 2014-01… "CH-… 101-… 280035 NA NA NA NA NA NA
#> 5 101 835 2015-08… "CH-… 101-… 280040 NA NA NA NA NA NA
#> 6 101 910 2016-11… " CH… 101-… NA NA NA NA NA NA NA
#> 7 101 748 2016-02… "CH-… 101-… 280040 NA NA NA NA NA NA
#> 8 101 589 2014-06… "CH-… 101-… 280040 280040 NA NA NA NA NA
#> 9 101 675 2014-08… "CH-… 101-… 285030 NA NA NA NA NA NA
#> 10 101 500 2013-10… "CH-… 101-… 245040 NA NA NA NA NA NA
#> # … with 40 more rows, and 5 more variables: INTX4 <lgl>, KAT <dbl>,
#> # palm <dbl>, bio <chr>, EPX <dbl>