Search code examples
dplyrnullpivotmutate

Why changing from pivot long to Pivot wide adds new rows to my data?


This is a dput()sample of my data:

structure(list(ID = c("101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000", 
"1279", "1392", "534", "835", "910", "748", "589", "675", "500", 
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856", 
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225", 
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439", 
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379", 
"774", "442", "1113", "1503"), DATE = structure(c(1497315600, 
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200, 
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800, 
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400, 
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200, 
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400, 
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600, 
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200, 
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400, 
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
    NR = c("CH-1000", " CH-1279", "CH-1392", 
    "CH-0534", "CH-0835", " CH-0910", "CH-0748", 
    "CH-0589", "CH-0675", "CH-0500", "CH-1243", 
    "CH-0635", "CH-1181", "CH-0791", "CH-0755", 
    "CH-1069", "CH-0640", "CH-1229", "CH-1856", 
    "CH-0116", "CH-0767", "CH-1126", "CH-0863", 
    "CH-1141", "CH-1858", "CH-0899", "CH-0005", 
    "CH-0225", "CH-0175", "CH-1764", "CH-1017", 
    "CH-0497", "CH-0771", "CH-0041", "CH-0816", 
    "CH-1046", "CH-0439", "CH-0930", "CH-1350", 
    "CH-0641", "CH-1057", "CH-1021", "CH-0503", 
    "CH-0553", "CH-1738", "CH-1379", "CH-0774", 
    "CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000", 
    "101-1279", "101-1392", "101-534", "101-835", "101-910", 
    "101-748", "101-589", "101-675", "101-500", "101-1243", "101-635", 
    "101-1181", "101-791", "101-755", "101-1069", "101-640", 
    "101-1229", "101-1856", "101-116", "101-767", "101-1126", 
    "101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225", 
    "101-175", "101-1764", "101-1017", "101-497", "101-771", 
    "101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350", 
    "101-641", "101-1057", "101-1021", "101-503", "101-553", 
    "101-1738", "101-1379", "101-774", "101-442", "101-1113", 
    "101-1503"), INT1 = c(NA, NA, NA, 280035, 280040, NA, 
    280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040, 
    NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005, 
    240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040, 
    NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA, 
    NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA, 
    NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA, 
    NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA, 
    280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA, 
    NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040, 
    NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050, 
    220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))

What I needed to do was
first, to recode some values from INT1:INT4 and INTX1:INTX4 and put them in new columns. To this aim, I used long_pivot as below:

    longDATA <- DATA %>% 
                  pivot_longer(cols = c('INT1':'INTX4'),
                  names_to = "INT", values_to = "Code")

Then I used the long list to mutate new variables as below:

    longDATA1 <- longDATA %>% mutate(palm = case_when(Code == 210025 ~ 1))


    longDATA2 <- longDATA1 %>% mutate(bio = case_when(Code == 210015 ~ '12.06.25',Code == 210020 ~ '12.07.25',Code == 275015 ~ '12.06.25',Code == 275020 ~ '12.07.25'))
                                                                 
    longDATA3 <- longDATA2 %>% mutate(EPX = case_when(Code == 280005 ~ 1, Code == 280010 ~ 1))

Finally, I needed to return it to a wide format. I used the code below:

    WideDATA <- longDATA3 %>% pivot_wider(names_from = INT, values_from = Code)

Below you can see the output but it does not show the problem as I have a huge dataset. I realized the number of rows has changed from 2480 (my initial data before doing a longer pivot)to 2633 (the Wide data). I realized that the added rows are created when in one of the EPX, bio, or palm columns a value has been recorded. for example, if there is a row where INT1 and INT2 are filled with values other than NA (e.g., 280010 and 280040), EPX is assigned value 1, based on the mutation that we did earlier. However, EPX=1 and INT1=280010 appear in one row and 280040 appears in another row. like the photo below. [1]: https://i.sstatic.net/jjEea.png

I have spent a week figuring this out but no luck. I would highly appreciate your advice.

structure(list(ID = c("101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101", "101", 
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000", 
"1279", "1392", "534", "835", "910", "748", "589", "675", "500", 
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856", 
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225", 
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439", 
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379", 
"774", "442", "1113", "1503"), DATE = structure(c(1497315600, 
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200, 
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800, 
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400, 
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200, 
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400, 
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600, 
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200, 
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400, 
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
    NR = c("CH-1000", " CH-1279", "CH-1392", 
    "CH-0534", "CH-0835", " CH-0910", "CH-0748", 
    "CH-0589", "CH-0675", "CH-0500", "CH-1243", 
    "CH-0635", "CH-1181", "CH-0791", "CH-0755", 
    "CH-1069", "CH-0640", "CH-1229", "CH-1856", 
    "CH-0116", "CH-0767", "CH-1126", "CH-0863", 
    "CH-1141", "CH-1858", "CH-0899", "CH-0005", 
    "CH-0225", "CH-0175", "CH-1764", "CH-1017", 
    "CH-0497", "CH-0771", "CH-0041", "CH-0816", 
    "CH-1046", "CH-0439", "CH-0930", "CH-1350", 
    "CH-0641", "CH-1057", "CH-1021", "CH-0503", 
    "CH-0553", "CH-1738", "CH-1379", "CH-0774", 
    "CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000", 
    "101-1279", "101-1392", "101-534", "101-835", "101-910", 
    "101-748", "101-589", "101-675", "101-500", "101-1243", "101-635", 
    "101-1181", "101-791", "101-755", "101-1069", "101-640", 
    "101-1229", "101-1856", "101-116", "101-767", "101-1126", 
    "101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225", 
    "101-175", "101-1764", "101-1017", "101-497", "101-771", 
    "101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350", 
    "101-641", "101-1057", "101-1021", "101-503", "101-553", 
    "101-1738", "101-1379", "101-774", "101-442", "101-1113", 
    "101-1503"), palm = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), bio= c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), EPx = c(NA, 
    NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA),  INT1 = c(NA, NA, NA, 280035, 280040, NA, 
    280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040, 
    NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005, 
    240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040, 
    NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA, 
    NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA, 
    NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA, 
    NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA, 
    280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA, 
    NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040, 
    NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050, 
    220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))

Solution

  • The tricky thing with the pivoting here is that if one row produces an NA (e.g. if INT1 does not match the code) and another produces a value (INT3X for example), when pivoting R recognises these as two distinct category rows and keeps them both.

    Easier method would be to mutate and test if any values are present without pivoting:

    library(tidyverse)
    
    df |>
      mutate(
        palm = case_when(if_any(INT1:INTX4, ~ .x == 210025) ~ 1),
        bio = case_when(
          if_any(INT1:INTX4, ~ .x == 210015) ~ '12.06.25',
          if_any(INT1:INTX4, ~ .x == 210020) ~ '12.07.25',
          if_any(INT1:INTX4, ~ .x == 275015) ~ '12.06.25',
          if_any(INT1:INTX4, ~ .x == 275020) ~ '12.07.25'
        ),
        EPX = case_when(
          if_any(INT1:INTX4, ~ .x == 280005) ~ 1,
          if_any(INT1:INTX4, ~ .x  == 280010) ~ 1
        )
      )
    #> # A tibble: 50 × 17
    #>    ID    IDA   DATE     NR    PAT     INT1   INT2 INT3  INT4   INTX1 INTX2 INTX3
    #>    <chr> <chr> <chr>    <chr> <chr>  <dbl>  <dbl> <lgl> <lgl>  <dbl> <dbl> <lgl>
    #>  1 101   1000  2017-06… "CH-… 101-…     NA     NA NA    NA        NA    NA NA   
    #>  2 101   1279  2019-03… " CH… 101-…     NA     NA NA    NA    280005    NA NA   
    #>  3 101   1392  2019-03… "CH-… 101-…     NA     NA NA    NA    220035    NA NA   
    #>  4 101   534   2014-01… "CH-… 101-… 280035     NA NA    NA        NA    NA NA   
    #>  5 101   835   2015-08… "CH-… 101-… 280040     NA NA    NA        NA    NA NA   
    #>  6 101   910   2016-11… " CH… 101-…     NA     NA NA    NA        NA    NA NA   
    #>  7 101   748   2016-02… "CH-… 101-… 280040     NA NA    NA        NA    NA NA   
    #>  8 101   589   2014-06… "CH-… 101-… 280040 280040 NA    NA        NA    NA NA   
    #>  9 101   675   2014-08… "CH-… 101-… 285030     NA NA    NA        NA    NA NA   
    #> 10 101   500   2013-10… "CH-… 101-… 245040     NA NA    NA        NA    NA NA   
    #> # … with 40 more rows, and 5 more variables: INTX4 <lgl>, KAT <dbl>,
    #> #   palm <dbl>, bio <chr>, EPX <dbl>