Search code examples
rdataframe

Populating a Column Based on First Come First Basis while Looking at Multiple Columns in R


I'm trying to populate a column named "Total" in a table based on the values in three other columns: "Sub Family", "Size", and "Num1", "Num2", and "Num3".

Here's the logic:

  1. Iterate through each row.
  2. Check the "Sub Family" value.
  3. For the current "Sub Family", iterate through the "Size" values from 1H to 5H.
  4. For each "Size", check the values in "Num1", "Num2", and "Num3" in order.
  5. If a value is found in any of these columns, populate the corresponding Column in the "Total" cell with that value and move to the next value.
  6. If no value is found for a "Size", populate the corresponding "Total" cell with 0.

The data frame I am working with is given below:

structure(list(Key1 = c("ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H", 
"ADAM-5H", "EVE-1H", "EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H", 
"ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H", "ADAM-5H", "EVE-1H", 
"EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H"), Key2 = c("POP-1H", "POP-2H", 
"POP-3H", "POP-4H", "POP-5H", "POP-1H", "POP-2H", "POP-3H", "POP-4H", 
"POP-5H", "QOP-1H", "QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H", "QOP-1H", 
"QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H"), Key3 = c("ABC-1H", "ABC-2H", 
"ABC-3H", "ABC-4H", "ABC-5H", "XYZ-1H", "XYZ-2H", "XYZ-3H", "XYZ-4H", 
"XYZ-5H", "PRQ-1H", "PRQ-2H", "PRQ-3H", "PRQ-4H", "PRQ-5H", "STV-1H", 
"STV-2H", "STV-3H", "STV-4H", "STV-5H"), Name = c("ADAM", "ADAM", 
"ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE", "ADAM", 
"ADAM", "ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE"
), Family = c("POP", "POP", "POP", "POP", "POP", "POP", "POP", 
"POP", "POP", "POP", "QOP", "QOP", "QOP", "QOP", "QOP", "QOP", 
"QOP", "QOP", "QOP", "QOP"), `Sub Family` = c("ABC", "ABC", "ABC", 
"ABC", "ABC", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "PRQ", "PRQ", 
"PRQ", "PRQ", "PRQ", "STV", "STV", "STV", "STV", "STV"), Size = c("1H", 
"2H", "3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H", "1H", "2H", 
"3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H"), Num1 = c(NA, 
NA, NA, NA, NA, 0.25, 0.25, 0.25, 0.25, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA), Num2 = c(0.5, 0.2, 0.3, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA), Num3 = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2, 0.2, 0.2, 0.2, 0.2, 
0.2, 0.2, 0.2, 0.2, 0.2), Total = c(0.5, 0.2, 0.3, 0, 0, 0.25, 
0.25, 0.25, 0.25, 0, 0.2, 0.2, 0.2, 0.2, 0.2, 1, 0, 0, 0, 0)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -20L))

The code I wrote is given below:

library(readxl)
library(dplyr)

Logic_For_Concolidation <- read_excel("C:/X/X/X/Logic For Concolidation.xlsx")


# Group by "Sub Family" and "Size", apply the function, and ungroup
df <- Logic_For_Concolidation %>%
  group_by(`Sub Family`, Size) %>%
  mutate(Total1 = coalesce(Num1,Num2,Num3)) %>%
  ungroup()

The output I am getting is shown below:

structure(list(Key1 = c("ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H", 
"ADAM-5H", "EVE-1H", "EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H", 
"ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H", "ADAM-5H", "EVE-1H", 
"EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H"), Key2 = c("POP-1H", "POP-2H", 
"POP-3H", "POP-4H", "POP-5H", "POP-1H", "POP-2H", "POP-3H", "POP-4H", 
"POP-5H", "QOP-1H", "QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H", "QOP-1H", 
"QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H"), Key3 = c("ABC-1H", "ABC-2H", 
"ABC-3H", "ABC-4H", "ABC-5H", "XYZ-1H", "XYZ-2H", "XYZ-3H", "XYZ-4H", 
"XYZ-5H", "PRQ-1H", "PRQ-2H", "PRQ-3H", "PRQ-4H", "PRQ-5H", "STV-1H", 
"STV-2H", "STV-3H", "STV-4H", "STV-5H"), Name = c("ADAM", "ADAM", 
"ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE", "ADAM", 
"ADAM", "ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE"
), Family = c("POP", "POP", "POP", "POP", "POP", "POP", "POP", 
"POP", "POP", "POP", "QOP", "QOP", "QOP", "QOP", "QOP", "QOP", 
"QOP", "QOP", "QOP", "QOP"), `Sub Family` = c("ABC", "ABC", "ABC", 
"ABC", "ABC", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "PRQ", "PRQ", 
"PRQ", "PRQ", "PRQ", "STV", "STV", "STV", "STV", "STV"), Size = c("1H", 
"2H", "3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H", "1H", "2H", 
"3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H"), Num1 = c(NA, 
NA, NA, NA, NA, 0.25, 0.25, 0.25, 0.25, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA), Num2 = c(0.5, 0.2, 0.3, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA), Num3 = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2, 0.2, 0.2, 0.2, 0.2, 
0.2, 0.2, 0.2, 0.2, 0.2), Total1 = c(0.5, 0.2, 0.3, NA, NA, 0.25, 
0.25, 0.25, 0.25, NA, 0.2, 0.2, 0.2, 0.2, 0.2, 1, 0.2, 0.2, 0.2, 
0.2)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"
))

The only issue that the output is only partially correct.

Attached is the snapshot of how the expected output is and how it should be.

enter image description here

As you can see its matching in most cases except the last one. In Sub Family "STV", the output should be 1, 0, 0, 0, 0 instead of 1,0.2,0.2,0.2,0.2. For 1H, 2H, 3H, 4H, 5H respectively.

Can this be done in R, and what would the efficient way of doing it

Please keep in mind this is a dummy data set, the real data set has 6 columns from Num1 to Num6. So please share you thought on this and let me know.

Detailed Explanation:

  1. The expectation is that, I want to populate the column Total. The logic of population is quite complex, The column Sub Family and check the Size from 1H to 5H, to populate the column total we have to look at the 3 columns, Num1, Num2, Num3.

    We look into the each value in ABC, and check first column called Num1. As you can see the value of Sub Family which is ABC and then we need to look at Size Column which is from 1H to 5H and see the column Num1, as you can see there is no value there and is fully blank, we move to the next column Num2.

    And in Num2, we see in 1H we have 0.5 and 2H we have 0.2, 3H has 0.3 and 4H has blank and 5H has blank. So then we populate the Total column with 0.5,0.2,0.3,0,0.

  2. List item Then we look into the next value in Sub Family which is XYZ and in the first column Num 1 itself we can see the value in 0.25, 0.25,0.25,0.25, Blank. And then we don’t look further and we populate the column Total with 0.25,0.25,0.25,0.25,0.

  3. Then we look into the next value in Sub family which is PRQ and Num1 is empty and then we look into Num2 and which is empty and then we look into Num3 and we found 0.2,0.2,0.2,0.2,0.2. So we populate the same value in Total.

  4. Finally we look into the next value of sub family which is STV and we find Num1 column empty and in Num2, we found 1, blank, blank, blank and blank. So we populate the column total with 1,0,0,0,0. Keep in mind only 1,0,0,0,0 should be populated here, and the next column Num3 which has 0.2,0.2,0.2,0.2,0.2 should be completely ignored since we got the value in Num2**


Solution

  • I think this should work. Iterating across the Num variables replace NAs with 0 if there are any non-NA values by Family and `Sub Family`, then coalesce() the modified Num variables.

    library(dplyr)
    library(tidyr)
    
    dat |> 
      mutate(Total1 = across(starts_with("Num"), ~ if (any(!is.na(.x))) replace_na(.x, 0) else .x), .by = c(Family, `Sub Family`)) |> 
      mutate(Total1 = do.call(coalesce, c(Total1, 0)))
    
    # A tibble: 20 × 11
       Key1    Key2   Key3   Name  Family `Sub Family` Size   Num1  Num2  Num3 Total1
       <chr>   <chr>  <chr>  <chr> <chr>  <chr>        <chr> <dbl> <dbl> <dbl>  <dbl>
     1 ADAM-1H POP-1H ABC-1H ADAM  POP    ABC          1H    NA      0.5  NA     0.5 
     2 ADAM-2H POP-2H ABC-2H ADAM  POP    ABC          2H    NA      0.2  NA     0.2 
     3 ADAM-3H POP-3H ABC-3H ADAM  POP    ABC          3H    NA      0.3  NA     0.3 
     4 ADAM-4H POP-4H ABC-4H ADAM  POP    ABC          4H    NA     NA    NA     0   
     5 ADAM-5H POP-5H ABC-5H ADAM  POP    ABC          5H    NA     NA    NA     0   
     6 EVE-1H  POP-1H XYZ-1H EVE   POP    XYZ          1H     0.25  NA    NA     0.25
     7 EVE-2H  POP-2H XYZ-2H EVE   POP    XYZ          2H     0.25  NA    NA     0.25
     8 EVE-3H  POP-3H XYZ-3H EVE   POP    XYZ          3H     0.25  NA    NA     0.25
     9 EVE-4H  POP-4H XYZ-4H EVE   POP    XYZ          4H     0.25  NA    NA     0.25
    10 EVE-5H  POP-5H XYZ-5H EVE   POP    XYZ          5H    NA     NA    NA     0   
    11 ADAM-1H QOP-1H PRQ-1H ADAM  QOP    PRQ          1H    NA     NA     0.2   0.2 
    12 ADAM-2H QOP-2H PRQ-2H ADAM  QOP    PRQ          2H    NA     NA     0.2   0.2 
    13 ADAM-3H QOP-3H PRQ-3H ADAM  QOP    PRQ          3H    NA     NA     0.2   0.2 
    14 ADAM-4H QOP-4H PRQ-4H ADAM  QOP    PRQ          4H    NA     NA     0.2   0.2 
    15 ADAM-5H QOP-5H PRQ-5H ADAM  QOP    PRQ          5H    NA     NA     0.2   0.2 
    16 EVE-1H  QOP-1H STV-1H EVE   QOP    STV          1H    NA      1     0.2   1   
    17 EVE-2H  QOP-2H STV-2H EVE   QOP    STV          2H    NA     NA     0.2   0   
    18 EVE-3H  QOP-3H STV-3H EVE   QOP    STV          3H    NA     NA     0.2   0   
    19 EVE-4H  QOP-4H STV-4H EVE   QOP    STV          4H    NA     NA     0.2   0   
    20 EVE-5H  QOP-5H STV-5H EVE   QOP    STV          5H    NA     NA     0.2   0