I'm trying to populate a column named "Total" in a table based on the values in three other columns: "Sub Family", "Size", and "Num1", "Num2", and "Num3".
Here's the logic:
The data frame I am working with is given below:
structure(list(Key1 = c("ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H",
"ADAM-5H", "EVE-1H", "EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H",
"ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H", "ADAM-5H", "EVE-1H",
"EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H"), Key2 = c("POP-1H", "POP-2H",
"POP-3H", "POP-4H", "POP-5H", "POP-1H", "POP-2H", "POP-3H", "POP-4H",
"POP-5H", "QOP-1H", "QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H", "QOP-1H",
"QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H"), Key3 = c("ABC-1H", "ABC-2H",
"ABC-3H", "ABC-4H", "ABC-5H", "XYZ-1H", "XYZ-2H", "XYZ-3H", "XYZ-4H",
"XYZ-5H", "PRQ-1H", "PRQ-2H", "PRQ-3H", "PRQ-4H", "PRQ-5H", "STV-1H",
"STV-2H", "STV-3H", "STV-4H", "STV-5H"), Name = c("ADAM", "ADAM",
"ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE", "ADAM",
"ADAM", "ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE"
), Family = c("POP", "POP", "POP", "POP", "POP", "POP", "POP",
"POP", "POP", "POP", "QOP", "QOP", "QOP", "QOP", "QOP", "QOP",
"QOP", "QOP", "QOP", "QOP"), `Sub Family` = c("ABC", "ABC", "ABC",
"ABC", "ABC", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "PRQ", "PRQ",
"PRQ", "PRQ", "PRQ", "STV", "STV", "STV", "STV", "STV"), Size = c("1H",
"2H", "3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H", "1H", "2H",
"3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H"), Num1 = c(NA,
NA, NA, NA, NA, 0.25, 0.25, 0.25, 0.25, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), Num2 = c(0.5, 0.2, 0.3, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA), Num3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2), Total = c(0.5, 0.2, 0.3, 0, 0, 0.25,
0.25, 0.25, 0.25, 0, 0.2, 0.2, 0.2, 0.2, 0.2, 1, 0, 0, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L))
The code I wrote is given below:
library(readxl)
library(dplyr)
Logic_For_Concolidation <- read_excel("C:/X/X/X/Logic For Concolidation.xlsx")
# Group by "Sub Family" and "Size", apply the function, and ungroup
df <- Logic_For_Concolidation %>%
group_by(`Sub Family`, Size) %>%
mutate(Total1 = coalesce(Num1,Num2,Num3)) %>%
ungroup()
The output I am getting is shown below:
structure(list(Key1 = c("ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H",
"ADAM-5H", "EVE-1H", "EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H",
"ADAM-1H", "ADAM-2H", "ADAM-3H", "ADAM-4H", "ADAM-5H", "EVE-1H",
"EVE-2H", "EVE-3H", "EVE-4H", "EVE-5H"), Key2 = c("POP-1H", "POP-2H",
"POP-3H", "POP-4H", "POP-5H", "POP-1H", "POP-2H", "POP-3H", "POP-4H",
"POP-5H", "QOP-1H", "QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H", "QOP-1H",
"QOP-2H", "QOP-3H", "QOP-4H", "QOP-5H"), Key3 = c("ABC-1H", "ABC-2H",
"ABC-3H", "ABC-4H", "ABC-5H", "XYZ-1H", "XYZ-2H", "XYZ-3H", "XYZ-4H",
"XYZ-5H", "PRQ-1H", "PRQ-2H", "PRQ-3H", "PRQ-4H", "PRQ-5H", "STV-1H",
"STV-2H", "STV-3H", "STV-4H", "STV-5H"), Name = c("ADAM", "ADAM",
"ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE", "ADAM",
"ADAM", "ADAM", "ADAM", "ADAM", "EVE", "EVE", "EVE", "EVE", "EVE"
), Family = c("POP", "POP", "POP", "POP", "POP", "POP", "POP",
"POP", "POP", "POP", "QOP", "QOP", "QOP", "QOP", "QOP", "QOP",
"QOP", "QOP", "QOP", "QOP"), `Sub Family` = c("ABC", "ABC", "ABC",
"ABC", "ABC", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "PRQ", "PRQ",
"PRQ", "PRQ", "PRQ", "STV", "STV", "STV", "STV", "STV"), Size = c("1H",
"2H", "3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H", "1H", "2H",
"3H", "4H", "5H", "1H", "2H", "3H", "4H", "5H"), Num1 = c(NA,
NA, NA, NA, NA, 0.25, 0.25, 0.25, 0.25, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), Num2 = c(0.5, 0.2, 0.3, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA), Num3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2), Total1 = c(0.5, 0.2, 0.3, NA, NA, 0.25,
0.25, 0.25, 0.25, NA, 0.2, 0.2, 0.2, 0.2, 0.2, 1, 0.2, 0.2, 0.2,
0.2)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"
))
The only issue that the output is only partially correct.
Attached is the snapshot of how the expected output is and how it should be.
As you can see its matching in most cases except the last one. In Sub Family "STV", the output should be 1, 0, 0, 0, 0 instead of 1,0.2,0.2,0.2,0.2. For 1H, 2H, 3H, 4H, 5H respectively.
Can this be done in R, and what would the efficient way of doing it
Please keep in mind this is a dummy data set, the real data set has 6 columns from Num1 to Num6. So please share you thought on this and let me know.
Detailed Explanation:
The expectation is that, I want to populate the column Total. The logic of population is quite complex, The column Sub Family and check the Size from 1H to 5H, to populate the column total we have to look at the 3 columns, Num1, Num2, Num3.
We look into the each value in ABC, and check first column called Num1. As you can see the value of Sub Family which is ABC and then we need to look at Size Column which is from 1H to 5H and see the column Num1, as you can see there is no value there and is fully blank, we move to the next column Num2.
And in Num2, we see in 1H we have 0.5 and 2H we have 0.2, 3H has 0.3 and 4H has blank and 5H has blank. So then we populate the Total column with 0.5,0.2,0.3,0,0.
List item Then we look into the next value in Sub Family which is XYZ and in the first column Num 1 itself we can see the value in 0.25, 0.25,0.25,0.25, Blank. And then we don’t look further and we populate the column Total with 0.25,0.25,0.25,0.25,0.
Then we look into the next value in Sub family which is PRQ and Num1 is empty and then we look into Num2 and which is empty and then we look into Num3 and we found 0.2,0.2,0.2,0.2,0.2. So we populate the same value in Total.
Finally we look into the next value of sub family which is STV and we find Num1 column empty and in Num2, we found 1, blank, blank, blank and blank. So we populate the column total with 1,0,0,0,0. Keep in mind only 1,0,0,0,0 should be populated here, and the next column Num3 which has 0.2,0.2,0.2,0.2,0.2 should be completely ignored since we got the value in Num2**
I think this should work. Iterating across the Num
variables replace NA
s with 0
if there are any non-NA
values by Family
and `Sub Family`
, then coalesce()
the modified Num
variables.
library(dplyr)
library(tidyr)
dat |>
mutate(Total1 = across(starts_with("Num"), ~ if (any(!is.na(.x))) replace_na(.x, 0) else .x), .by = c(Family, `Sub Family`)) |>
mutate(Total1 = do.call(coalesce, c(Total1, 0)))
# A tibble: 20 × 11
Key1 Key2 Key3 Name Family `Sub Family` Size Num1 Num2 Num3 Total1
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ADAM-1H POP-1H ABC-1H ADAM POP ABC 1H NA 0.5 NA 0.5
2 ADAM-2H POP-2H ABC-2H ADAM POP ABC 2H NA 0.2 NA 0.2
3 ADAM-3H POP-3H ABC-3H ADAM POP ABC 3H NA 0.3 NA 0.3
4 ADAM-4H POP-4H ABC-4H ADAM POP ABC 4H NA NA NA 0
5 ADAM-5H POP-5H ABC-5H ADAM POP ABC 5H NA NA NA 0
6 EVE-1H POP-1H XYZ-1H EVE POP XYZ 1H 0.25 NA NA 0.25
7 EVE-2H POP-2H XYZ-2H EVE POP XYZ 2H 0.25 NA NA 0.25
8 EVE-3H POP-3H XYZ-3H EVE POP XYZ 3H 0.25 NA NA 0.25
9 EVE-4H POP-4H XYZ-4H EVE POP XYZ 4H 0.25 NA NA 0.25
10 EVE-5H POP-5H XYZ-5H EVE POP XYZ 5H NA NA NA 0
11 ADAM-1H QOP-1H PRQ-1H ADAM QOP PRQ 1H NA NA 0.2 0.2
12 ADAM-2H QOP-2H PRQ-2H ADAM QOP PRQ 2H NA NA 0.2 0.2
13 ADAM-3H QOP-3H PRQ-3H ADAM QOP PRQ 3H NA NA 0.2 0.2
14 ADAM-4H QOP-4H PRQ-4H ADAM QOP PRQ 4H NA NA 0.2 0.2
15 ADAM-5H QOP-5H PRQ-5H ADAM QOP PRQ 5H NA NA 0.2 0.2
16 EVE-1H QOP-1H STV-1H EVE QOP STV 1H NA 1 0.2 1
17 EVE-2H QOP-2H STV-2H EVE QOP STV 2H NA NA 0.2 0
18 EVE-3H QOP-3H STV-3H EVE QOP STV 3H NA NA 0.2 0
19 EVE-4H QOP-4H STV-4H EVE QOP STV 4H NA NA 0.2 0
20 EVE-5H QOP-5H STV-5H EVE QOP STV 5H NA NA 0.2 0