Search code examples
rdataframetibblecoalescing

Fill the NAs values in a dataframe based on calculation of columns in other dataframe


I have 2 dataframes. The RP and the RP2. My goal is to fill the NAs of RP columns Country_1 and Country_2 based on the conditions below:

1.If a cell of RP has already value leave it as it is.

2.If it has NA then see the TreatmenArea and Subarea of that cell and find the same pair of TreatmenArea, Subarea in RP2 then:

a.multiply this value with the PFaverage column of RP2 and divide by 100.

b.if the relative cell in RP2 is NA as well then leave the value in RP as NA

RP<-structure(list(ProductFamily = c("PF_1", "PF_10", "PF_10", "PF_100", 
"PF_100", "PF_101", "PF_102", "PF_102", "PF_102", "PF_102", "PF_103", 
"PF_103", "PF_104", "PF_105", "PF_106", "PF_106", "PF_106", "PF_106", 
"PF_107", "PF_108", "PF_109", "PF_11", "PF_110", "PF_110", "PF_111", 
"PF_111", "PF_111", "PF_112", "PF_112", "PF_113", "PF_113", "PF_113", 
"PF_114", "PF_115", "PF_115", "PF_116", "PF_117", "PF_118", "PF_119", 
"PF_12", "PF_12", "PF_12", "PF_120", "PF_120", "PF_120", "PF_120", 
"PF_120", "PF_120", "PF_121", "PF_122"), TreatmenArea = c("TA_7", 
"TA_2", "TA_2", "TA_6", "TA_6", "TA_2", "TA_2", "TA_2", "TA_2", 
"TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", 
"TA_2", "TA_2", "TA_2", "TA_2", "TA_7", "TA_2", "TA_2", "TA_7", 
"TA_7", "TA_7", "TA_7", "TA_7", "TA_2", "TA_2", "TA_2", "TA_6", 
"TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_5", "TA_5", 
"TA_5", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_4", 
"TA_4"), Subarea = c("SA_16", "SA_5", "SA_5", "SA_15", "SA_15", 
"SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", 
"SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", 
"SA_17", "SA_6", "SA_6", "SA_22", "SA_22", "SA_22", "SA_22", 
"SA_22", "SA_6", "SA_6", "SA_6", "SA_15", "SA_6", "SA_6", "SA_6", 
"SA_6", "SA_6", "SA_6", "SA_11", "SA_11", "SA_11", "SA_6", "SA_6", 
"SA_6", "SA_6", "SA_6", "SA_6", "SA_10", "SA_10"), Country_1 = c(NA, 
NA, NA, NA, NA, 92.6961421759861, NA, NA, NA, NA, 78.3001808318264,NA, NA, NA, 106.832963501416, 0.613496932515337, 104.21011973735, 
NA, NA, NA, NA, NA, 99.5238622522423, NA, NA, NA, NA, NA, NA, 
89.0343347639485, NA, NA, NA, NA, NA, NA, 101.231684009344, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Country_2 = c(NA, 
NA, 161.55950752394, NA, NA, NA, NA, NA, 59.1346153846154, NA, 
NA, NA, NA, 128.113063407181, 93.3812839543959, NA, NA, NA, 137.724550898204, 
NA, NA, NA, NA, 90.1602849510241, 37.3939722071828, NA, NA, 40.9756097560976, 
NA, NA, NA, 87.0095902353967, NA, NA, NA, 50.4591590140164, 92.1639413888299, 
44.7601588756493, NA, NA, NA, NA, NA, 100.053835800808, NA, NA, 
NA, NA, 136.420722135008, NA)), row.names = c(NA, 50L), class = "data.frame")

and the RP2

RP2<-structure(list(TreatmenArea = c("TA_1", "TA_1", "TA_1", "TA_1", 
"TA_2", "TA_2", "TA_2", "TA_3", "TA_4", "TA_4", "TA_5", "TA_5", 
"TA_5", "TA_6", "TA_6", "TA_7", "TA_7", "TA_7", "TA_7", "TA_7", 
"TA_7", "TA_7", "TA_7", "TA_8", "TA_9"), Subarea = c("SA_1", 
"SA_2", "SA_3", "SA_4", "SA_5", "SA_6", "SA_7", "SA_8", "SA_10", 
"SA_9", "SA_11", "SA_12", "SA_13", "SA_14", "SA_15", "SA_16", 
"SA_17", "SA_18", "SA_19", "SA_20", "SA_21", "SA_22", "SA_23", 
"SA_24", "SA_25"), Country_1 = c(101.37519256645, 105.268942332558, 
100.49933368058, 104.531597221684, NaN, 83.4404308144341, 86.2833044714836, 
81.808967345926, 79.6786979951661, 77.6863475527052, NaN, 78.3001808318264, 
112.499238782021, 113.526674294436, NaN, 108.350959378962, NaN, 
NaN, 102.243471199266, NaN, 104.323270355678, NaN, NaN, NaN, 
100), Country_2 = c(98.7267717862572, 83.9572019653478, 97.164068306148, 
103.654771613923, 161.55950752394, 75.4091957339533, 96.5255996196344, 
99.8317785594128, 88.1477193135348, NaN, NaN, NaN, 151.411687458963, 
107.652477161141, NaN, NaN, NaN, NaN, 92.4695215620261, NaN, 
88.9890571623243, 39.1847909816402, 87.0642912470953, NaN, NaN
), PFaverage = c(9.09293100169062, 16.2821052631579, 9.76688333333333, 
13.4754047619048, 17.8741666666667, 9.42546567085954, 12.6145188492064, 
12.4536666666667, 7.36169471153846, 13.1581818181818, 21.0866666666667, 
9.58, 16.2525049019608, 11.552822039072, 12.6908333333333, 20.5489611111111, 
11.99, 7.19134920634921, 8.51728472222222, 5.97, 12.2264183501684,6.37925, 16.1375, 11.9384615384615, 15.9185714285714)), class = c("data.frame"), row.names = c(NA, -25L)) 

If for example we check first row of Country_1 in RP we see that its NA and its Subarea is SA_16 so we move to RP2 and check that the relative value for Country_1 and Subarea SA_16 is 108.35. So the calculation is 108.35*20.54(PFaverage)/100.


Solution

  • library(dplyr)
    
    RP2 %>% 
      mutate_at(c("Country_1", "Country_2"), 
                list(PFa = ~. * PFaverage / 100)) %>% 
      select(TreatmenArea, Subarea, contains("PFa")) %>% 
      right_join(RP, . , by = c("TreatmenArea" , "Subarea")) %>% 
      mutate(Country_1 = coalesce(Country_1, Country_1_PFa),
             Country_2 = coalesce(Country_2, Country_2_PFa)) %>% 
      select(-contains("PFa")) 
    
    #>    ProductFamily TreatmenArea Subarea   Country_1  Country_2
    #> 1           PF_1         TA_7   SA_16  22.2649965        NaN
    #> 2          PF_10         TA_2    SA_5         NaN  28.877416
    #> 3          PF_10         TA_2    SA_5         NaN 161.559508
    #> 4         PF_100         TA_6   SA_15         NaN        NaN
    #> 5         PF_100         TA_6   SA_15         NaN        NaN
    #> 6         PF_101         TA_2    SA_6  92.6961422   7.107668
    #> 7         PF_102         TA_2    SA_6   7.8646492   7.107668
    #> 8         PF_102         TA_2    SA_6   7.8646492   7.107668
    #> 9         PF_102         TA_2    SA_6   7.8646492  59.134615
    #> 10        PF_102         TA_2    SA_6   7.8646492   7.107668
    ...
    

    Created on 2022-04-06 by the reprex package (v2.0.0)