Search code examples
rdataframefor-loopdplyrgroup-by

How to calculate percetage of value within each group across multiple columns using for-loop


Let's say I have

   PG.UniProtIds                                 scloc mean_row_R80080 mean_row_M80080 mean_row_E40040 mean_row_Sr80080
1         Q9UHI8                         Extracellular        15.56592              NA              NA         15.07543
2         P48643                             Cytoplasm        14.32417        18.77053        18.24379               NA
3         P00734                         Extracellular        25.51741        24.65632        25.91294         25.15333
4         P60201                         Cell membrane              NA        13.63354              NA               NA
5  P01036;P01037                                  <NA>              NA        11.88245              NA               NA
6         P38646                         Mitochondrion              NA        17.28343              NA               NA
7         Q9BVK6 Endoplasmic reticulum|Golgi apparatus              NA        15.23676              NA               NA
8         P13686                         Extracellular        13.39949        13.99092              NA               NA
9         P31939                             Cytoplasm        15.64904        16.90990        16.51679               NA
10        Q8IUX7                         Extracellular        17.85344        21.22067        20.26607         16.21366

df$scloc consist of many different characters.

I want to group_by(scloc) and calculate how many percetage the sum of these groups comprise of the entire sum within each column: dfsc$mean_row_R80080, dfsc$mean_row_M80080, dfsc$mean_row_E40040, dfsc$mean_row_sr80080.

I want to summarise these information in a new dataframe. I have started this for loop, but I cannot get it to work:

library(dplyr)
df_ <- NULL

for (i in c("R", "M", "E", "Sr")) {
  j <- ifelse(i == "E", "400", "800")
  k <- ifelse(i == "E", "40", "80")
  
  dfdata <- dfsc %>%
    select(contains("PG"), matches(paste0("^mean_row_", i, j, k, "$")), scloc)
  
    dfdata %>%  
    group_by(scloc) %>%
    summarise(sum = sum(c_across(where(is.numeric)), na.rm = TRUE)) %>% #tried many combinations from here
    mutate(frak = i)
  
  if (is.null(df_)) {
    df_ <- dfdata
  } else {
    df_ <- full_join(df_, dfdata)
  }
}

Expected output:

                                                 scloc frak perc #should sum to 1 with each "frak"
1                                            Cytoplasm    R 0.15
2                                        Extracellular    R 0.05
3         Extracellular|Cell membrane|Lysosome/Vacuole    R 0.10
4                                        Cell membrane    R 0.08
5 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    R 0.12
6                                              Nucleus    R 0.07
7                                    Cytoplasm|Nucleus    R 0.10
8                                                   NA    R 0.09
9                              Cytoplasm|Cell membrane    R 0.24

My data:

dfsc <- 
structure(list(PG.UniProtIds = c("Q14195", "Q6ZTR7", "P52907", 
"P06310", "Q9P2P6", "Q8TCZ2", "P55011", "P04180", "O75351", "Q9Y3Z3", 
"Q04323", "O00560;Q8IVU3;Q9H190;Q9UII4", "P19652", "O75173", 
"Q5JY77", "P35268", "O43491", "P01111;P01116", "Q8WYK1", "Q5JZY3"
), scloc = c("Cytoplasm", "Cytoplasm", "Cytoplasm", "Extracellular", 
"Cytoplasm", "Extracellular|Cell membrane|Lysosome/Vacuole", 
"Cell membrane", "Extracellular|Endoplasmic reticulum|Lysosome/Vacuole", 
"Cytoplasm", "Nucleus", "Cytoplasm|Nucleus", NA, "Extracellular", 
"Extracellular", "Cytoplasm", "Cytoplasm", "Cytoplasm|Cell membrane", 
NA, "Cell membrane", "Cell membrane"), mean_row_R80080 = c(19.6306684829868, 
NA, 17.2895161634528, 19.4250092205753, NA, 20.117751088096, 
NA, 20.5292180784938, NA, NA, NA, 16.2518076533184, 22.530625412224, 
16.3396081224665, 15.8306614261866, NA, 11.9174064165071, 11.865825570229, 
13.2777622148836, 13.4016030819706), mean_row_M80080 = c(21.9301990460461, 
NA, 18.0559671254489, 16.3279816747423, NA, 14.41753373341, 16.3655313271373, 
18.3355440802636, NA, 16.6044098066449, NA, NA, 16.112024925506, 
13.3846412620776, NA, 17.7952994131055, 18.6626293542089, NA, 
NA, NA), mean_row_E40040 = c(21.3779843457729, 12.6939238702458, 
18.6188367607403, 20.4974200360208, 10.7600476279889, 14.6783675991137, 
NA, 19.7946878082238, 14.9857897533375, 16.5373969851768, 12.8042679773528, 
NA, 18.8525408958313, 12.4521848366364, NA, 15.7339178984839, 
17.9755941567014, NA, NA, NA), mean_row_Sr80080 = c(19.1049545779682, 
NA, NA, NA, NA, 19.6827289816135, NA, 20.3132062790538, NA, NA, 
NA, NA, 23.085168299633, 16.4188217290661, NA, NA, 9.24995136945289, 
NA, 15.3831902811252, NA)), class = "data.frame", row.names = c(NA, 
-20L))

Solution

  • I can't exactly reproduce your expected output, so I'm assuming I may have one of the steps wrong. I would do this by pivoting the data longer and then working with it in long form to make the sums and percentages. You can pull out the letter you're looping over with a regular expression: "mean_row_([A-za-z]*)\\d*" which takes all letters between the underscore after row and the digits at the end of the string. I summed the values and then made them into proportions within each frak group across all the scloc groups.

    library(dplyr)
    library(tidyr)
    dfsc <- 
    structure(list(PG.UniProtIds = c("Q14195", "Q6ZTR7", "P52907", 
    "P06310", "Q9P2P6", "Q8TCZ2", "P55011", "P04180", "O75351", "Q9Y3Z3", 
    "Q04323", "O00560;Q8IVU3;Q9H190;Q9UII4", "P19652", "O75173", 
    "Q5JY77", "P35268", "O43491", "P01111;P01116", "Q8WYK1", "Q5JZY3"
    ), scloc = c("Cytoplasm", "Cytoplasm", "Cytoplasm", "Extracellular", 
    "Cytoplasm", "Extracellular|Cell membrane|Lysosome/Vacuole", 
    "Cell membrane", "Extracellular|Endoplasmic reticulum|Lysosome/Vacuole", 
    "Cytoplasm", "Nucleus", "Cytoplasm|Nucleus", NA, "Extracellular", 
    "Extracellular", "Cytoplasm", "Cytoplasm", "Cytoplasm|Cell membrane", 
    NA, "Cell membrane", "Cell membrane"), mean_row_R80080 = c(19.6306684829868, 
    NA, 17.2895161634528, 19.4250092205753, NA, 20.117751088096, 
    NA, 20.5292180784938, NA, NA, NA, 16.2518076533184, 22.530625412224, 
    16.3396081224665, 15.8306614261866, NA, 11.9174064165071, 11.865825570229, 
    13.2777622148836, 13.4016030819706), mean_row_M80080 = c(21.9301990460461, 
    NA, 18.0559671254489, 16.3279816747423, NA, 14.41753373341, 16.3655313271373, 
    18.3355440802636, NA, 16.6044098066449, NA, NA, 16.112024925506, 
    13.3846412620776, NA, 17.7952994131055, 18.6626293542089, NA, 
    NA, NA), mean_row_E40040 = c(21.3779843457729, 12.6939238702458, 
    18.6188367607403, 20.4974200360208, 10.7600476279889, 14.6783675991137, 
    NA, 19.7946878082238, 14.9857897533375, 16.5373969851768, 12.8042679773528, 
    NA, 18.8525408958313, 12.4521848366364, NA, 15.7339178984839, 
    17.9755941567014, NA, NA, NA), mean_row_Sr80080 = c(19.1049545779682, 
    NA, NA, NA, NA, 19.6827289816135, NA, 20.3132062790538, NA, NA, 
    NA, NA, 23.085168299633, 16.4188217290661, NA, NA, 9.24995136945289, 
    NA, 15.3831902811252, NA)), class = "data.frame", row.names = c(NA, 
    -20L))
    
    dfsc %>% 
      pivot_longer(contains("mean_row"), names_pattern="mean_row_([A-za-z]*)\\d*", 
                   names_to="frak", values_to = "vals") %>%
      group_by(scloc, frak) %>% 
      summarise(vals = sum(vals, na.rm=TRUE)) %>% 
      group_by(frak) %>%
      mutate(perc = vals/sum(vals)) %>% 
      arrange(frak, desc(perc)) %>%
      select(-vals) %>% 
      as.data.frame()
    #> `summarise()` has grouped output by 'scloc'. You can override using the
    #> `.groups` argument.
    #>                                                   scloc frak       perc
    #> 1                                             Cytoplasm    E 0.41345836
    #> 2                                         Extracellular    E 0.22743885
    #> 3  Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    E 0.08690916
    #> 4                               Cytoplasm|Cell membrane    E 0.07892238
    #> 5                                               Nucleus    E 0.07260793
    #> 6          Extracellular|Cell membrane|Lysosome/Vacuole    E 0.06444581
    #> 7                                     Cytoplasm|Nucleus    E 0.05621752
    #> 8                                         Cell membrane    E 0.00000000
    #> 9                                                  <NA>    E 0.00000000
    #> 10                                            Cytoplasm    M 0.30736169
    #> 11                                        Extracellular    M 0.24375881
    #> 12                              Cytoplasm|Cell membrane    M 0.09927366
    #> 13 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    M 0.09753376
    #> 14                                              Nucleus    M 0.08832520
    #> 15                                        Cell membrane    M 0.08705451
    #> 16         Extracellular|Cell membrane|Lysosome/Vacuole    M 0.07669237
    #> 17                                    Cytoplasm|Nucleus    M 0.00000000
    #> 18                                                 <NA>    M 0.00000000
    #> 19                                        Extracellular    R 0.26691049
    #> 20                                            Cytoplasm    R 0.24152492
    #> 21                                                 <NA>    R 0.12873934
    #> 22                                        Cell membrane    R 0.12215409
    #> 23 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    R 0.09399504
    #> 24         Extracellular|Cell membrane|Lysosome/Vacuole    R 0.09211110
    #> 25                              Cytoplasm|Cell membrane    R 0.05456501
    #> 26                                    Cytoplasm|Nucleus    R 0.00000000
    #> 27                                              Nucleus    R 0.00000000
    #> 28                                        Extracellular   Sr 0.32055034
    #> 29 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole   Sr 0.16482905
    #> 30         Extracellular|Cell membrane|Lysosome/Vacuole   Sr 0.15971312
    #> 31                                            Cytoplasm   Sr 0.15502484
    #> 32                                        Cell membrane   Sr 0.12482503
    #> 33                              Cytoplasm|Cell membrane   Sr 0.07505761
    #> 34                                    Cytoplasm|Nucleus   Sr 0.00000000
    #> 35                                              Nucleus   Sr 0.00000000
    #> 36                                                 <NA>   Sr 0.00000000
    

    Edit: counting occurrences of scloc.

    dfsc %>% 
      pivot_longer(contains("mean_row"), names_pattern="mean_row_([A-za-z]*)\\d*", 
                   names_to="frak", values_to = "vals") %>%
      group_by(scloc, frak) %>% 
      summarise(vals = n()) %>% 
      group_by(frak) %>%
      mutate(perc = vals/sum(vals)) %>% 
      arrange(frak, desc(perc)) %>%
      select(-vals) %>% 
      as.data.frame()
    #> `summarise()` has grouped output by 'scloc'. You can override using the
    #> `.groups` argument.
    #>                                                   scloc frak perc
    #> 1                                             Cytoplasm    E 0.35
    #> 2                                         Cell membrane    E 0.15
    #> 3                                         Extracellular    E 0.15
    #> 4                                                  <NA>    E 0.10
    #> 5                               Cytoplasm|Cell membrane    E 0.05
    #> 6                                     Cytoplasm|Nucleus    E 0.05
    #> 7          Extracellular|Cell membrane|Lysosome/Vacuole    E 0.05
    #> 8  Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    E 0.05
    #> 9                                               Nucleus    E 0.05
    #> 10                                            Cytoplasm    M 0.35
    #> 11                                        Cell membrane    M 0.15
    #> 12                                        Extracellular    M 0.15
    #> 13                                                 <NA>    M 0.10
    #> 14                              Cytoplasm|Cell membrane    M 0.05
    #> 15                                    Cytoplasm|Nucleus    M 0.05
    #> 16         Extracellular|Cell membrane|Lysosome/Vacuole    M 0.05
    #> 17 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    M 0.05
    #> 18                                              Nucleus    M 0.05
    #> 19                                            Cytoplasm    R 0.35
    #> 20                                        Cell membrane    R 0.15
    #> 21                                        Extracellular    R 0.15
    #> 22                                                 <NA>    R 0.10
    #> 23                              Cytoplasm|Cell membrane    R 0.05
    #> 24                                    Cytoplasm|Nucleus    R 0.05
    #> 25         Extracellular|Cell membrane|Lysosome/Vacuole    R 0.05
    #> 26 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole    R 0.05
    #> 27                                              Nucleus    R 0.05
    #> 28                                            Cytoplasm   Sr 0.35
    #> 29                                        Cell membrane   Sr 0.15
    #> 30                                        Extracellular   Sr 0.15
    #> 31                                                 <NA>   Sr 0.10
    #> 32                              Cytoplasm|Cell membrane   Sr 0.05
    #> 33                                    Cytoplasm|Nucleus   Sr 0.05
    #> 34         Extracellular|Cell membrane|Lysosome/Vacuole   Sr 0.05
    #> 35 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole   Sr 0.05
    #> 36                                              Nucleus   Sr 0.05
    

    Created on 2023-07-04 with reprex v2.0.2