Let's say I have
PG.UniProtIds scloc mean_row_R80080 mean_row_M80080 mean_row_E40040 mean_row_Sr80080
1 Q9UHI8 Extracellular 15.56592 NA NA 15.07543
2 P48643 Cytoplasm 14.32417 18.77053 18.24379 NA
3 P00734 Extracellular 25.51741 24.65632 25.91294 25.15333
4 P60201 Cell membrane NA 13.63354 NA NA
5 P01036;P01037 <NA> NA 11.88245 NA NA
6 P38646 Mitochondrion NA 17.28343 NA NA
7 Q9BVK6 Endoplasmic reticulum|Golgi apparatus NA 15.23676 NA NA
8 P13686 Extracellular 13.39949 13.99092 NA NA
9 P31939 Cytoplasm 15.64904 16.90990 16.51679 NA
10 Q8IUX7 Extracellular 17.85344 21.22067 20.26607 16.21366
df$scloc
consist of many different characters.
I want to group_by(scloc)
and calculate how many percetage the sum of these groups comprise of the entire sum within each column: dfsc$mean_row_R80080
, dfsc$mean_row_M80080
, dfsc$mean_row_E40040
, dfsc$mean_row_sr80080
.
I want to summarise these information in a new dataframe. I have started this for loop
, but I cannot get it to work:
library(dplyr)
df_ <- NULL
for (i in c("R", "M", "E", "Sr")) {
j <- ifelse(i == "E", "400", "800")
k <- ifelse(i == "E", "40", "80")
dfdata <- dfsc %>%
select(contains("PG"), matches(paste0("^mean_row_", i, j, k, "$")), scloc)
dfdata %>%
group_by(scloc) %>%
summarise(sum = sum(c_across(where(is.numeric)), na.rm = TRUE)) %>% #tried many combinations from here
mutate(frak = i)
if (is.null(df_)) {
df_ <- dfdata
} else {
df_ <- full_join(df_, dfdata)
}
}
Expected output:
scloc frak perc #should sum to 1 with each "frak"
1 Cytoplasm R 0.15
2 Extracellular R 0.05
3 Extracellular|Cell membrane|Lysosome/Vacuole R 0.10
4 Cell membrane R 0.08
5 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole R 0.12
6 Nucleus R 0.07
7 Cytoplasm|Nucleus R 0.10
8 NA R 0.09
9 Cytoplasm|Cell membrane R 0.24
My data:
dfsc <-
structure(list(PG.UniProtIds = c("Q14195", "Q6ZTR7", "P52907",
"P06310", "Q9P2P6", "Q8TCZ2", "P55011", "P04180", "O75351", "Q9Y3Z3",
"Q04323", "O00560;Q8IVU3;Q9H190;Q9UII4", "P19652", "O75173",
"Q5JY77", "P35268", "O43491", "P01111;P01116", "Q8WYK1", "Q5JZY3"
), scloc = c("Cytoplasm", "Cytoplasm", "Cytoplasm", "Extracellular",
"Cytoplasm", "Extracellular|Cell membrane|Lysosome/Vacuole",
"Cell membrane", "Extracellular|Endoplasmic reticulum|Lysosome/Vacuole",
"Cytoplasm", "Nucleus", "Cytoplasm|Nucleus", NA, "Extracellular",
"Extracellular", "Cytoplasm", "Cytoplasm", "Cytoplasm|Cell membrane",
NA, "Cell membrane", "Cell membrane"), mean_row_R80080 = c(19.6306684829868,
NA, 17.2895161634528, 19.4250092205753, NA, 20.117751088096,
NA, 20.5292180784938, NA, NA, NA, 16.2518076533184, 22.530625412224,
16.3396081224665, 15.8306614261866, NA, 11.9174064165071, 11.865825570229,
13.2777622148836, 13.4016030819706), mean_row_M80080 = c(21.9301990460461,
NA, 18.0559671254489, 16.3279816747423, NA, 14.41753373341, 16.3655313271373,
18.3355440802636, NA, 16.6044098066449, NA, NA, 16.112024925506,
13.3846412620776, NA, 17.7952994131055, 18.6626293542089, NA,
NA, NA), mean_row_E40040 = c(21.3779843457729, 12.6939238702458,
18.6188367607403, 20.4974200360208, 10.7600476279889, 14.6783675991137,
NA, 19.7946878082238, 14.9857897533375, 16.5373969851768, 12.8042679773528,
NA, 18.8525408958313, 12.4521848366364, NA, 15.7339178984839,
17.9755941567014, NA, NA, NA), mean_row_Sr80080 = c(19.1049545779682,
NA, NA, NA, NA, 19.6827289816135, NA, 20.3132062790538, NA, NA,
NA, NA, 23.085168299633, 16.4188217290661, NA, NA, 9.24995136945289,
NA, 15.3831902811252, NA)), class = "data.frame", row.names = c(NA,
-20L))
I can't exactly reproduce your expected output, so I'm assuming I may have one of the steps wrong. I would do this by pivoting the data longer and then working with it in long form to make the sums and percentages. You can pull out the letter you're looping over with a regular expression: "mean_row_([A-za-z]*)\\d*"
which takes all letters between the underscore after row
and the digits at the end of the string. I summed the values and then made them into proportions within each frak
group across all the scloc
groups.
library(dplyr)
library(tidyr)
dfsc <-
structure(list(PG.UniProtIds = c("Q14195", "Q6ZTR7", "P52907",
"P06310", "Q9P2P6", "Q8TCZ2", "P55011", "P04180", "O75351", "Q9Y3Z3",
"Q04323", "O00560;Q8IVU3;Q9H190;Q9UII4", "P19652", "O75173",
"Q5JY77", "P35268", "O43491", "P01111;P01116", "Q8WYK1", "Q5JZY3"
), scloc = c("Cytoplasm", "Cytoplasm", "Cytoplasm", "Extracellular",
"Cytoplasm", "Extracellular|Cell membrane|Lysosome/Vacuole",
"Cell membrane", "Extracellular|Endoplasmic reticulum|Lysosome/Vacuole",
"Cytoplasm", "Nucleus", "Cytoplasm|Nucleus", NA, "Extracellular",
"Extracellular", "Cytoplasm", "Cytoplasm", "Cytoplasm|Cell membrane",
NA, "Cell membrane", "Cell membrane"), mean_row_R80080 = c(19.6306684829868,
NA, 17.2895161634528, 19.4250092205753, NA, 20.117751088096,
NA, 20.5292180784938, NA, NA, NA, 16.2518076533184, 22.530625412224,
16.3396081224665, 15.8306614261866, NA, 11.9174064165071, 11.865825570229,
13.2777622148836, 13.4016030819706), mean_row_M80080 = c(21.9301990460461,
NA, 18.0559671254489, 16.3279816747423, NA, 14.41753373341, 16.3655313271373,
18.3355440802636, NA, 16.6044098066449, NA, NA, 16.112024925506,
13.3846412620776, NA, 17.7952994131055, 18.6626293542089, NA,
NA, NA), mean_row_E40040 = c(21.3779843457729, 12.6939238702458,
18.6188367607403, 20.4974200360208, 10.7600476279889, 14.6783675991137,
NA, 19.7946878082238, 14.9857897533375, 16.5373969851768, 12.8042679773528,
NA, 18.8525408958313, 12.4521848366364, NA, 15.7339178984839,
17.9755941567014, NA, NA, NA), mean_row_Sr80080 = c(19.1049545779682,
NA, NA, NA, NA, 19.6827289816135, NA, 20.3132062790538, NA, NA,
NA, NA, 23.085168299633, 16.4188217290661, NA, NA, 9.24995136945289,
NA, 15.3831902811252, NA)), class = "data.frame", row.names = c(NA,
-20L))
dfsc %>%
pivot_longer(contains("mean_row"), names_pattern="mean_row_([A-za-z]*)\\d*",
names_to="frak", values_to = "vals") %>%
group_by(scloc, frak) %>%
summarise(vals = sum(vals, na.rm=TRUE)) %>%
group_by(frak) %>%
mutate(perc = vals/sum(vals)) %>%
arrange(frak, desc(perc)) %>%
select(-vals) %>%
as.data.frame()
#> `summarise()` has grouped output by 'scloc'. You can override using the
#> `.groups` argument.
#> scloc frak perc
#> 1 Cytoplasm E 0.41345836
#> 2 Extracellular E 0.22743885
#> 3 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole E 0.08690916
#> 4 Cytoplasm|Cell membrane E 0.07892238
#> 5 Nucleus E 0.07260793
#> 6 Extracellular|Cell membrane|Lysosome/Vacuole E 0.06444581
#> 7 Cytoplasm|Nucleus E 0.05621752
#> 8 Cell membrane E 0.00000000
#> 9 <NA> E 0.00000000
#> 10 Cytoplasm M 0.30736169
#> 11 Extracellular M 0.24375881
#> 12 Cytoplasm|Cell membrane M 0.09927366
#> 13 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole M 0.09753376
#> 14 Nucleus M 0.08832520
#> 15 Cell membrane M 0.08705451
#> 16 Extracellular|Cell membrane|Lysosome/Vacuole M 0.07669237
#> 17 Cytoplasm|Nucleus M 0.00000000
#> 18 <NA> M 0.00000000
#> 19 Extracellular R 0.26691049
#> 20 Cytoplasm R 0.24152492
#> 21 <NA> R 0.12873934
#> 22 Cell membrane R 0.12215409
#> 23 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole R 0.09399504
#> 24 Extracellular|Cell membrane|Lysosome/Vacuole R 0.09211110
#> 25 Cytoplasm|Cell membrane R 0.05456501
#> 26 Cytoplasm|Nucleus R 0.00000000
#> 27 Nucleus R 0.00000000
#> 28 Extracellular Sr 0.32055034
#> 29 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole Sr 0.16482905
#> 30 Extracellular|Cell membrane|Lysosome/Vacuole Sr 0.15971312
#> 31 Cytoplasm Sr 0.15502484
#> 32 Cell membrane Sr 0.12482503
#> 33 Cytoplasm|Cell membrane Sr 0.07505761
#> 34 Cytoplasm|Nucleus Sr 0.00000000
#> 35 Nucleus Sr 0.00000000
#> 36 <NA> Sr 0.00000000
scloc
.dfsc %>%
pivot_longer(contains("mean_row"), names_pattern="mean_row_([A-za-z]*)\\d*",
names_to="frak", values_to = "vals") %>%
group_by(scloc, frak) %>%
summarise(vals = n()) %>%
group_by(frak) %>%
mutate(perc = vals/sum(vals)) %>%
arrange(frak, desc(perc)) %>%
select(-vals) %>%
as.data.frame()
#> `summarise()` has grouped output by 'scloc'. You can override using the
#> `.groups` argument.
#> scloc frak perc
#> 1 Cytoplasm E 0.35
#> 2 Cell membrane E 0.15
#> 3 Extracellular E 0.15
#> 4 <NA> E 0.10
#> 5 Cytoplasm|Cell membrane E 0.05
#> 6 Cytoplasm|Nucleus E 0.05
#> 7 Extracellular|Cell membrane|Lysosome/Vacuole E 0.05
#> 8 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole E 0.05
#> 9 Nucleus E 0.05
#> 10 Cytoplasm M 0.35
#> 11 Cell membrane M 0.15
#> 12 Extracellular M 0.15
#> 13 <NA> M 0.10
#> 14 Cytoplasm|Cell membrane M 0.05
#> 15 Cytoplasm|Nucleus M 0.05
#> 16 Extracellular|Cell membrane|Lysosome/Vacuole M 0.05
#> 17 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole M 0.05
#> 18 Nucleus M 0.05
#> 19 Cytoplasm R 0.35
#> 20 Cell membrane R 0.15
#> 21 Extracellular R 0.15
#> 22 <NA> R 0.10
#> 23 Cytoplasm|Cell membrane R 0.05
#> 24 Cytoplasm|Nucleus R 0.05
#> 25 Extracellular|Cell membrane|Lysosome/Vacuole R 0.05
#> 26 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole R 0.05
#> 27 Nucleus R 0.05
#> 28 Cytoplasm Sr 0.35
#> 29 Cell membrane Sr 0.15
#> 30 Extracellular Sr 0.15
#> 31 <NA> Sr 0.10
#> 32 Cytoplasm|Cell membrane Sr 0.05
#> 33 Cytoplasm|Nucleus Sr 0.05
#> 34 Extracellular|Cell membrane|Lysosome/Vacuole Sr 0.05
#> 35 Extracellular|Endoplasmic reticulum|Lysosome/Vacuole Sr 0.05
#> 36 Nucleus Sr 0.05
Created on 2023-07-04 with reprex v2.0.2