I'm looking to scale values out of 100% based on the sample. My df contains the col Project
that I'd like to group the col n
by to normalize those values of n
on a scale to 100%.
library("dplyr")
ex %>%
dplyr::group_by(Project) %>%
scale(n) -> perc
For my provided data, I'd expect the output perc
to be
Project | reaction | norm |
---|---|---|
Ga0598239 | arsenate-reduction | 0.0312 |
Ga0598239 | carbon-fixation | 0.0033 |
et cetera.
> dput(ex)
structure(list(Project = c("Ga0598239", "Ga0598239", "Ga0598239",
"Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239",
"Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239",
"Ga0598239", "Ga0598240", "Ga0598240", "Ga0598240", "Ga0598240",
"Ga0598240", "Ga0598240"), reaction = c("arsenate-reduction",
"carbon-fixation", "formaldehyde-oxidation", "halogenated-compounds-breakdown",
"hydrogen-oxidation", "iron-oxidation", "iron-reduction", "manganese-oxidation",
"methanol-oxidation", "selenate-reduction", "sulfide-oxidation",
"sulfite-reduction", "sulfur-oxidation", "thiosulfate-disproportionation",
"arsenate-reduction", "carbon-fixation", "formaldehyde-oxidation",
"halogenated-compounds-breakdown", "hydrogen-oxidation", "iron-oxidation"
), n = c(103L, 11L, 157L, 90L, 2296L, 85L, 33L, 156L, 17L, 38L,
8L, 9L, 259L, 13L, 90L, 21L, 202L, 81L, 2090L, 73L)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L), groups = structure(list(
Project = c("Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239",
"Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239",
"Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239",
"Ga0598240", "Ga0598240", "Ga0598240", "Ga0598240", "Ga0598240",
"Ga0598240"), reaction = c("arsenate-reduction", "carbon-fixation",
"formaldehyde-oxidation", "halogenated-compounds-breakdown",
"hydrogen-oxidation", "iron-oxidation", "iron-reduction",
"manganese-oxidation", "methanol-oxidation", "selenate-reduction",
"sulfide-oxidation", "sulfite-reduction", "sulfur-oxidation",
"thiosulfate-disproportionation", "arsenate-reduction", "carbon-fixation",
"formaldehyde-oxidation", "halogenated-compounds-breakdown",
"hydrogen-oxidation", "iron-oxidation"), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))
You want this:
Note your provided data is grouped. So first ungroup
!
library(dplyr) #> 1.1.0
df %>%
ungroup() %>%
mutate(norm = (n / sum(n)), .by=Project)
Project reaction n norm
<chr> <chr> <int> <dbl>
1 Ga0598239 arsenate-reduction 103 0.0315
2 Ga0598239 carbon-fixation 11 0.00336
3 Ga0598239 formaldehyde-oxidation 157 0.0479
4 Ga0598239 halogenated-compounds-breakdown 90 0.0275
5 Ga0598239 hydrogen-oxidation 2296 0.701
6 Ga0598239 iron-oxidation 85 0.0260
7 Ga0598239 iron-reduction 33 0.0101
8 Ga0598239 manganese-oxidation 156 0.0476
9 Ga0598239 methanol-oxidation 17 0.00519
10 Ga0598239 selenate-reduction 38 0.0116
11 Ga0598239 sulfide-oxidation 8 0.00244
12 Ga0598239 sulfite-reduction 9 0.00275
13 Ga0598239 sulfur-oxidation 259 0.0791
14 Ga0598239 thiosulfate-disproportionation 13 0.00397
15 Ga0598240 arsenate-reduction 90 0.0352
16 Ga0598240 carbon-fixation 21 0.00821
17 Ga0598240 formaldehyde-oxidation 202 0.0790
18 Ga0598240 halogenated-compounds-breakdown 81 0.0317
19 Ga0598240 hydrogen-oxidation 2090 0.817
20 Ga0598240 iron-oxidation 73 0.0285