I have `df1` below. I would like to systematically calculate `wilcox.test` p-values to test whether `variable` is significantly higher/lower per color, as defined in the `color1` and `color2` columns.
I would like to test this for all samples, and per group as defined in the `group` column.
I am hoping to create a new `data.frame` with the results, including the number of samples per group (`n`). The anticipated result is shown in `df2` below. Note, though, that the p-values in `df2` are made up as examples only.
# Example input: 30 samples in three groups (a: 16, b: 6, c: 8), each with an
# integer measurement (`variable`) and two two-level colour labels.
# The literal vectors below are laid out one group per row: a, then b, then c.
df1 <- data.frame(
  sample = 1:30,
  group = rep(c("a", "b", "c"), times = c(16L, 6L, 8L)),
  variable = c(
    5L, 2L, 4L, 4L, 1L, 3L, 3L, 5L, 1L, 7L, 13L, 9L, 4L, 4L, 3L, 12L,
    0L, 11L, 1L, 3L, 0L, 4L,
    5L, 2L, 6L, 4L, 6L, 7L, 5L, 3L
  ),
  color1 = c(
    "black", "white", "white", "black", "black", "white", "white", "black",
    "black", "black", "black", "black", "white", "white", "black", "white",
    "black", "white", "black", "white", "black", "white",
    "white", "white", "black", "white", "black", "black", "white", "black"
  ),
  color2 = c(
    "red", "blue", "blue", "blue", "red", "blue", "blue", "red",
    "blue", "red", "red", "blue", "red", "red", "red", "blue",
    "blue", "red", "blue", "red", "red", "blue",
    "red", "red", "red", "blue", "red", "blue", "blue", "blue"
  ),
  stringsAsFactors = FALSE
)
# Shape of the desired result: one row for all samples plus one per group.
# The p-values here are placeholders, not computed from df1.
df2 <- data.frame(
  group       = c("all", "a", "b", "c"),
  n           = c(30L, 16L, 6L, 8L),
  color1_pval = c(0.0485, 0.9641, 0.0832, 0.3882),
  color2_pval = c(0.6727, 0.4121, 0.1282, 0.4344),
  stringsAsFactors = FALSE
)
You can try:
library(dplyr)

# For each level of `group` in `data`, return the number of samples (n) and
# the Wilcoxon rank-sum p-values comparing `variable` between the two levels
# of `color1` (white vs black) and of `color2` (blue vs red).
# NOTE: `variable` contains tied values, so wilcox.test() warns that it cannot
# compute exact p-values and reports the normal-approximation p-value instead.
pvals_by_group <- function(data) {
  data %>%
    group_by(group) %>%
    summarise(
      n = n(),
      color1_pval = wilcox.test(variable[color1 == "white"],
                                variable[color1 == "black"])$p.value,
      color2_pval = wilcox.test(variable[color2 == "blue"],
                                variable[color2 == "red"])$p.value
    )
}

# The question asks for the test over all samples as well as per group.
# Relabelling every row as "all" lets the same helper produce that row;
# it is placed first to match the layout of the expected result.
bind_rows(
  df1 %>% mutate(group = "all") %>% pvals_by_group(),
  df1 %>% pvals_by_group()
)
# Per-group rows of the result are shown below; in the combined output they
# are preceded by the "all" row (n = 30), whose p-values are not shown here.
# group n color1_pval color2_pval
# <chr> <int> <dbl> <dbl>
#1 a 16 0.556 0.457
#2 b 6 0.0765 0.825
#3 c 8 0.189 1