Consider dat1
created here:
set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
State = rep(c("NY","MA","FL","GA"), each = 10),
Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200))
dat1
has measurements for 5 variables, and observations (ID
s) can be grouped according to 3 grouping variables: Loc
, State
, and Region
I am having to perform various tasks on each response variable/grouping variable combination, so I have been writing functions to make it easier, and keep my analysis tidy. I am using the rstatix
package to do several operations. The following function will conduct a Kruskal Wallis test on the data I specify, calculate the effect size efsz
and return the results in a single data frame res
:
library(rstatix)
KruskTest <- function(dat, groupvar, var){
kt <- dat%>%kruskal_test(get(var) ~ get(groupvar))
efsz <- dat%>%kruskal_effsize(get(var) ~ get(groupvar))
res <<- cbind(kt, efsz[,3:5])
res[1,1] <<- var
res$groupvar <<- groupvar
res <<- res[,c(10,1:9)]
}
KruskTest(dat=dat1, groupvar = "Region", var = "var1")
Now I can use that function to loop over each response variable and get the results for a grouping variable (example shows it for Region
) in a single data frame, which is what I need:
vars <- paste(names(dat1[,5:9]))
a <- data.frame()
for(i in vars){
KruskTest(dat=dat1, groupvar="Region", var= i)
a <- rbind(a, res)
}
That works great for the Kruskal Wallis test, now I want to make a very similar function that will do a duns test, but watch what happens:
dunn <- function(dat, groupvar, var){
res <<- dat%>%rstatix::dunn_test(get(var) ~ get(groupvar), p.adjust.method = "bonferroni")
}
dunn(dat=dat1, groupvar="Region", var = "var1")
r:Error: Can't extract columns that don't exist. x The column `get(groupvar)` doesn't exist.
Outside of a user-written function, you specify data for the dunn_test()
and kruskal_test()
the exact same way. So what is the difference between specifying variables in these two funcitons, and why does the first one work but not the second?
Taking into account @Gregor 's comments about not writing to the environment and trying to clean up some other rough edges's I have a proposed improvement although Gregor is correct your biggest problem was nothing but a typo.
library(rstatix)
library(purrr)
# rewritten to avoid writing to environment
NewKruskTest <- function(dat, groupvar, var) {
kt <- dat %>% kruskal_test(as.formula(paste(var, "~", groupvar)))
efsz <- dat %>% kruskal_effsize(as.formula(paste(var, "~", groupvar)))
results <- cbind(kt, efsz[,3:5])
results$groupvar <- groupvar
results <- results[,c(10,1:9)]
return(results)
}
# works on a single if you want to test
# NewKruskTest(dat = dat1, groupvar = "Region", var = "var1")
# No paste needed
vars <- names(dat1[,5:9])
# NewKruskTest will work in your existing for loop but you
# may find `purrr:map_dfr` cleaner
map_dfr(vars, ~ NewKruskTest(dat = dat1, groupvar = "Region", var = .))
#> groupvar .y. n statistic df p method effsize method.1
#> 1 Region var1 200 3.0520896 1 0.0806 Kruskal-Wallis 0.010364089 eta2[H]
#> 2 Region var2 200 0.5961552 1 0.4400 Kruskal-Wallis -0.002039620 eta2[H]
#> 3 Region var3 200 1.6330090 1 0.2010 Kruskal-Wallis 0.003197015 eta2[H]
#> 4 Region var4 200 3.4031343 1 0.0651 Kruskal-Wallis 0.012137042 eta2[H]
#> 5 Region var5 200 0.7230090 1 0.3950 Kruskal-Wallis -0.001398945 eta2[H]
#> magnitude
#> 1 small
#> 2 small
#> 3 small
#> 4 small
#> 5 small
# NewDunn rewritten
NewDunn <- function(dat, groupvar, var) {
results <- dat %>% rstatix::dunn_test(as.formula(paste(var, "~", groupvar)),
p.adjust.method = "bonferroni")
results$groupvar <- groupvar
results <- results[,c(10,1:9)]
return(results)
}
# works on a single if you want to test
# NewDunn(dat=dat1, groupvar ="Region", var = "var1")
map_dfr(vars, ~ NewDunn(dat = dat1, groupvar = "Region", var = .))
#> # A tibble: 5 x 10
#> groupvar .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
#> <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
#> 1 Region var1 r1 r2 100 100 -1.75 0.0806 0.0806 ns
#> 2 Region var2 r1 r2 100 100 -0.772 0.440 0.440 ns
#> 3 Region var3 r1 r2 100 100 -1.28 0.201 0.201 ns
#> 4 Region var4 r1 r2 100 100 1.84 0.0651 0.0651 ns
#> 5 Region var5 r1 r2 100 100 -0.850 0.395 0.395 ns
based on your data
set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
State = rep(c("NY","MA","FL","GA"), each = 10),
Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200))