I'm struggling to create a function that makes ANOVA analysis for a set of dataframes returning different number of objects to the global environment depending on wheter I want to split between the positive/negative trend values (4 objects expected) or not (2 objects expected).
My data is like this:
set.seed(88)
A <- data.frame(ID = 1:100, na.cnt.2 = round(rnorm(n=100,mean = 24.50, sd=5.877722),0),
trend = rnorm(100, mean = -0.0029446, sd=0.004951971),
p.val= rnorm(100, mean = 0.1983439, sd=0.2747593))
B <- data.frame(ID = 1:100, na.cnt.2 = round(rnorm(n=100,mean = 22.40, sd=6.180722),0 ),
trend = rnorm(100, mean = -0.0030070, sd=0.005016312),
p.val= rnorm(100, mean = 0.1657485, sd=0.3297854))
This is my main code:
anova_subgroups <- function(mp= 0.5, p_val = 0.05, split.sample=FALSE){
n <- 0
for (df in list(A, B)){
for (name in c("pos", "neg")){
index <- ifelse(n==0,"A","B")
letter <- "Myletter"
inner_sample.n <- df %>% mutate(mpe = na.cnt.2/49) %>%
{if (split.sample==TRUE) {if(name =="pos") dplyr::filter(mpe<mp, p.val<p_val, trend>0)
else dplyr::filter(mpe<mp, p.val<p_val, trend<0)}
else dplyr::filter(mpe<mp, p.val<p_val)} %>%
nrow()
alpha = 0.05
degrees.freedom = inner_sample.n - 1
t.score = qt(p=alpha/2, df=degrees.freedom, lower.tail = F)
inner_mean <- df %>% mutate(mpe = na.cnt.2/49) %>%
{if (split.sample==TRUE) {if(name =="pos")
dplyr::filter(mpe<mp, p.val<p_val, trend>0)
else dplyr::filter(mpe<mp, p.val<p_val, trend<0)}
else dplyr::filter(mpe<mp, p.val<p_val)} %>%
summarise(mean(trend)) %>% as.numeric()
inner_sd <- df %>% mutate(mpe = na.cnt.2/49)%>%
{if (split.sample==TRUE) {if(name =="pos")
dplyr::filter(mpe<mp, p.val<p_val, trend>0)
else dplyr::filter(mpe<mp, p.val<p_val, trend<0)}
else dplyr::filter(mpe<mp, p.val<p_val)} %>%
summarise(sd(trend)/sqrt(inner_sample.n)) %>% as.numeric()
margin.error <- t.score * inner_sd
lower.bound <- inner_mean - margin.error
upper.bound <- inner_mean + margin.error
CI <- c(lower.bound, upper.bound)
data <- df %>% mutate(mpe = na.cnt.2/49) %>%
{if (split.sample==TRUE) {if(name =="pos")
dplyr::filter(mpe<mp, p.val<p_val, trend>0)
else dplyr::filter(mpe<mp, p.val<p_val, trend<0)}
else dplyr::filter(mpe<mp, p.val<p_val)}
if (split.sample== FALSE){
assign(paste0(index,"_", letter, "_CI"), CI, envir = globalenv())
assign(paste0(index,"_", letter, "_MEAN"), inner_mean, envir = globalenv())
assign(paste0(index,"_", letter, "_DF"), data, envir = globalenv())
break
} else {
assign(paste0(name,"_",index,"_",letter, "_CI"), CI,
envir = globalenv())
assign(paste0(name,"_", index, "_", letter, "_MEAN"), inner_mean,
envir = globalenv())
assign(paste0(name,"_", index, "_", letter, "_DF"), data,
envir = globalenv())
}
}
n <- n+1
}
}
I'm getting the following error when using the anova_subgroups()
function:
Error in dplyr::filter(mpe < mp, p.val < p_val) : object 'mpe' not found
I tried to solve the problem using this approach. That's why in the code you can find the dplyr:: filter()
form. Also I'm using this approach to make the conditionals for the filter section.
I even tried to debug the problem by myself making a small version of the logic in the code:
a <- matrix(-10:10, ncol = 1) %>% as.data.frame()
names(a) <- "v"
split <- TRUE
for(df in c("A", "B")){
for (name in c("pos","neg")){
a %>% mutate(new= v/49) %>% {if (split==TRUE) {if(name=="pos")
filter(., new>0)
else filter(., new<0)}
else filter(., new==10)}
if (split == FALSE){
print(paste0(df, a))
break
} else{ print(paste0(df, name, a))
}
Getting the following output:
[1] "Apos-10:10"
[1] "Aneg-10:10"
[1] "Bpos-10:10"
[1] "Bneg-10:10"
I thought that my problem was related to the combination of both dplyr::filter()
and the {if (something) else}
statements. However it seems not to be the case. I really can't understand where the problem relies.
Can anyone explain and guide me to a solution?
I found my problem.
I was needing to to add a .
inside the filter()
statement
as following:
data <- df %>% mutate(mpe = na.cnt.2/49) %>%
{if (split.sample==TRUE) {if(name =="pos")
dplyr::filter(., mpe<mp, p.val<p_val, trend>0)
else dplyr::filter(., mpe<mp, p.val<p_val, trend<0)}
else dplyr::filter(., mpe<mp, p.val<p_val)}
I think that adding this point is to specify to the filter function that look for the variable inside the main data being used (df in this case).