I'm trying to write a function that takes in a data frame. The df$x
column of the data frame contains exactly two distinct group labels (factor levels), and df$y
is a continuous random variable. This is what I have so far:
# Returns a named vector holding two p-values computed from `df`:
# `regression` (from the linear model y ~ x) and `t.test`.
# `df` must have columns `x` and `y`.
compare_tests = function(df) {
# Fit the linear model of y on x.
p5Model = lm(y ~ x, df)
# First entry of the Pr(>F) column of the ANOVA table for that model.
p5_Regression_P_Value = anova(p5Model)$'Pr(>F)'[1]
# Convert the x column to a factor.
p5_xFactorLevels = factor(df$x)
# NOTE(review): this hands t.test() only the first and the second element of
# the x factor; the y values are never passed, so this is not a comparison of
# y between the two groups. This is the step reported as not working.
p5_T_Test = t.test(p5_xFactorLevels[1], p5_xFactorLevels[2])
p5_T_Test_P_Value = p5_T_Test$p.value
# Combine both p-values into one named vector.
p5Vector = c(regression = p5_Regression_P_Value , t.test = p5_T_Test_P_Value)
return(p5Vector)
}
The regression p-value comes out right, but the t-test p-value does not.
For example, sim2 is:
# A tibble: 40 x 2
x y
<chr> <dbl>
1 a 1.94
2 a 1.18
3 a 1.24
4 a 2.62
5 a 1.11
6 a 0.866
7 a -0.910
8 a 0.721
9 a 0.687
10 a 2.07
11 b 8.07
12 b 7.36
13 b 7.95
14 b 7.75
15 b 8.44
16 b 10.8
17 b 8.05
18 b 8.58
19 b 8.12
20 b 6.09
21 c 6.86
22 c 5.76
23 c 5.79
24 c 6.02
25 c 6.03
26 c 6.55
27 c 3.73
28 c 8.68
29 c 5.64
30 c 6.21
31 d 3.07
32 d 1.33
33 d 3.11
34 d 1.75
35 d 0.822
36 d 1.02
37 d 3.07
38 d 2.13
39 d 2.49
40 d 0.301
For those who would rather look at the output of dput(sim2):
structure(list(x = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "c",
"c", "c", "c", "c", "c", "c", "c", "c", "c", "d", "d", "d", "d",
"d", "d", "d", "d", "d", "d"), y = c(1.93536318980109, 1.17648861056246,
1.2436854647462, 2.6235488834436, 1.11203808286976, 0.866002986937445,
-0.910087467722212, 0.720762758415155, 0.68655402174211, 2.06730787876151,
8.07003485029664, 7.36087667611434, 7.95003510095185, 7.74851655674979,
8.44479711579273, 10.7554175753369, 8.04653138044419, 8.57770906930663,
8.11819487440968, 6.0882795089718, 6.86208648183857, 5.75676326036652,
5.79391280521842, 6.01917759220915, 6.02956075431977, 6.54982754180169,
3.72588514310706, 8.68255718355635, 5.63877874450629, 6.21335574971003,
3.07434588225969, 1.33491175145449, 3.11395241896922, 1.75410358832085,
0.822436691056719, 1.02414938384014, 3.06505732002715, 2.13167063477289,
2.48862880920098, 0.300549432154306)), .Names = c("x", "y"), class =
c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -40L))
My function call:
# Keep only the rows whose x is 'a' or 'd', then pass that subset to
# compare_tests().
# NOTE(review): `%>%` and filter() are not defined in this snippet; presumably
# dplyr has to be attached first (library(dplyr)) -- confirm.
compare_tests(sim2 %>% filter(x %in% c('a', 'd')))
should return
regression t.test
0.1051552 0.1052173
Your function has a problem in the way it calls t.test.
The line p5_xFactorLevels = factor(df$x)
converts the column into a factor (OK, but not necessary). Then p5_T_Test = t.test(p5_xFactorLevels[1], p5_xFactorLevels[2])
incorrectly performs the t-test on the first two elements of the x column; the y values are never used.
The test should compare the y column between the groups given by the x column: p5_T_Test = t.test(df$y ~df$x)
#' Compare the regression p-value with the two-sample t-test p-value
#'
#' Fits `lm(y ~ x)` on `df` and takes the first `Pr(>F)` entry of its ANOVA
#' table, then runs `t.test()` (default settings) on `y` split by `x`.
#'
#' @param df A data frame with a grouping column `x` (two distinct values)
#'   and a numeric column `y`.
#' @return A named numeric vector with elements `regression` and `t.test`,
#'   each holding the corresponding p-value.
compare_tests <- function(df) {
  # p-value for the x term from the ANOVA table of the linear model.
  anova_table <- anova(lm(y ~ x, data = df))
  f_test_p <- anova_table[["Pr(>F)"]][1]

  # The formula interface compares y between the two groups defined by x.
  t_test_p <- t.test(y ~ x, data = df)$p.value

  c(regression = f_test_p, t.test = t_test_p)
}
# Rebuild the sim2 example data from its dput() output: a tibble-classed data
# frame with 40 rows, a character column x (values "a" to "d", ten of each)
# and a numeric column y.
sim2<-structure(list(x = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "c",
"c", "c", "c", "c", "c", "c", "c", "c", "c", "d", "d", "d", "d",
"d", "d", "d", "d", "d", "d"), y = c(1.93536318980109, 1.17648861056246,
1.2436854647462, 2.6235488834436, 1.11203808286976, 0.866002986937445,
-0.910087467722212, 0.720762758415155, 0.68655402174211, 2.06730787876151,
8.07003485029664, 7.36087667611434, 7.95003510095185, 7.74851655674979,
8.44479711579273, 10.7554175753369, 8.04653138044419, 8.57770906930663,
8.11819487440968, 6.0882795089718, 6.86208648183857, 5.75676326036652,
5.79391280521842, 6.01917759220915, 6.02956075431977, 6.54982754180169,
3.72588514310706, 8.68255718355635, 5.63877874450629, 6.21335574971003,
3.07434588225969, 1.33491175145449, 3.11395241896922, 1.75410358832085,
0.822436691056719, 1.02414938384014, 3.06505732002715, 2.13167063477289,
2.48862880920098, 0.300549432154306)), .Names = c("x", "y"), class =
c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -40L))
# Keep only the rows whose x is 'a' or 'd' so that two groups remain, then
# compute both p-values on that subset.
# NOTE(review): `%>%` and filter() are not defined in this snippet; presumably
# dplyr has to be attached first (library(dplyr)) -- confirm.
compare_tests(sim2 %>% filter(x %in% c('a', 'd')))