Search code examples
r vector p-value t-test

Return regression p-value and t.test p-value of a data frame


I'm trying to write a function that takes in a data frame. The df$x column of the data frame consists of two factor levels. df$y is a continuous random variable. This is what I have so far:

# Intended to return two p-values for a data frame `df` with columns `x`
# (grouping variable) and `y` (numeric response): one from a linear
# regression of y on x and one from a t-test.
# Returns a named vector with elements `regression` and `t.test`.
compare_tests = function(df) {
    # Fit y ~ x and take the first entry of the ANOVA table's Pr(>F) column.
    p5Model = lm(y ~ x, df)
    p5_Regression_P_Value = anova(p5Model)$'Pr(>F)'[1]
    # Convert the x column to a factor.
    p5_xFactorLevels = factor(df$x)
    # NOTE(review): this passes the first and second *elements* of the x factor
    # to t.test(); the y values are never used here. This is the line the
    # question below is about.
    p5_T_Test = t.test(p5_xFactorLevels[1], p5_xFactorLevels[2])
    p5_T_Test_P_Value = p5_T_Test$p.value
    # Combine both p-values into a named vector.
    p5Vector = c(regression = p5_Regression_P_Value , t.test = p5_T_Test_P_Value)
    return(p5Vector)
}

My regression p-value works, but the t-test p-value does not.

For example, sim2 is:

# A tibble: 40 x 2
   x           y
   <chr>   <dbl>
 1 a       1.94 
 2 a       1.18 
 3 a       1.24 
 4 a       2.62 
 5 a       1.11 
 6 a       0.866
 7 a      -0.910
 8 a       0.721
 9 a       0.687
10 a       2.07 
11 b       8.07 
12 b       7.36 
13 b       7.95 
14 b       7.75 
15 b       8.44 
16 b      10.8  
17 b       8.05 
18 b       8.58 
19 b       8.12 
20 b       6.09 
21 c       6.86 
22 c       5.76 
23 c       5.79 
24 c       6.02 
25 c       6.03 
26 c       6.55 
27 c       3.73 
28 c       8.68 
29 c       5.64 
30 c       6.21 
31 d       3.07 
32 d       1.33 
33 d       3.11 
34 d       1.75 
35 d       0.822
36 d       1.02 
37 d       3.07   
38 d       2.13 
39 d       2.49 
40 d       0.301

For those who would rather look at the output of dput(sim2):

structure(list(x = c("a", "a", "a", "a", "a", "a", "a", "a", 
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "c", 
"c", "c", "c", "c", "c", "c", "c", "c", "c", "d", "d", "d", "d", 
"d", "d", "d", "d", "d", "d"), y = c(1.93536318980109, 1.17648861056246, 
1.2436854647462, 2.6235488834436, 1.11203808286976, 0.866002986937445, 
-0.910087467722212, 0.720762758415155, 0.68655402174211, 2.06730787876151, 
8.07003485029664, 7.36087667611434, 7.95003510095185, 7.74851655674979, 
8.44479711579273, 10.7554175753369, 8.04653138044419, 8.57770906930663, 
8.11819487440968, 6.0882795089718, 6.86208648183857, 5.75676326036652, 
5.79391280521842, 6.01917759220915, 6.02956075431977, 6.54982754180169, 
3.72588514310706, 8.68255718355635, 5.63877874450629, 6.21335574971003, 
3.07434588225969, 1.33491175145449, 3.11395241896922, 1.75410358832085, 
0.822436691056719, 1.02414938384014, 3.06505732002715, 2.13167063477289, 
2.48862880920098, 0.300549432154306)), .Names = c("x", "y"), class = 
c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -40L))

My function:

 # Keep only the rows whose x is 'a' or 'd', then pass them to compare_tests().
 # NOTE(review): %>% and filter() are presumably from dplyr, which is not
 # loaded anywhere in the visible snippet -- confirm it is attached.
 compare_tests(sim2 %>% filter(x %in% c('a', 'd')))

should return

regression     t.test 
 0.1051552  0.1052173

Solution

  • Your function has a problem with the t.test values.
    The line p5_xFactorLevels = factor(df$x) converts the x column into a factor (fine, but not necessary). The next line, p5_T_Test = t.test(p5_xFactorLevels[1], p5_xFactorLevels[2]), then incorrectly runs the t-test on the first two elements of the x column instead of on the y values of the two groups.

    The test should compare the y column between the two groups defined by the x column: p5_T_Test = t.test(df$y ~ df$x), or equivalently t.test(y ~ x, data = df).

    #' Compare the regression p-value with the two-sample t-test p-value
    #'
    #' Fits `lm(y ~ x)` and reads the p-value of the `x` term from the ANOVA
    #' table, then runs a two-sample `t.test()` of `y` grouped by `x`.
    #'
    #' @param df A data frame with a numeric column `y` and a grouping column
    #'   `x`. `t.test()` requires `x` to have exactly two groups.
    #' @return A named numeric vector with elements `regression` and `t.test`.
    compare_tests <- function(df) {
      # Fail early with a clear message instead of a formula-evaluation error.
      stopifnot(is.data.frame(df), all(c("x", "y") %in% names(df)))

      p5_model <- lm(y ~ x, data = df)
      # The first row of the ANOVA table belongs to the `x` term.
      p5_regression_p_value <- anova(p5_model)[["Pr(>F)"]][1]

      # Corrected test: compare the y values between the groups defined by x
      # (not elements of x with each other). t.test() keeps its default
      # var.equal = FALSE, so this p-value can differ slightly from the
      # regression p-value.
      p5_t_test_p_value <- t.test(y ~ x, data = df)$p.value

      c(regression = p5_regression_p_value, t.test = p5_t_test_p_value)
    }
    
    # %>% and filter() used below come from dplyr; attach it first, otherwise
    # %>% is undefined and filter() resolves to stats::filter().
    library(dplyr)

    # Example data: four groups ("a" to "d") with ten observations each.
    sim2 <- structure(
      list(
        x = rep(c("a", "b", "c", "d"), each = 10),
        y = c(
          1.93536318980109, 1.17648861056246, 1.2436854647462, 2.6235488834436,
          1.11203808286976, 0.866002986937445, -0.910087467722212,
          0.720762758415155, 0.68655402174211, 2.06730787876151,
          8.07003485029664, 7.36087667611434, 7.95003510095185, 7.74851655674979,
          8.44479711579273, 10.7554175753369, 8.04653138044419, 8.57770906930663,
          8.11819487440968, 6.0882795089718,
          6.86208648183857, 5.75676326036652, 5.79391280521842, 6.01917759220915,
          6.02956075431977, 6.54982754180169, 3.72588514310706, 8.68255718355635,
          5.63877874450629, 6.21335574971003,
          3.07434588225969, 1.33491175145449, 3.11395241896922, 1.75410358832085,
          0.822436691056719, 1.02414938384014, 3.06505732002715, 2.13167063477289,
          2.48862880920098, 0.300549432154306
        )
      ),
      names = c("x", "y"),
      class = c("tbl_df", "tbl", "data.frame"),
      row.names = c(NA, -40L)
    )

    # Keep only groups "a" and "d" so that x has exactly two levels.
    compare_tests(sim2 %>% filter(x %in% c("a", "d")))