Search code examples
ranova

Manual calculation of F-Statistic in R: mtcars mpg~cyl ANOVA


enter image description here

sumsqwithgrp   <- 0
ybar_j_mean    <- tapply(mtcars$mpg, mtcars$cyl, mean)
mtcars_ordered <- mtcars[order(mtcars$cyl), ]
count          <- 1

# Number of levels.
m <- length(ybar_j_mean)

for(j in 1:m) {

    # The corresponding 'cyl'-value of 'ybar_j_mean',
    # i.e. the value between '*':
    # > ybar_j_mean
    #       *4*        6        8 
    #  26.66364 19.74286 15.10000 
    ybar_j_index <- as.numeric(names(ybar_j_mean)[j])

    # Relevant subset of observations at 'j' level.
    mtcars_j     <- mtcars[mtcars$cyl==ybar_j_index, ]

    # The number of observations of the subset.
    nj           <- nrow(mtcars_j)

    sum_j <- 0
    for(i in 1:nj) {
        sum_j <- sum_j + (mtcars_j[i, "mpg"] - ybar_j_mean[j])^2 
    }
    sumsqwithgrp <- sumsqwithgrp + sum_j
} 

and

sumsqbtwgrp <- 0
total_mean  <- mean(mtcars$mpg)
count <- 1

for(j in 1:m) {
    ybar_j_index <- as.numeric(names(ybar_j_mean)[j])
    mtcars_j     <- mtcars[mtcars$cyl==ybar_j_index, ]
    nj           <- nrow(mtcars_j)

    sumsqbtwgrp  <- sumsqbtwgrp + nj*(ybar_j_mean[j] - total_mean)^2
}

Now:

> df_1 <- m-1
> df_1 <- nrow(mtcars)-m
> (sumsqbtwgrp/df_1)/(sumsqwithgrp/df_2)
39.69752 

But:

> summary(aov(mpg~cyl, data=mtcars))
            Df Sum Sq Mean Sq F value   Pr(>F)    
cyl          1  817.7   817.7   79.56 6.11e-10 ***
Residuals   30  308.3    10.3                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

$39.7 \neq 79.6$.

What am I doing wrong?


Solution

  • You need to set cyl as a factor:

    summary(aov(mpg~as.factor(cyl), data=mtcars))
    

    See here for more: https://stats.stackexchange.com/questions/112552/how-to-correct-the-degrees-of-freedom