sumsqwithgrp <- 0
ybar_j_mean <- tapply(mtcars$mpg, mtcars$cyl, mean)
mtcars_ordered <- mtcars[order(mtcars$cyl), ]
count <- 1
# Number of levels.
m <- length(ybar_j_mean)
for(j in 1:m) {
# The corresponding 'cyl'-value of 'ybar_j_mean',
# i.e. the value between '*':
# > ybar_j_mean
# *4* 6 8
# 26.66364 19.74286 15.10000
ybar_j_index <- as.numeric(names(ybar_j_mean)[j])
# Relevant subset of observations at 'j' level.
mtcars_j <- mtcars[mtcars$cyl==ybar_j_index, ]
# The number of observations of the subset.
nj <- nrow(mtcars_j)
sum_j <- 0
for(i in 1:nj) {
sum_j <- sum_j + (mtcars_j[i, "mpg"] - ybar_j_mean[j])^2
}
sumsqwithgrp <- sumsqwithgrp + sum_j
}
and
sumsqbtwgrp <- 0
total_mean <- mean(mtcars$mpg)
count <- 1
for(j in 1:m) {
ybar_j_index <- as.numeric(names(ybar_j_mean)[j])
mtcars_j <- mtcars[mtcars$cyl==ybar_j_index, ]
nj <- nrow(mtcars_j)
sumsqbtwgrp <- sumsqbtwgrp + nj*(ybar_j_mean[j] - total_mean)^2
}
Now:
> df_1 <- m-1
> df_1 <- nrow(mtcars)-m
> (sumsqbtwgrp/df_1)/(sumsqwithgrp/df_2)
39.69752
But:
> summary(aov(mpg~cyl, data=mtcars))
Df Sum Sq Mean Sq F value Pr(>F)
cyl 1 817.7 817.7 79.56 6.11e-10 ***
Residuals 30 308.3 10.3
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
$39.7 \neq 79.6$.
What am I doing wrong?
You need to set cyl as a factor:
summary(aov(mpg~as.factor(cyl), data=mtcars))
See here for more: https://stats.stackexchange.com/questions/112552/how-to-correct-the-degrees-of-freedom