I want to check for normality using Kolmogorov-Smirnov test (ks-test) and visualize it to support my presentation.
I used this default code
sample1 <- rnorm(10000, 10, 5)
sample2 <- rnorm(10000, 1, 5)
group <- c(rep("sample1", length(sample1)), rep("sample2", length(sample2)))
dat <- data.frame(KSD = c(sample1,sample2), group = group)
# create ECDF of data
cdf1 <- ecdf(sample1)
cdf2 <- ecdf(sample2)
# find min and max statistics to draw line between points of greatest distance
minMax <- seq(min(sample1, sample2), max(sample1, sample2), length.out=length(sample1))
x0 <- minMax[which( abs(cdf1(minMax) - cdf2(minMax)) == max(abs(cdf1(minMax) - cdf2(minMax))) )]
y0 <- cdf1(x0)
y1 <- cdf2(x0)
ggplot(dat, aes(x = KSD, group = group, color = group))+
stat_ecdf(size=1) +
theme_bw(base_size = 28) +
theme(legend.position ="top") +
xlab("Sample") +
ylab("ECDF") +
#geom_line(size=1) +
geom_segment(aes(x = x0[1], y = y0[1], xend = x0[1], yend = y1[1]),
linetype = "dashed", color = "red") +
geom_point(aes(x = x0[1] , y= y0[1]), color="red", size=8) +
geom_point(aes(x = x0[1] , y= y1[1]), color="red", size=8) +
ggtitle("K-S Test: Sample 1 / Sample 2") +
theme(legend.title=element_blank())
Now, I want to replace it with my own data, which
mydata <- c(33, 28, 23, 10, 27, 22, 25, 32, 26, 14, 37, 16, 30, 34, 26, 27, 18, 22, 23, 37, 16, 26, 31, 20, 25, 34, 24, 12, 28, 27, 40, 25, 23, 32, 23, 32, 17, 27, 18, 28, 17, 29, 12, 36, 32, 25, 32, 23, 16, 29, 18, 18, 46, 33, 30, 22, 21, 30, 35, 25, 13, 17, 28, 24, 32, 26, 10, 35, 23, 28, 19, 13, 24, 30, 20, 19, 26, 36, 28, 32, 28, 29, 32, 13, 32, 30, 36, 25, 20, 32, 18, 30, 27, 32, 23, 33, 30, 31, 20, 35, 29, 24, 35, 21, 18, 16, 23, 29, 26, 14, 13, 29, 24, 30, 13, 28, 24, 19, 28, 16, 25, 28, 30, 32, 14, 10, 12, 39, 16, 20, 20, 12, 30, 17, 28, 28, 37, 22, 16, 21, 32, 20, 25, 19, 37, 26, 13, 22, 18, 18, 16, 30, 16, 34, 11, 16, 14, 32, 35, 16, 17, 29, 17, 24, 28, 15, 21, 24, 18, 20, 16, 24, 20, 22, 30, 24, 27, 26, 42, 22, 35, 24, 19, 10, 34, 19, 18, 30, 19, 13, 17, 27, 28, 36, 23, 23, 25, 19, 12, 30, 15, 38, 15, 36, 19, 28, 27, 12, 31, 18, 26, 15, 34, 29, 32, 19, 14, 34, 24, 11, 23, 29, 15, 23, 16, 28, 11, 37, 23, 31, 14, 32, 19, 27, 29, 10, 19, 12, 19, 17, 34, 24, 26, 17, 15, 28, 18, 14, 24, 22, 13, 10, 23, 22, 19, 31, 22, 12, 23, 27, 16, 13, 22, 34, 12, 30, 20, 30, 29, 25, 35, 19, 11, 27, 16, 22, 10, 35, 26, 14, 32, 16, 24, 15, 19, 18, 37, 24, 30, 16, 22, 38, 25, 32, 28, 21, 26, 23, 10, 19, 20, 23, 15, 25, 23, 14, 20, 33, 29, 14, 11, 13, 28, 17, 24, 19, 12, 35, 30, 25, 15, 25, 24, 37, 17, 25, 11, 24, 18, 19, 27, 29, 15, 29, 12, 10, 22, 14, 18, 15, 15, 15, 23, 15, 23, 27, 21, 39, 13, 23, 13, 26, 20, 22, 20, 26, 37, 21, 20, 17, 34, 30, 18, 30, 34, 27, 10, 29, 10, 18, 18, 12, 19, 27, 20, 26, 27, 13, 23, 24, 10, 14, 22, 17, 19, 15, 24, 15, 16, 21, 24, 27, 17, 14, 29, 22, 26, 16, 17, 15, 28, 20, 32, 18, 25, 24, 15, 10, 25, 35, 30, 10, 34, 20, 36, 25, 15, 35, 22, 22, 12, 23, 20, 12, 27, 16, 30, 28, 12, 19, 15, 25, 12, 26, 21, 13, 19, 23, 31, 12, 11, 16, 12, 11, 33, 30, 30, 20, 23, 30, 19, 32, 21, 32, 36, 18, 13, 24, 26, 21, 20, 19, 34, 42, 20, 19, 31, 12, 24, 21, 30, 13, 14, 24, 18, 17, 13, 22, 37, 24, 21, 17, 13, 18, 50, 16, 25, 19, 20, 14, 15, 18, 44, 29, 21, 33, 18, 25, 35, 29, 17, 27, 41, 22, 22, 32, 34, 33, 20, 31, 30, 27, 22, 14, 23, 19, 27, 17, 26, 12, 27, 24, 36, 19, 21, 23, 40, 29, 27)
I can use mydata
as empirical CDF, but how can I change sample2
to the theoritical CDF?
Aim:
mydata
which substitute sample1
), and theoritical CDF (substituting sample2
distributiongeom_pint
for additionThanks in advance.
The theoretical cumulative probability functions in R are given by the p*
versions of distribution functions. If you want to test for normality, the function would be pnorm
. Below you'll find a way you can plot this. The black line indicates the point of maximum difference.
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.0.5
mydata <- c(33, 28, 23, 10, 27, 22, 25, 32, 26, 14, 37, 16, 30, 34, 26, 27, 18, 22, 23, 37, 16, 26, 31, 20, 25, 34, 24, 12, 28, 27, 40, 25, 23, 32, 23, 32, 17, 27, 18, 28, 17, 29, 12, 36, 32, 25, 32, 23, 16, 29, 18, 18, 46, 33, 30, 22, 21, 30, 35, 25, 13, 17, 28, 24, 32, 26, 10, 35, 23, 28, 19, 13, 24, 30, 20, 19, 26, 36, 28, 32, 28, 29, 32, 13, 32, 30, 36, 25, 20, 32, 18, 30, 27, 32, 23, 33, 30, 31, 20, 35, 29, 24, 35, 21, 18, 16, 23, 29, 26, 14, 13, 29, 24, 30, 13, 28, 24, 19, 28, 16, 25, 28, 30, 32, 14, 10, 12, 39, 16, 20, 20, 12, 30, 17, 28, 28, 37, 22, 16, 21, 32, 20, 25, 19, 37, 26, 13, 22, 18, 18, 16, 30, 16, 34, 11, 16, 14, 32, 35, 16, 17, 29, 17, 24, 28, 15, 21, 24, 18, 20, 16, 24, 20, 22, 30, 24, 27, 26, 42, 22, 35, 24, 19, 10, 34, 19, 18, 30, 19, 13, 17, 27, 28, 36, 23, 23, 25, 19, 12, 30, 15, 38, 15, 36, 19, 28, 27, 12, 31, 18, 26, 15, 34, 29, 32, 19, 14, 34, 24, 11, 23, 29, 15, 23, 16, 28, 11, 37, 23, 31, 14, 32, 19, 27, 29, 10, 19, 12, 19, 17, 34, 24, 26, 17, 15, 28, 18, 14, 24, 22, 13, 10, 23, 22, 19, 31, 22, 12, 23, 27, 16, 13, 22, 34, 12, 30, 20, 30, 29, 25, 35, 19, 11, 27, 16, 22, 10, 35, 26, 14, 32, 16, 24, 15, 19, 18, 37, 24, 30, 16, 22, 38, 25, 32, 28, 21, 26, 23, 10, 19, 20, 23, 15, 25, 23, 14, 20, 33, 29, 14, 11, 13, 28, 17, 24, 19, 12, 35, 30, 25, 15, 25, 24, 37, 17, 25, 11, 24, 18, 19, 27, 29, 15, 29, 12, 10, 22, 14, 18, 15, 15, 15, 23, 15, 23, 27, 21, 39, 13, 23, 13, 26, 20, 22, 20, 26, 37, 21, 20, 17, 34, 30, 18, 30, 34, 27, 10, 29, 10, 18, 18, 12, 19, 27, 20, 26, 27, 13, 23, 24, 10, 14, 22, 17, 19, 15, 24, 15, 16, 21, 24, 27, 17, 14, 29, 22, 26, 16, 17, 15, 28, 20, 32, 18, 25, 24, 15, 10, 25, 35, 30, 10, 34, 20, 36, 25, 15, 35, 22, 22, 12, 23, 20, 12, 27, 16, 30, 28, 12, 19, 15, 25, 12, 26, 21, 13, 19, 23, 31, 12, 11, 16, 12, 11, 33, 30, 30, 20, 23, 30, 19, 32, 21, 32, 36, 18, 13, 24, 26, 21, 20, 19, 34, 42, 20, 19, 31, 12, 24, 21, 30, 13, 14, 24, 18, 17, 13, 22, 37, 24, 21, 17, 13, 18, 50, 16, 25, 19, 20, 14, 15, 18, 44, 29, 21, 33, 18, 25, 35, 29, 17, 27, 41, 22, 22, 32, 34, 33, 20, 31, 30, 27, 22, 14, 23, 19, 27, 17, 26, 12, 27, 24, 36, 19, 21, 23, 40, 29, 27)
# Find x position of maximum difference.
# You could precalculate `mean(mydata)` and `sd(mydata)` if you wish,
# we use it later in plotting too.
maxdiff <- mydata[which.max(
abs(ecdf(mydata)(mydata) - pnorm(mydata, mean(mydata), sd = sd(mydata)))
)]
df <- data.frame(x = mydata)
ggplot(df, aes(x)) +
stat_ecdf(aes(colour = "Empirical")) +
stat_function(fun = pnorm, args = list(mean = mean(mydata), sd = sd(mydata)),
aes(colour = "Theoretical")) +
annotate(
"segment",
x = maxdiff, xend = maxdiff,
y = ecdf(mydata)(maxdiff),
yend = pnorm(maxdiff, mean(mydata), sd(mydata)),
size = 1
)
Created on 2021-04-25 by the reprex package (v1.0.0)