r simulation probability normal-distribution

How to get normal probability distribution from a simulation on car agains car?

I want to understand why I am not getting a probability distribution when I use a simulation from a random normal distribution:

library(tidyverse)
df <- mtcars # data

df$sd <- sd(df$mpg) # standard deviation of the sample

set.seed(123)
f <- function(n1, s1, n2, s2){
  mean(rnorm(10000, n1, s1) < rnorm(10000, n2, s2)) # function for probability distribution
  
}

g <- Vectorize(f, c("n1", "s1", "n2", "s2")) 
set.seed(123)
res <- outer(df$mpg, df$sd, df$mpg, df$sd, FUN = g)
dimnames(res) <- list(row.names(df), row.names(df))
res <- data.frame(res)
res <- tibble::rownames_to_column(res, 'p1')

datalong_2 <- tidyr::gather(res, 'p2', 'value', 2:33) # output

I did this simulation but for some reason, I am not getting an actual probability distribution, my goal is to evaluate the probability of a car has less mpg than another car. But the sum of the probability does not add to one. I expect that this can be added to one or lower given that a tight might happens.

For example, the probability that Mazda Rx4 has a lower mpg than Mazda Rx4 wag is 0.5094 while the probability that Mazda Rx4 wag has a lower mpg than Mazda Rx4 is 0.5029, the sum of this probability is 1.0123. How can I change this code to get an actual probability distribution of one car has lower mpg than another car?

Solution

Unless you absolutely have to run simulations, you can use the pnorm() function to calculate the probabilities precisely.

We assume that X~N(u1,s1) and Y~N(u2,s2) where s1 and s2 are variances.

Also we know that P(X<Y) = P(X-Y<0), where X-Y ~ N(u1-u2,s1+s2). From this, we can calculate the probabilities precisely:

df <- mtcars # data
df$sd <- sd(df$mpg) # standard deviation of the sample

f <- function(n1, n2){
  pnorm(0, mean = n1 - n2, sd = sqrt(2*df$sd^2))
}

res <- outer(X = df$mpg, Y = df$mpg, FUN = f)
dimnames(res) <- list(row.names(df), row.names(df))
res <- data.frame(res)
res <- tibble::rownames_to_column(res, 'p1')

datalong_2 <- tidyr::gather(res, 'p2', 'value', 2:33) # output

> datalong_2
                     p1                p2      value
1             Mazda RX4         Mazda.RX4 0.50000000
2         Mazda RX4 Wag         Mazda.RX4 0.50000000
3            Datsun 710         Mazda.RX4 0.41637203
4        Hornet 4 Drive         Mazda.RX4 0.48128464
5     Hornet Sportabout         Mazda.RX4 0.60636049
..                   ..                ..         ..

Also, I think your main problem was in the function outer(), which requires 2 inputs X and Y. It worked for me once I changed it.

Edits 2 & 3:

df1 <- mtcars; df1$rownames = rownames(df1)
df2 <- mtcars; df2$rownames = rownames(df2)
df2$mpg = df2$mpg + rnorm(nrow(df2),0,3)
data = rbind(df1, df2)


df = ddply(data,~rownames,summarise,mean=mean(mpg),sd=sd(mpg))
df = rbind(df, c("car1",-1.02, 2.66))
df = rbind(df, c("car2",0.13, 0.06))
df$mean <- as.numeric(df$mean)
df$sd <- as.numeric(df$sd)

f <- function(x, y){
  n1 = df$mean[x]; n2 = df$mean[y]; sd1 = df$sd[x]; sd2 = df$sd[y]
  pnorm(0, mean = n1 - n2, sd = sqrt(sd1^2 + sd2^2))
}

res <- outer(X = 1:nrow(df), Y = 1:nrow(df), f)
dimnames(res) <- list(df$rownames, df$rownames)
res <- data.frame(res)
res <- tibble::rownames_to_column(res, 'p1')

datalong_2 <- tidyr::gather(res, 'p2', 'value', -1) # output

subset(datalong_2, p1 %in% c("car1","car2") & p2 %in% c("car1","car2"))

> subset(datalong_2, p1 %in% c("car1","car2") & p2 %in% c("car1","car2"))
       p1   p2     value
1121 car1 car1 0.5000000
1122 car2 car1 0.3327904
1155 car1 car2 0.6672096
1156 car2 car2 0.5000000