Search code examples
rprobabilitynormal-distribution

How to get the probability of each car having a lower mpg than all the other cars?


I am using this to build the data from which I get the mean and standard deviation of each car's mpg:

# Build the working table: the original mtcars rows plus a second copy
# whose mpg is perturbed with Gaussian noise (mean 0, sd 3), so every car
# name appears twice. The car name is kept in a `rownames` column because
# rbind() has to de-duplicate the actual row names.
df1 <- mtcars
df1[["rownames"]] <- rownames(df1)
df2 <- df1
df2[["mpg"]] <- df2[["mpg"]] + rnorm(nrow(df2), mean = 0, sd = 3)
data <- rbind(df1, df2)

I then compute each car's mean and standard deviation and use a function to get the pairwise probability that one car has a lower mpg than another:

# Per-car mean and standard deviation of mpg, one row per car name
# (columns: rownames, mean, sd).
# `summarise` is namespace-qualified because plyr is only ever referenced
# through `plyr::` and never attached; the bare name would not resolve.
df <- plyr::ddply(data, ~rownames, plyr::summarise,
                  mean = mean(mpg), sd = sd(mpg))

# Pairwise probability that car `x` has a lower mpg than car `y`.
#
# `x` and `y` are row indices into the global summary table `df` (columns
# `mean` and `sd`); vectors of indices work too, which is what outer()
# relies on. Each car's mpg is modelled as an independent normal variable,
# so the difference mpg_x - mpg_y is normal with mean (mean_x - mean_y) and
# variance (sd_x^2 + sd_y^2); the result is P(mpg_x - mpg_y < 0).
f <- function(x, y) {
  mean_diff <- df$mean[x] - df$mean[y]
  sd_diff <- sqrt(df$sd[x]^2 + df$sd[y]^2)
  pnorm(0, mean = mean_diff, sd = sd_diff)
}

# Matrix of pairwise probabilities: res[i, j] = P(car i has lower mpg than
# car j), with car names on both dimensions.
res <- outer(X = seq_len(nrow(df)), Y = seq_len(nrow(df)), f)
dimnames(res) <- list(df$rownames, df$rownames)
# check.names = FALSE keeps the car names verbatim as column names. With the
# default, data.frame() rewrites them into syntactic names ("Mazda RX4"
# becomes "Mazda.RX4"), so the p2 labels no longer match the p1 labels and
# any later filter that compares p1 with p2 fails to drop a car's
# comparison against itself.
res <- data.frame(res, check.names = FALSE)
res <- tibble::rownames_to_column(res, 'p1')

# Long format: one row per ordered pair (p1, p2) with its probability.
datalong_2 <- tidyr::gather(res, 'p2', 'value', -1) # output

Now I want the probability that each car has the lowest mpg of all the cars. I tried this:

# For every car, multiply its pairwise probabilities against every other
# car (rows of datalong_2 where the car is p1 and p2 is a different name).
# The products, rounded to 4 decimals, go into a one-column data frame
# `win` whose row names are the car names.
cars <- unique(datalong_2$p1)
win <- data.frame(
  prob = round(
    vapply(
      cars,
      function(car) {
        is_pair <- datalong_2$p1 == car & datalong_2$p2 != car
        prod(datalong_2$value[which(is_pair)])
      },
      numeric(1)
    ),
    4
  )
)

But the probabilities do not add up to one. How can I change this code to get a table that has, for each car, the probability of it having the lowest mpg of all the cars?


Solution

  • Here is a comparison of dplyr and data.table methods that estimate these probabilities by simulation:

    library(dplyr)
    library(data.table)
    library(tidyr)
    library(tibble)
    # // input data: one row per car, with the car name moved from the
    # row names into a `car` column next to its mpg
    df <- rownames_to_column(mtcars["mpg"], var = "car")
    

    -testing

    # // dplyr
    # Monte Carlo estimate, timed with system.time(): simulate 10000 runs,
    # draw one mpg per car per run, and record how often each car has the
    # lowest simulated mpg. No seed is set, so results vary between calls.
    system.time({
            out <- df %>%
                    # replicate every car 10000 times; `run` numbers the copies
                    uncount(10000, .id = "run") %>%
                    # rowwise() makes rpois() be called once per row, with
                    # that row's mpg as the Poisson mean
                    rowwise() %>%
                    mutate(sim_mpg = rpois(1, lambda = mpg)) %>%
    
                    # sort by simulated mpg, then flag the first row of each
                    # run, i.e. the car with the lowest draw in that run
                    # (on ties only the row that sorts first is flagged)
                    group_by(run) %>%
                    arrange(sim_mpg) %>%
                    mutate(lowest_mpg = row_number() == 1) %>%
    
                    # share of runs in which each car was flagged, alongside
                    # its original mpg
                    group_by(car) %>%
                    summarize(chance_lowest = mean(lowest_mpg),
                    orig_mpg = first(mpg))
    
    })
    # user system elapsed
    # 1.715 0.074 1.787
    
    
    # // data.table
    # Same simulation as the dplyr version, written with data.table and
    # timed with system.time(). No seed is set, so results vary between calls.
    system.time({
    
         # NOTE(review): setDT() converts `df` to a data.table by reference,
         # so `df` itself is changed after this line.
         # The rows are repeated 10000 times and `run` numbers the
         # occurrences of each car (1..10000).
         df_expand <- setDT(df)[rep(seq_len(.N), 10000)][, run := rowid(car)]
    
         # Step 1: grouping by 1:nrow(df_expand) puts every row in its own
         #         group, so rpois() draws once per row with that row's mpg.
         # Step 2: within each run, visit rows in increasing sim_mpg order
         #         and flag the first one as having the lowest mpg.
         # Step 3: per car, the share of runs in which it was flagged,
         #         alongside its original mpg.
         out2 <- df_expand[, sim_mpg := rpois(1, lambda = mpg), 1:nrow(df_expand)
            ][order(sim_mpg), lowest_mpg := seq_len(.N) == 1 ,run
            ][, .(chance_lowest = mean(lowest_mpg), orig_mpg = first(mpg)), .(car)]
    })
    # user system elapsed
    # 0.704 0.050 0.757
    # Sanity check on the dplyr result `out` (`out2` is not checked here):
    # exactly one car is flagged per run, so the per-car shares sum to 1.
    sum(out$chance_lowest)
    #[1] 1