Search code examples
rdistributioncurve

Calculate y value for distribution fitting functions R


I am plotting curves for different distribution functions and I need to know the highest y-value for each curve. Later I will plot only the one curve, which is selected as the best fitting.

This is the function (it is a bit hard-coded, I am working on it):

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
        
        
fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT, std.err=FALSE)
  distr <- c('norm', 'lnorm', 'weibull', 'gamma')
  fit <- lapply(X=distr, FUN=fitdist, data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit, FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  hist(TLT, prob=TRUE, breaks= x_data,
       main=paste(d$DLT_Code[1], 
                  '- best :',
                  names(Loglike[Loglike_Best])),
       sub = 'Total Lead Times',
       col='lightgrey',
       border='white'
       # ylim=  c(0,max_y)
  )
  
  lines(density(TLT),
        col='darkgrey',
        lty=2,
        lwd=2)
  
  grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
       lwd = .5, equilogs = TRUE)
  
  curve(dnorm(x, 
              mean=fit[['norm']]$estimate[1], 
              sd=fit[['norm']]$estimate[2]), 
        add=TRUE, col='blue', lwd=2)
  
  curve(dlnorm(x, 
               meanlog=fit[['lnorm']]$estimate[1], 
               sdlog=fit[['lnorm']]$estimate[2]), 
        add=TRUE, col='darkgreen', lwd=2)
  
  curve(dweibull(x, 
                 shape=fit[['weibull']]$estimate[1], 
                 scale=fit[['weibull']]$estimate[2]), 
        add=TRUE, col='purple', lwd=2)
  
  curve(dgamma(x, 
               shape=fit[['gamma']]$estimate[1], 
               rate=fit[['gamma']]$estimate[2]), 
        add=TRUE, col='Gold', lwd=2)
  
  
  curve(dgev(x, 
             loc=fit[['gev']]$estimate[1],
             scale=fit[['gev']]$estimate[2], 
             shape=fit[['gev']]$estimate[3]), 
        add=TRUE, col='red', lwd=2)
  
  
  legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
                         round(Loglike, digits=2))
  
  legend("topright", legend=legend_loglik, 
         col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
         lty=1, lwd=2,
         bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],
                    n = length(d$TLT),
                    Best = names(Loglike[Loglike_Best]),
                    lnorm = Loglike[1],
                    norm = Loglike[2],
                    weibul = Loglike[3],
                    gamma = Loglike[4],
                    GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14),  rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))

I was trying to play with max_y and then to use it in ylim. I could do it only for normal density, but not for the rest curves.

Currently plot looks like this (some curves are cut):

enter image description here

If to set up ylim = c(0,2) we can see, that lognormal and gamma distribution goes beyond 1:

enter image description here

I need to know the max value for each curve, so, when I choose which curve will be printed, to set up the correct ylim.


Solution

  • You could use purrr::map_dbl to map the function optimize over your densities if you rearrange your code slightly and you have an idea over what input values you want to find their maxima/the density exists.

    You can set your densities with whatever your parameters are ahead of time, that way you can find their peak values using optimize and also pass them to the curve function.

    As a small reproducible example:

    library(purrr)
    
    # parameterize your densities
    mynorm <- function(x) dnorm(x, mean = 0, sd = 1) 
    mygamma <- function(x) dgamma(x, rate = .5, shape = 1) 
    
    # get largest maximum over interval
    ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))
    
    # 0.4999811
    
    # plot data
    curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
    curve(mygamma, col = "red", lwd = 2, add = T)
    

    Using your code I've implemented the above solution and adjusted the x grid of the curve function to show you what I mean after our discussion in the comments to make things more clear and show you what you should actually be plotting:

    library(plyr)
    library(dplyr)
    library(fitdistrplus)
    library(evd)
    library(gamlss)
    library(purrr) # <- add this library
    
    
    fdistr <- function(d) {
      
      #  Uncomment to try  run line by line
      # d <- data_to_plot
      
      TLT <- d$TLT
      if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
      gev <- fgev(TLT, std.err=FALSE)
      distr <- c('norm', 'lnorm', 'weibull', 'gamma')
      fit <- lapply(X=distr, FUN=fitdist, data=TLT)
      fit[[5]] <- gev
      distr[5] <- 'gev'
      names(fit) <- distr
      Loglike <- sapply(X=fit, FUN=logLik)
      Loglike_Best <- which(Loglike == max(Loglike))
      
      #  Uncomment to try  run line by line
      # max <- which.max(density(d$TLT)$y)
      # max_density <- stats::density(d$TLT)$y[max]
      # max_y <- max_density
      
      x_data <- max(d$TLT)
      
      # parameterize your densities before plotting
      mynorm <- function(x) {
        dnorm(x, 
              mean=fit[['norm']]$estimate[1], 
              sd=fit[['norm']]$estimate[2])
      }
      
      mylnorm <- function(x){
        dlnorm(x, 
               meanlog=fit[['lnorm']]$estimate[1], 
               sdlog=fit[['lnorm']]$estimate[2])
      }
      
      myweibull <- function(x) {
        dweibull(x, 
                 shape=fit[['weibull']]$estimate[1], 
                 scale=fit[['weibull']]$estimate[2])
      }
      
      mygamma <- function(x) {
        dgamma(x, 
               shape=fit[['gamma']]$estimate[1], 
               rate=fit[['gamma']]$estimate[2])
      }
      
      mygev <- function(x){
        dgev(x, 
             loc=fit[['gev']]$estimate[1],
             scale=fit[['gev']]$estimate[2], 
             shape=fit[['gev']]$estimate[3])
      }
      
      distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
      
      # get the max of each density
      y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)
    
      # find the max (excluding infinity)
      ymax <- max(y[abs(y) < Inf])
      
      
      hist(TLT, prob=TRUE, breaks= x_data,
           main=paste(d$DLT_Code[1], 
                      '- best :',
                      names(Loglike[Loglike_Best])),
           sub = 'Total Lead Times',
           col='lightgrey',
           border='white',
           ylim=  c(0, ymax)
      )
      
      lines(density(TLT),
            col='darkgrey',
            lty=2,
            lwd=2)
      
      grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
           lwd = .5, equilogs = TRUE)
      
      curve(mynorm, 
            add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
      
      curve(mylnorm, 
            add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
      
      curve(myweibull, 
            add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
      
      curve(mygamma, 
            add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
      
      
      curve(mygev, 
            add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
      
      
      legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
                             round(Loglike, digits=2))
      
      legend("topright", legend=legend_loglik, 
             col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
             lty=1, lwd=2,
             bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')  
      
      return(data.frame(DLT_Code = d$DLT_Code[1],
                        n = length(d$TLT),
                        Best = names(Loglike[Loglike_Best]),
                        lnorm = Loglike[1],
                        norm = Loglike[2],
                        weibul = Loglike[3],
                        gamma = Loglike[4],
                        GEV = Loglike[5]))
      
    }
    
    
    
    #  Creating data set
    TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14),  rep(5,7), 6)
    DLT_Code <- c(rep('DLT_Code',251))
    
    data_to_plot <- data.frame(cbind(DLT_Code,TLT))
    data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
    
    
    DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
    

    enter image description here


    Why your plot height isn't matching the solution output

    To illustrate further what's going on with your plot and some of the confusion you might have you need to understand how the curve function is plotting your data. By default curve takes 101 x-values and evaluates your functions to get their y-values and then plots those points as a line. Because the peaks on some of your density are so sharp, the curve function isn't evaluating enough x-values to plot your density peaks. To show you want I mean I will focus on your gamma density. Don't worry too much about the code as much as the output. Below I have the first few (x,y) coordinates for different values of n.

    library(purrr)
    
    mygamma <- function(x) {
      dgamma(x, 
             shape=fit[['gamma']]$estimate[1], # 0.6225622
             rate=fit[['gamma']]$estimate[2]) # 0.3568242
    }
    
    number_of_x <- c(5, 10, 101, 75000)
    purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>% 
      dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>% 
      dplyr::mutate(n = factor(n, unique(n))) %>% 
      dplyr::filter(x > 0) %>% 
      dplyr::group_by(n) %>% 
      dplyr::slice_min(order_by = x, n = 5)
    
     n                 x       y
       <fct>         <dbl>   <dbl>
     1 n = 5     1.5        0.184 
     2 n = 5     3          0.0828
     3 n = 5     4.5        0.0416
     4 n = 5     6          0.0219
     5 n = 10    0.667      0.336 
     6 n = 10    1.33       0.204 
     7 n = 10    2          0.138 
     8 n = 10    2.67       0.0975
     9 n = 10    3.33       0.0707
    10 n = 101   0.06       1.04  
    11 n = 101   0.12       0.780 
    12 n = 101   0.18       0.655 
    13 n = 101   0.24       0.575 
    14 n = 101   0.3        0.518 
    15 n = 75000 0.0000800 12.9   
    16 n = 75000 0.000160   9.90  
    17 n = 75000 0.000240   8.50  
    18 n = 75000 0.000320   7.62  
    19 n = 75000 0.000400   7.01  
    

    Notice that when n = 5 you have very few values plotted. As n increases, the distance between the x-values gets smaller. Since these functions are continuous, there are infinite number of points to plot, but that cannot be done computationally so a subset of x-values are plotted to approximate. The more x-values the better the approximation. Normally, the default n = 101 works fine, but because the gamma and log-normal densities have such sharp peaks the plot function is stepping over the maximum value. Below is a full plot of the data for n = 5, 10, 101, 75000 with points added.

    ![enter image description here