I am plotting curves for different distribution functions and I need to know the highest y-value for each curve. Later I will plot only the one curve, which is selected as the best fitting.
This is the function (it is a bit hard-coded, I am working on it):
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white'
# ylim= c(0,max_y)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2]),
add=TRUE, col='blue', lwd=2)
curve(dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2]),
add=TRUE, col='darkgreen', lwd=2)
curve(dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2]),
add=TRUE, col='purple', lwd=2)
curve(dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2]),
add=TRUE, col='Gold', lwd=2)
curve(dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3]),
add=TRUE, col='red', lwd=2)
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
I was trying to play with max_y
and then to use it in ylim
. I could do it only for normal density, but not for the rest curves.
Currently plot looks like this (some curves are cut):
If to set up ylim = c(0,2)
we can see, that lognormal and gamma distribution goes beyond 1:
I need to know the max value for each curve, so, when I choose which curve will be printed, to set up the correct ylim
.
You could use purrr::map_dbl
to map the function optimize
over your densities if you rearrange your code slightly and you have an idea over what input values you want to find their maxima/the density exists.
You can set your densities with whatever your parameters are ahead of time, that way you can find their peak values using optimize
and also pass them to the curve
function.
As a small reproducible example:
library(purrr)
# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1)
mygamma <- function(x) dgamma(x, rate = .5, shape = 1)
# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))
# 0.4999811
# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)
Using your code I've implemented the above solution and adjusted the x
grid of the curve
function to show you what I mean after our discussion in the comments to make things more clear and show you what you should actually be plotting:
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
# parameterize your densities before plotting
mynorm <- function(x) {
dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2])
}
mylnorm <- function(x){
dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2])
}
myweibull <- function(x) {
dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2])
}
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
}
mygev <- function(x){
dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3])
}
distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
# get the max of each density
y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)
# find the max (excluding infinity)
ymax <- max(y[abs(y) < Inf])
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white',
ylim= c(0, ymax)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(mynorm,
add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
curve(mylnorm,
add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
curve(myweibull,
add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
curve(mygamma,
add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
curve(mygev,
add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
Why your plot height isn't matching the solution output
To illustrate further what's going on with your plot and some of the confusion you might have you need to understand how the curve
function is plotting your data. By default curve
takes 101 x-values and evaluates your functions to get their y-values and then plots those points as a line. Because the peaks on some of your density are so sharp, the curve
function isn't evaluating enough x-values to plot your density peaks. To show you want I mean I will focus on your gamma density. Don't worry too much about the code as much as the output. Below I have the first few (x,y) coordinates for different values of n
.
library(purrr)
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1], # 0.6225622
rate=fit[['gamma']]$estimate[2]) # 0.3568242
}
number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>%
dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>%
dplyr::mutate(n = factor(n, unique(n))) %>%
dplyr::filter(x > 0) %>%
dplyr::group_by(n) %>%
dplyr::slice_min(order_by = x, n = 5)
n x y
<fct> <dbl> <dbl>
1 n = 5 1.5 0.184
2 n = 5 3 0.0828
3 n = 5 4.5 0.0416
4 n = 5 6 0.0219
5 n = 10 0.667 0.336
6 n = 10 1.33 0.204
7 n = 10 2 0.138
8 n = 10 2.67 0.0975
9 n = 10 3.33 0.0707
10 n = 101 0.06 1.04
11 n = 101 0.12 0.780
12 n = 101 0.18 0.655
13 n = 101 0.24 0.575
14 n = 101 0.3 0.518
15 n = 75000 0.0000800 12.9
16 n = 75000 0.000160 9.90
17 n = 75000 0.000240 8.50
18 n = 75000 0.000320 7.62
19 n = 75000 0.000400 7.01
Notice that when n = 5
you have very few values plotted. As n
increases, the distance between the x-values gets smaller. Since these functions are continuous, there are infinite number of points to plot, but that cannot be done computationally so a subset of x-values are plotted to approximate. The more x-values the better the approximation. Normally, the default n = 101
works fine, but because the gamma and log-normal densities have such sharp peaks the plot function is stepping over the maximum value. Below is a full plot of the data for n = 5, 10, 101, 75000
with points added.