Search code examples
rggplot2curve-fittingpredictnon-linear-regression

Improving visual fit on log-log scale


I am trying to fit a model to match a specific curve. The curve has to be plotted on a log10 scale to be meaningful.

Libraries:

library(data.table)
library(ggplot2)
library(minpack.lm)

The data:

selected_data <- structure(list(shear_rate = c(1000, 794.329, 630.957, 501.187, 398.107, 316.228, 251.189, 199.527, 158.488, 125.893, 100, 79.4327, 63.0956, 50.1188, 39.8107, 31.6228, 25.1188, 19.9526, 15.8489, 12.5893, 10, 7.94327, 6.30959, 5.01187, 3.98106, 3.16227, 2.51188, 1.99526, 1.58489, 1.25892, 0.999992, 0.794326, 0.630961, 0.501198, 0.39811, 0.31623, 0.251185, 0.199522, 0.158491, 0.125895, 0.0999949, 0.0794332, 0.0630955, 0.0501237, 0.0398085, 0.0316226, 0.0251186, 0.0199599, 0.0158465, 0.0125871, 0.00999673, 0.00794292, 0.00630626, 0.0050086, 0.00398398, 0.00316298, 0.00250931, 0.00199656, 0.0015839, 0.0012584, 0.000999712, 0.000792728, 0.00063262, 0.000508155, 0.000395346, 0.000317991, 0.000249344, 0.000196793, 0.000160254, 0.000127184, 0.000104187, 8.23117e-05, 6.47387e-05, 4.92171e-05, 4.36566e-05, 3.17533e-05, 1.73707e-05, 1.96217e-05, 7.36872e-06, 1.29909e-05, 1.1677e-05, 1.02765e-05, 8.38507e-06, 7.018e-06, 2.97303e-06, 1.43835e-06, 6.66051e-06), viscosity = c(0.078019, 0.0871012, 0.0978244, 0.110653, 0.125792, 0.142472, 0.160574, 0.180504, 0.202741, 0.228238, 0.257929, 0.293168, 0.334648, 0.383216, 0.440372, 0.508067, 0.588533, 0.684139, 0.798329, 0.934592, 1.09783, 1.29358, 1.52916, 1.81312, 2.15566, 2.57134, 3.07472, 3.68667, 4.43435, 5.34438, 6.45903, 7.82502, 9.50057, 11.5573, 14.0901,17.2101, 21.056, 25.8055, 31.6732, 38.9193, 47.895, 58.9914,72.7054, 89.6858, 110.719, 136.708, 168.881, 208.623, 258.156, 319.473, 395.938, 490.966, 610.17, 759.098, 944.226, 1177.97, 1472.51, 1837.24, 2301.57, 2880.84, 3609.73, 4529.95, 5655.8, 7014.35, 8985.18, 11142.5, 14172.5, 17910.1, 21937.4, 27576.4, 33589.4, 42421.1, 53821.7, 70652.6, 79480.7, 109057, 199001, 175881, 467589, 264795, 294139, 333755, 408477, 487371, 1149050, 2372100, 511717), 
sample_name = c("x", "x", "x", "x", "x", "x", "x", "x", "x","x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",  "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x")), .Names = c("shear_rate", "viscosity", "sample_name"), class = c("data.table", "data.frame"), row.names = c(NA, -87L))

The model:

fitted_models <- lapply(split(selected_data, by = "sample_name"), function(d) {
d <- d[order(shear_rate)]
start_viscosity_zero <- mean(d[seq(5), viscosity])
start_viscosity_inf <- mean(d[seq(nrow(d) - 5, nrow(d)), viscosity])
d[, start_viscosity :=  (start_viscosity_zero - viscosity) / (viscosity - start_viscosity_inf)]
m1 <- lm(log(start_viscosity) ~ log(shear_rate), data = d)
start_C <- (coef(m1)[[1]])^(1/coef(m1)[[2]])
start_m <- coef(m1)[[2]]/10
nlsLM(viscosity ~ C * (shear_rate)^(m-1) + viscosity_inf,
    data = d,
    start = list(viscosity_inf = 24, C = 0.11, m = 0.4))
})

Some in between code:

selected_data[, prediction := predict(fitted_models[[ .BY[["sample_name"]] ]], .SD), by = "sample_name"]

The plot:

p <- ggplot(data = selected_data, aes(x = shear_rate, y = viscosity, group = sample_name, colour = sample_name))
p <- p + geom_point()
p <- p + geom_line(aes(y = prediction))
p <- p + coord_trans(x = "log10", y = "log10")
p <- p + scale_colour_discrete("Sample")
p <- p + scale_fill_discrete(guide = FALSE)
p

This code gives me something like the following plot (the code above produces the graph only for one sample though, and is simplified):

ggplot

You can see my problem straight away. The 'wandering off' of the prediction curve is due to the log axis, I know that. But how do I alter this line so that it will graphically fit?


Solution

  • Your problem is not a visualization problem. The problem is that you fit in linear scale and expect the plot to look good in log scale. That just doesn't work. (And in general, fitting in linear scale is not a good idea for data that vary on a log scale.)

    Simple example:

    # create exponential decay data
    set.seed(342)
    x <- seq(0, 10, by = .5)
    y <- exp(-x) * rnorm(length(x), mean = 1, sd = .3)
    d <- data.frame(x, y)
    
    # fit model to data
    model <- nls(y ~ a*exp(-b*x), data = d, start = list(a = 1, b = 1))
    
    # visualize in linear scale---looks good
    xpred <- seq(0, 10, by = .1)
    ypred <- predict(model, data.frame(x = xpred))
    ggplot(d, aes(x, y)) + geom_point() + 
      geom_line(data = data.frame(x = xpred, y = ypred))
    

    enter image description here

    # visualize in log scale---oops
    ggplot(d, aes(x, y)) + geom_point() + 
      geom_line(data = data.frame(x = xpred, y = ypred)) + scale_y_log10()
    

    enter image description here