I hope you're doing well!
I'm trying to produce an R-squared value from my model fit in R, but the model fit itself is based on a specific function from a script. Unfortunately, I haven't been able to find a solution so far.
First things first, a list of the required packages:
library(ggplot2)
library(deSolve)
library(rootSolve)
library(coda)
library(minpack.lm)
library(FME)
library(phytotools)
The minimum reproducible data example:
kelppercent <- structure(list(Kelp_Cover = c(89.8, 98.5, 87.3, 8.733333333,
4.866666667, 6.733333333, 96.6, 71.26666667, 71.64, 3.2, 0, 85.66666667,
10.6, 0, 0, 100, 92.70333333, 74.03333333, 62.53333333, 25.23333333,
3.033333333, 11.9, 1.466666667, 2.2, 0, 96.8, 86.26666667, 43.26666667,
96.5, 7.633333333, 87.33333333, 83.56666667, 48.93333333, 0,
0, 0, 86.8, 74.9, 48.33333333, 4.766666667, 0, 0, 0, 0, 0, 0,
0, 61.63333333, 62.6, 23.53333333, 5.766666667, 3, 57.53333333,
80.3, 66.43333333, 45.16666667, 21.33333333, 4.866666667, 96.26666667,
0, 0, 5.033333333, 0, 59.93333333, 40.16666667, 9.633333333,
10.8, 0, 7.066666667, 43.6, 73.36666667, 2.766666667, 8.3, 31.33333333,
77.96666667, 11.46666667, 3.333333333, 98.66666667, 83.83333333,
0, 85.4, 2.766666667, 97.3, 79.7, 65.83333333, 4.233333333, 3.533333333,
99.23333333, 85.63333333, 74, 68.16666667, 18.86666667, 0, 66.6
), averagePAR = c(411.0328394, 415.1633577, 214.5924198, 57.33305713,
29.63469497, 15.31777983, 345.0506636, 148.232038, 63.6797416,
11.75221473, 5.048692624, 359.1309589, 32.10261553, 14.35388982,
6.417986499, 294.0376578, 107.6421231, 39.40592762, 14.42583152,
5.2810485, 1.933300914, 385.168096, 184.7042801, 88.57346033,
42.47469453, 415.1633577, 214.5924198, 110.9199687, 415.1633577,
57.33305713, 94.52133048, 11.12336013, 151.2262102, 1.009339599,
0.190038121, 0.035780313, 173.9490223, 80.95080298, 37.67214334,
17.53151707, 8.15865686, 160.5765179, 32.10261553, 6.417986499,
1.283090179, 0.256516652, 0.051283062, 210.3436414, 107.6421231,
55.08522426, 28.18953998, 14.42583152, 315.3555047, 123.8161165,
48.61316984, 19.08669362, 7.493892595, 2.942281535, 199.0882679,
3.031877918, 0.7515082, 32.10261553, 14.35388982, 182.8666314,
87.25490901, 41.63372557, 19.86555398, 9.478859492, 408.9830166,
106.0397056, 53.99457571, 27.49360902, 13.99952731, 89.91193692,
404.9139874, 26.15286547, 13.18434033, 396.8969763, 434.2704123,
20.06518531, 400.8854416, 24.87750414, 373.7858211, 80.95080298,
37.67214334, 17.53151707, 8.15865686, 360.9309232, 162.1901705,
72.8827865, 32.75106347, 14.71722213, 6.613422716, 408.9830166
)), row.names = c(NA, -94L), class = c("tbl_df", "tbl", "data.frame"
))
The fully functional script for the model function:
fitPGH <- function(x, #E
y, #Quantum Efficiency, rETR or P
normalize=FALSE, #Should curve be normalized to E (Default=TRUE for modeling Quantum Efficiency)
lowerlim=c(-Inf), #Lower bounds of parameter estimates (alpha,Beta,Ps)
upperlim=c(Inf), #Upper bounds of parameter estimates (alpha,Beta,Ps)
fitmethod=c("Nelder-Mead")) #Fitting method passed to modFit
{
#If normalize =T, assign E = 0 to very small number
if (normalize==T) x[x==0] <- 1e-9
#Remove NA values
ind <- is.finite(x) & is.finite(y)
res <- rep(NA,length(x))
x <- x[ind==T]
y <- y[ind==T]
#Intitial Parameter Estimates
if (normalize==T){
alpha <- max(y)
beta <- 0
ps <- max(x*y)
}
if (normalize==F){
PE <- y/x
alpha <- max(PE[is.finite(PE)])
beta <- 0
ps <- max(y)
}
#Load the model
PGH <- function(p,x) return(data.frame(x = x, y = p[3]*(1-exp(-1*p[1]*x/p[3]))*exp(-1*p[2]*x/p[3])))
PGH.E <- function(p,x) return(data.frame(x = x, y = p[3]*(1-exp(-1*p[1]*x/p[3]))*exp(-1*p[2]*x/p[3])/x))
if (normalize==F) model.1 <- function(p) (y - PGH(p, x)$y)
if (normalize==T) model.1 <- function(p) (y - PGH.E(p, x)$y)
#In case of non-convergence, NAs are returned
if (class(try(modFit(f = model.1,p = c(alpha,beta,ps),method = fitmethod,
lower=lowerlim,upper=upperlim,
hessian = TRUE),silent=T))=="try-error"){
fit <- list(alpha=NA,beta=NA,ps=NA,ssr=NA,residuals=rep(NA,c(length(x))))
}else{
fit <- modFit(f = model.1,p = c(alpha,beta,ps),method = fitmethod,
lower=lowerlim,upper=upperlim, hessian = TRUE)
fit <- list(alpha=summary(fit)$par[1,],beta=summary(fit)$par[2,],ps=summary(fit)$par[3,],
ssr=fit$ssr,residuals=fit$residuals,model="PGH",normalize=normalize)
}
return(fit)
}
And plotting the model fit onto the data:
PAR <- kelppercent$averagePAR
Pc <- kelppercent$Kelp_Cover
#Call function
myfit <- fitPGH(PAR, Pc)
#Plot input data
plot(PAR, Pc, xlim=c(0,450), ylim=c(0,100), xlab="PAR", ylab="Pc")
#Add model fit
E <- seq(0,450,by=1)
with(myfit,{
P <- ps[1]*(1-exp(-1*alpha[1]*E/ps[1]))*exp(-1*beta[1]*E/ps[1])
lines(E,P)
})
Plotting out the lines from E and the necessary P formula puts the fit onto the graph. Now, this is where I'm stuck: I want to generate the R-squared value to understand the relationship between the fit and the base values.
Thank you for your time, and any direction, pointers, or answers you may have would be incredible!
The model is reproducible, but could be smaller and more generalized to contribute better to the knowledge base of Stackoverflow. Nevertheless, an r2 can be estimated.
The coefficient of determination r2 is the fraction of variance explained by the model, var(explained), related to the total variance of the data, var(y). The explained variance is estimated from the difference of the residual variance and the total variance var(y), the residuals are the difference between the measured values (y) and the estimated values (y_hat).
Therefore:
r2 = var(explained)/var(y) = 1 - var(residuals)/var(y)
and in code:
## put the model equation in a function
model <- function(E, fit) {
with(fit, {
ps[1]*(1-exp(-1*alpha[1]*E/ps[1]))*exp(-1*beta[1]*E/ps[1])
})
}
# copy Pc and PAR to x and y just for didactic reasons
# to generalize the calculation
y <- Pc
x <- PAR
y_hat <- model(x, myfit)
residuals <- y - y_hat
1 - var(residuals)/var(y)