Background
I am trying to replicate figure 2.6 in the book An Introduction to Statistical Learning:
A rough thin-plate spline fit to the Income data from Figure 2.3. This fit makes zero errors on the training data.
What have I tried so far?
I tried to replicate the previous figure 2.5, a smooth thin-plate spline fit, not sure if succesfully.
income_2 <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Income2.csv")
library(mgcv)
model1 <- gam(Income ~ te(Education, Seniority, bs=c("tp", "tp")), data = income_2)
x <- range(income_2$Education)
x <- seq(x[1], x[2], length.out=30)
y <- range(income_2$Seniority)
y <- seq(y[1], y[2], length.out=30)
z <- outer(x,y,
function(Education,Seniority)
predict(model1, data.frame(Education,Seniority)))
p <- persp(x,y,z, theta=30, phi=30,
col="yellow",expand = 0.5,shade = 0.2,
xlab="Education", ylab="Seniority", zlab="Income")
obs <- trans3d(income_2$Education, income_2$Seniority,income_2$Income,p)
pred <- trans3d(income_2$Education, income_2$Seniority,fitted(model1),p)
points(obs, col="red",pch=16)
segments(obs$x, obs$y, pred$x, pred$y)
Twofold question
bs="tp"
, and the documentation says 'They are reduced rank versions of the thin plate splines and use the thin plate spline penalty.'income_2 <- structure(list(Education = c(21.5862068965517, 18.2758620689655,
12.0689655172414, 17.0344827586207, 19.9310344827586, 18.2758620689655,
19.9310344827586, 21.1724137931034, 20.3448275862069, 10, 13.7241379310345,
18.6896551724138, 11.6551724137931, 16.6206896551724, 10, 20.3448275862069,
14.1379310344828, 16.6206896551724, 16.6206896551724, 20.3448275862069,
18.2758620689655, 14.551724137931, 17.448275862069, 10.4137931034483,
21.5862068965517, 11.2413793103448, 19.9310344827586, 11.6551724137931,
12.0689655172414, 17.0344827586207), Seniority = c(113.103448275862,
119.310344827586, 100.689655172414, 187.586206896552, 20, 26.2068965517241,
150.344827586207, 82.0689655172414, 88.2758620689655, 113.103448275862,
51.0344827586207, 144.137931034483, 20, 94.4827586206897, 187.586206896552,
94.4827586206897, 20, 44.8275862068966, 175.172413793103, 187.586206896552,
100.689655172414, 137.931034482759, 94.4827586206897, 32.4137931034483,
20, 44.8275862068966, 168.965517241379, 57.2413793103448, 32.4137931034483,
106.896551724138), Income = c(99.9171726114381, 92.579134855529,
34.6787271520874, 78.7028062353695, 68.0099216471551, 71.5044853814318,
87.9704669939115, 79.8110298331255, 90.00632710858, 45.6555294997364,
31.9138079371295, 96.2829968022869, 27.9825049000603, 66.601792415137,
41.5319924201478, 89.00070081522, 28.8163007592387, 57.6816942573605,
70.1050960424457, 98.8340115435447, 74.7046991976891, 53.5321056283034,
72.0789236655191, 18.5706650327685, 78.8057842852386, 21.388561306174,
90.8140351180409, 22.6361626208955, 17.613593041445, 74.6109601985289
)), .Names = c("Education", "Seniority", "Income"), row.names = c(NA,
-30L), class = "data.frame")
library(mgcv)
First of all, you can just use s(Education, Seniority, bs = 'tp')
for a bivariate thin-plate spline rather than using tensor product construction. A thin-plate spline is well-defined on any dimension.
Secondly, mgcv
does regression not interpolation, so without a tweak you can't have the fitted spline running through all points. The tweak for a thin-plate spline includes:
bs = 'tp'
, by setting k
to exactly the number of unique sampling points (and xt
as well if you have more than 2000 unique data locations);sp = 0
to disable penalization of the spline.The number of unique sampling locations in your dataset income_2
is
xt <- unique(income_2[c("Education", "Seniority")])
nrow(xt)
#[1] 30
Because 30 is smaller than 2000, we can just set k = 30
and don't need to pass xt
to s(, bs = 'tp')
.
interpolation_model <- gam(Income ~ s(Education, Seniority, k = 30, sp = 0),
data = income_2)
interpolation_model$residuals
# [1] 2.131628e-13 2.728484e-12 4.561684e-12 1.264766e-12 3.495870e-12
# [6] 4.177991e-12 -1.023182e-12 1.193712e-12 2.231104e-12 6.878054e-12
#[11] 6.309619e-12 6.679102e-13 7.574386e-12 3.637979e-12 4.227729e-12
#[16] 1.790568e-12 4.376943e-12 5.130119e-12 8.242296e-13 -6.536993e-13
#[21] 2.771117e-12 1.811884e-12 3.495870e-12 9.141132e-12 2.117417e-12
#[26] 7.243983e-12 -3.979039e-13 6.352252e-12 6.203038e-12 3.652190e-12
Now you see that all residuals are zero.
You can also look for other packages that do thin-plate spline interpolation directly.
Thin-plate spline is isotropic / radial, be careful if variables differ in scale!
Thank you for the explanation and solving my question. Do you know why the spline surface looks more undulating with less ridges?
Because your two variables differ in scale greatly. You want to standardize your two variables first, then fit a thin plate spline.
## this is how your original data look like on the 2D domain
with(income_2, plot(Education, Seniority, asp = 1))
## let's scale it
xt_scaled <- scale(xt)
dat <- data.frame(xt_scaled, Income = income_2$Income)
with(dat, plot(Education, Seniority, asp = 1))
## fit a model on scaled data
interpolation_model <- gam(Income ~ s(Education, Seniority, k = 30, sp = 0),
data = dat)
## grid on the transformed space
x <- range(dat$Education)
x <- seq(x[1], x[2], length.out=30)
y <- range(dat$Seniority)
y <- seq(y[1], y[2], length.out=30)
## prediction on the transformed space
newdat <- expand.grid(Education = x, Seniority = y)
z <- matrix(predict(interpolation_model, newdat), nrow = length(x))
Now to produce plot, we want to back transform the grid to its original scale. Note there this is no need to transform the predicted values.
## back transform the grid
scaled_center <- attr(xt_scaled, "scaled:center")
#Education Seniority
# 16.38621 93.86207
scaled_scale <- attr(xt_scaled, "scaled:scale")
#Education Seniority
# 3.810622 55.715623
xx <- x * scaled_scale[1] + scaled_center[1]
yy <- y * scaled_scale[2] + scaled_center[2]
## use `xx`, `yy` and `z`
p <- persp(xx, yy, z, theta = 30, phi = 30,
col = "yellow",expand = 0.5, shade = 0.2,
xlab = "Education", ylab = "Seniority", zlab = "Income")
obs <- trans3d(income_2$Education, income_2$Seniority, income_2$Income, p)
pred <- trans3d(income_2$Education, income_2$Seniority, fitted(interpolation_model), p)
points(obs, col="red",pch=16)
segments(obs$x, obs$y, pred$x, pred$y)