Search code examples
rggplot2tidyverseregression

Create a Piece-Wise Regression in ggplot2 (R) with multiple 'knots'


I am trying to make a piece-wise regression using non-linear data in the ggplot2 package (R). I have written code that creates one successfully but can't seem to get it where I want. The graph (below) is a CDF with two distinct increases, I'm attempting to objectively assign dates to where these increases begin and end. Unfortunately, the code I have recognises only one of the increases and ignores the send increase unless I plot it separately after creating a new dataframe. Here is the code and graph:

library(segmented)
attach(dai15E_flux_split)
y<- dai15E_flux_split$CDF15E
x<- dai15E_flux_split$Dates
pw_reg_15E<- data.frame(x = x, y = y)
out.lm <- lm(y ~ x, data = pw_reg_15E)
o <- segmented(out.lm, seg.Z = ~x, psi = list(x = c(as.POSIXct('2020-12-20 18:00:00'),as.POSIXct('2021-01-23 18:00:00'))),
               control = seg.control(display = FALSE)
)
dat2 = data.frame(x = x, y = broken.line(o)$fit)
w<- ggplot(pw_reg_15E, aes(x = x, y = y)) +
  geom_point() +
  geom_line(data = dat2, color = 'blue', linewidth = 2) +
  #geom_line(data = dat3, color = 'blue') +
  scale_x_datetime(date_breaks = '5 days', date_labels = '%b-%d') +
  scale_y_continuous(breaks = ~ seq(0, max(.x), .1)) +
  annotate('segment', x= as.POSIXct("2021-01-29"), xend= as.POSIXct("2021-01-29"), yend=-Inf, y=Inf, colour = "black", alpha = 1) +#creates the continuous density function plot for Dai15
  geom_vline(xintercept = (dai15E_flux_split$Dates[dai15E_flux_split$peak_status == "Peak"]), color = "black") +
  geom_point(size = 4) + theme_bw() + #ylab(~paste('CDF', Phi)) +
  labs(x = "Date")
plot(w)

Which produces: enter image description here

If I cut the data off at a date post-second break-point I am able to detect the start of the second increase but not the end (granted this could be due to just the nature of the data): enter image description here

I did try creating two separate segment lines for one graph but nothing came of it, anything I'm doing wrong? Ultimately what Im trying to do is this (please excuse the crudeness of that graph, I drew the line in mspaint): enter image description here

reprex data:

structure(list(x = structure(c(1606820400, 1606906800, 1606993200, 
1607079600, 1607166000, 1607252400, 1607338800, 1607425200, 1607511600, 
1607598000, 1607684400, 1607770800, 1607857200, 1607943600, 1608030000, 
1608116400, 1608202800, 1608289200, 1608375600, 1608462000, 1608548400, 
1608634800, 1608721200, 1608807600, 1608894000, 1608980400, 1609066800, 
1609153200, 1609239600, 1609326000, 1609412400, 1609498800, 1609585200, 
1609671600, 1609758000, 1609844400, 1609930800, 1610017200, 1610103600, 
1610190000, 1610276400, 1610362800, 1610449200, 1610535600, 1610622000, 
1610708400, 1610794800, 1610881200, 1610967600, 1611054000, 1611140400, 
1611226800, 1611313200, 1611399600, 1611486000, 1611572400, 1611658800, 
1611745200, 1611831600), class = c("POSIXct", "POSIXt"), tzone = "Asia/Bangkok"), 
    y = c(0.0127654103696743, 0.0250904882020871, 0.0370714866508862, 
    0.0468027880724444, 0.0536991190985604, 0.0626795693063142, 
    0.0708962963854844, 0.0772446662188717, 0.0842076373362264, 
    0.090949381870277, 0.0970952248012049, 0.104371307628584, 
    0.110358027715881, 0.115450773819791, 0.122580977690793, 
    0.134406168897197, 0.146368284831593, 0.158716595955345, 
    0.170651213164138, 0.190479315670677, 0.224094027748295, 
    0.280359267223023, 0.337800104800282, 0.406506532914733, 
    0.491550665708546, 0.548897965427954, 0.577827044961975, 
    0.596475967835595, 0.605583211717681, 0.611728728447968, 
    0.616878285109383, 0.623334778422036, 0.631319388073111, 
    0.641552889847983, 0.650051719012484, 0.659696487319351, 
    0.666246012002399, 0.672683290607584, 0.677061608197219, 
    0.68231817736804, 0.687772229223841, 0.696619586579348, 0.710702270394893, 
    0.718733992834477, 0.728995458593268, 0.744779674223926, 
    0.763218116035974, 0.790875495532422, 0.825075587533513, 
    0.846944735886294, 0.869832760922987, 0.892032203445386, 
    0.919806428133041, 0.943454538344951, 0.963248306979268, 
    0.977788349576517, 0.986129869261138, 0.993916256029506, 
    1)), row.names = c(NA, 59L), class = "data.frame")

Solution

  • If you want to catch two increases, you need at least three breakpoints. Here's an attempt with 4:

    library(segmented)
    out.lm <- lm(y ~ x, data = pw_reg_15E)
    br <- as.POSIXct(c('2020-12-20 18:00:00','2020-12-28 18:00:00', '2021-01-15 18:00:00', '2021-01-23 18:00:00'))
    o <- segmented(out.lm, seg.Z = ~x, psi = list(x = br))
    dat2 = data.frame(x = pw_reg_15E$x, y = predict(o))
    
    ggplot(pw_reg_15E, aes(x = x, y = y)) +
      geom_point() +
      geom_line(data = dat2, col = 'firebrick') +
      geom_vline(xintercept = o$psi[, 'Est.'])
    

    enter image description here

    > as.POSIXct(o$psi[, 'Est.'])
                       psi1.x                    psi2.x                    psi3.x                    psi4.x 
    "2020-12-20 13:14:11 PST" "2020-12-26 15:35:47 PST" "2021-01-14 12:22:15 PST" "2021-01-23 21:34:05 PST"