Search code examples
r

Calculating percent growth in time series data


I have this data in R for two time series such that growth in one of the series results in growth of the other series after some lag:

    df_long = structure(list(time = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 
    6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 
    13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 
    19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 
    26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 32L, 
    32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 36L, 37L, 37L, 38L, 38L, 
    39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L, 43L, 43L, 44L, 44L, 45L, 
    45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L, 49L, 50L, 50L, 51L, 51L, 
    52L, 52L, 53L, 53L, 54L, 54L, 55L, 55L, 56L, 56L, 57L, 57L, 58L, 
    58L, 59L, 59L, 60L, 60L, 61L, 61L, 62L, 62L, 63L, 63L, 64L, 64L, 
    65L, 65L, 66L, 66L, 67L, 67L, 68L, 68L, 69L, 69L, 70L, 70L, 71L, 
    71L, 72L, 72L, 73L, 73L, 74L, 74L, 75L, 75L, 76L, 76L, 77L, 77L, 
    78L, 78L, 79L, 79L, 80L, 80L, 81L, 81L, 82L, 82L, 83L, 83L, 84L, 
    84L, 85L, 85L, 86L, 86L, 87L, 87L, 88L, 88L, 89L, 89L, 90L, 90L, 
    91L, 91L, 92L, 92L, 93L, 93L, 94L, 94L, 95L, 95L, 96L, 96L, 97L, 
    97L, 98L, 98L, 99L, 99L, 100L, 100L), series = c("Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging", 
    "Leading", "Lagging", "Leading", "Lagging", "Leading", "Lagging"
    ), value = c(99.4395243534478, 49.6447967181503, 99.7698225105167, 
    50.1284418545783, 101.558708314149, 49.8766540607688, 100.070508391425, 
    49.8262287003011, 100.129287735161, 49.5241907163675, 101.715064986883, 
    49.9774861375955, 100.460916205989, 49.6075477652715, 98.7349387653935, 
    49.1660290317059, 99.3131471481065, 49.8098867398561, 99.5543380299001, 
    50.4594983045304, 105.273045069337, 49.7123265186958, 108.549174635345, 
    50.3039821611125, 112.937213377001, 49.1910586458554, 117.115339122692, 
    49.9722190172377, 120.989024510128, 50.2597036019717, 123.839343294228, 
    51.755395134878, 122.271001469299, 53.3074738414986, 119.272599768477, 
    54.6031368856927, 122.518596933253, 56.2319063541722, 121.090067201558, 
    57.9292390886276, 120.36611942886, 58.5975047842126, 121.400090427431, 
    57.9741037415852, 120.416998950092, 58.2415311831348, 120.778482610349, 
    58.378760524451, 120.904834400657, 59.6078353210825, 119.613169928027, 
    58.1470703597842, 122.684586279277, 58.6664163996338, 121.851892088966, 
    58.5742772528016, 120.280572632256, 57.9656861397817, 123.190747802792, 
    58.4869121631259, 118.518624697645, 59.3741239062539, 114.137088885571, 
    58.7929070388491, 112.034578571703, 58.5527808934215, 108.655239045281, 
    58.2813661386636, 105.336496980897, 57.3269099865169, 105.197602897623, 
    57.770224894356, 105.05684725131, 54.9386347844557, 104.413440281713, 
    54.8175553805619, 104.158460423083, 54.1228606240017, 104.080615509722, 
    51.0859082195604, 103.752307653576, 52.1981060037744, 104.260896403218, 
    51.6984323802381, 103.156062100451, 51.0194301601899, 106.744208989045, 
    51.0492227278142, 105.740180515963, 51.0041950032861, 103.304721692197, 
    51.5591488246471, 104.057197956036, 51.0766493498308, 103.990571714523, 
    52.190917844231, 105.293017402957, 52.9229183439836, 104.391022038711, 
    51.1672169581333, 109.979926253532, 52.242659958441, 115.154250045687, 
    52.232968695738, 120.894638533399, 52.0065357369094, 128.731853517781, 
    51.3116553533773, 133.042453009798, 51.7724232491348, 135.365618993348, 
    53.7565596327335, 131.278342696254, 56.3796575039867, 134.123048411341, 
    58.0894275243687, 133.508655541522, 61.2312725926207, 133.631448007239, 
    62.8281739067728, 133.849728541686, 63.7282872621704, 132.673688259431, 
    62.4027438857521, 132.89919355217, 62.2696917234875, 131.985299846747, 
    65.1083435050655, 131.914339976542, 62.8015123399534, 133.748239678934, 
    63.252476525753, 133.941162576869, 63.4658499336833, 133.414181646151, 
    62.7593077271382, 134.573287710474, 63.3903570339516, 136.077158706804, 
    63.2970863506865, 130.034970874851, 62.9285726540986, 125.105907922438, 
    63.1055777418292, 126.764059517985, 63.0429167717537, 122.11954358861, 
    64.4066965237066, 119.702696027261, 62.5968819121748, 121.768107679415, 
    61.3752461179914, 120.188723836278, 61.0855592688391, 119.060611279229, 
    60.2720388416955, 120.750494990473, 59.3822078713432, 120.36455785106, 
    57.9114629572074, 120.538914027895, 57.5595080676796, 120.996352384691, 
    58.9130288733449, 120.085202516525, 57.9747112805574, 121.308646065858, 
    57.6745921324798, 120.266209552808, 58.0406683045465, 120.931869666359, 
    58.0634180917107, 121.854007971433, 58.8238622264472, 121.056499149341, 
    58.2274299156511, 120.139114592318, 58.6168257440383, 121.916646753284, 
    57.8876525769674, 121.729456074554, 58.3028916163668, 121.192959979937, 
    57.9892351464628, 120.819714395897, 58.2331582717497, 119.775138801059, 
    57.6572256721037, 122.17198749257, 57.4155315048015, 119.808461657746, 
    59.3400727686692, 123.168401808201, 58.5276125109227, 122.379252065383, 
    57.4501650462833, 120.247872063789, 57.8225664327736, 119.294801047208, 
    57.4884411818443)), row.names = c(NA, -200L), class = c("tbl_df", 
    "tbl", "data.frame"))

When I plot it, it looks like this:

library(ggplot2)
library(tidyr)



ggplot(df_long, aes(x = time, y = value, color = series)) +
    geom_line(size = 1) +
    theme_minimal() +
    labs(
        title = "Time Series with Lagged Growth Pattern",
        subtitle = "Leading series (blue) shows growth patterns that are followed by lagging series (red)",
        x = "Time",
        y = "Value",
        color = "Series"
    ) +
    scale_color_manual(values = c("Leading" = "#2c7bb6", "Lagging" = "#d7191c")) +
    theme(
        legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 10)
    )

enter image description here

I want to create a function in R that studies how growth in one series results in growth of the other series. For example:

  • Step 1: Take the first 5 points of Series 1 Window 1 (points 1-5) and Series 1 Window 2 (points 2-6). Calculate % growth (i.e. Series1Window2Sum-Series1Window1Sum/Series1Window1Sum * 100)

  • Step 2: Then, wait 3 points (i.e. 5 + 3 = 8)

  • Step 3: Then, look at Series 2 Window 1 (points 8-12) and Series 2 Window 2 (points 9-13). Calculate % growth (i.e. Series2Window2Sum-Series2Window1Sum/Series2Window1Sum * 100)

  • Step 4: Advance 1 point in Series 1. Calculate % growth between Series 1 Window 1 (points 2-6) and Series 1 Window 2 (points 3-7), etc.

  • Step 5: Repeat Steps for all data (i.e. sliding window)

I tried to write a function for this manually:

 analyze_growth_patterns <- function(data, window = 5, lag = 3, dominant = 1) {
    series1 <- data$value[data$series == unique(data$series)[1]]
    series2 <- data$value[data$series == unique(data$series)[2]]
    times <- unique(data$time)
    
    dom_values <- if(dominant == 1) series1 else series2
    fol_values <- if(dominant == 1) series2 else series1
    
    max_start <- length(dom_values) - (window + lag + window) + 1
    
    results <- lapply(1:max_start, function(i) {
        dom_window1_sum <- sum(dom_values[i:(i + window - 1)])
        dom_window2_sum <- sum(dom_values[(i + 1):(i + window)])
        dom_growth <- (dom_window2_sum - dom_window1_sum) / dom_window1_sum * 100
        
        fol_start <- i + window + lag
        fol_window1_sum <- sum(fol_values[fol_start:(fol_start + window - 1)])
        fol_window2_sum <- sum(fol_values[(fol_start + 1):(fol_start + window)])
        fol_growth <- (fol_window2_sum - fol_window1_sum) / fol_window1_sum * 100
        
        data.frame(
            time = times[i],
            dominant_growth = dom_growth,
            following_growth = fol_growth,
            following_time = times[fol_start],
            dom_window1_sum = dom_window1_sum,
            dom_window2_sum = dom_window2_sum,
            fol_window1_sum = fol_window1_sum,
            fol_window2_sum = fol_window2_sum
        )
    })
    
    do.call(rbind, results)
}

plot_growth_analysis <- function(results) {
    library(gridExtra)
    library(ggplot2)
    
    p1 <- ggplot(results) +
        geom_line(aes(x = time, y = dominant_growth, color = "Dominant"), size = 1) +
        geom_line(aes(x = following_time, y = following_growth, color = "Following"), size = 1) +
        scale_color_manual(values = c("Dominant" = "#2c7bb6", "Following" = "#d7191c")) +
        theme_minimal() +
        labs(title = "Growth Patterns Over Time",
             y = "Growth (%)",
             color = "Series") +
        theme(legend.position = "bottom")
    
    p2 <- ggplot(results, aes(dominant_growth, following_growth)) +
        geom_point(alpha = 0.6) +
        geom_smooth(method = "lm", se = TRUE) +
        theme_minimal() +
        labs(title = paste("Correlation:", 
                          round(cor(results$dominant_growth, results$following_growth), 3)),
             x = "Dominant Series Growth (%)",
             y = "Following Series Growth (%)") +
        theme(legend.position = "bottom")
    
    grid.arrange(p1, p2, ncol = 2)
}

Finally, the plots:

  results <- analyze_growth_patterns(df_long, window = 5, lag = 3)
plot_growth_analysis(results)

enter image description here

Are there any built in time series functions in R that can do most of this by itself? E.g. zoo, forecast?





PS (unrelated): : Something similar and much simpler - I included it in case anyone was interested:

 library(ggplot2)
library(dplyr)
library(gridExtra)
library(tidyr)



p1 <- ggplot(df_long, aes(x = time, y = value, color = series)) +
    geom_line() +
    theme_minimal() +
    labs(title = "Original Time Series") +
    theme(legend.position = "bottom")

df_wide <- df_long %>%
    pivot_wider(
        id_cols = time,
        names_from = series,
        values_from = value
    )



plot_list <- list()
for(i in 0:7) {
    df_lag <- df_wide %>%
        mutate(Lagging_lag = lag(Lagging, i)) %>%
        na.omit()
    
    cor_val <- cor(df_lag$Leading, df_lag$Lagging_lag)
    
    plot_list[[i+1]] <- ggplot(df_lag, aes(x = Leading, y = Lagging_lag)) +
        geom_point(alpha = 0.6) +
        geom_smooth(method = "lm", color = "black") +
        theme_minimal() +
        labs(
            title = sprintf("Leading vs Lagging (Lag=%d)", i),
            subtitle = sprintf("Correlation: %.3f", cor_val)
        )
}

final_plot <- grid.arrange(
    p1, 
    plot_list[[1]], plot_list[[2]], plot_list[[3]],
    plot_list[[4]], plot_list[[5]], plot_list[[6]],
    plot_list[[7]], plot_list[[8]],
    top = "Time Series Analysis with Lags",
    ncol = 3
)

enter image description here


Solution

  • If the objective here is to align the series we can use dtw from the dtw package - also see packages ptw, rucrdtw, IncDTW and twdtw. See ?dtw and the dtw package vignette for more info. Suggest playing around with the window.type and window.size arguments.

    Not shown here but we could also apply dtw to growth <- lapply(df_wide, diff, 5) in the same way to relate slopes.

    library(dtw)
    
    df_wide <- unstack(df_long, value ~ series)
    d <- with(data.frame(wide), dtw(Leading, Lagging, keep = TRUE,
      window.type = "sakoechiba", window.size = 5))
    
    plot(d, type = "twoway")
    

    screenshot

    Update

    Seems to work better with the revised window.type .