Search code examples
Tags: r, ggplot2, histogram, cumulative-sum

Why is the cumulative line above the histogram incorrect with `ggplot2`?


I want to overlay the cumulative share of a column with ggplot but the percentages are incorrect

Indeed, you can see that the red line starts at 95% whereas the first bin is below at around 82%

# Example data for the question: a data frame with a single numeric column
# `col` and 100 rows. The values are heavily right-skewed (most are below 1,
# a few exceed 100). The literal looks like `dput()` output and is kept
# verbatim so the example stays reproducible.
df <- structure(list(col = c(1.42221064814815, 0.709669201387851, 0.00864583333333333,  3.35221946759356, 0.0138087731489429, 0.101736111111111, 0.000459247684037244,  0.0291767592590164, 0.171842569443915, 0.171538472222509, 0.0708449074074074,  0.0234837962962963, 0.25262748842714, 0.386477071758774, 125.055696030094,  0.0696409606492078, 0.0938078703703704, 0.192905092592593, 0.0031709722208756,  0.227335300925705, 0.0134506134247338, 0.040787037037037, 0.266623020834393,  0.00225040509193032, 0.473669131944577, 0.130830208333554, 3.61516203703704,  0.130288240741248, 0.536915474536794, 0.00138538194475351, 0.0113888888888889,  3.26379307870236, 0.12810640046166, 0.392849537037589, 0.71517319444429,  0.112205289351167, 0.431413553241226, 0.0178086342579789, 2.69385361110999,  0.220277777777778, 0.00206320601756926, 0.0808217592592593, 0.13211086805496,  1.90881438657365, 2.04585710648033, 0.845706018518518, 0.0741087962962963,  0.428182499999249, 0.00403622685207261, 0.0592311111120162, 0.0682201851849203,  1.24485666666594, 0.0189236111111111, 0.0453356481481481, 7.11538414351918,  0.0155092592592593, 0.0541087962962963, 0.0759213078711872, 0.00378994212934264,  0.00767912037118717, 0.0622061574072749, 22.5055494907416, 0.0707319328713196,  0.0851041666666667, 0.285934664353176, 0.0116175694432524, 0.709232141204454,  1.05187328703701, 0.0052125925929458, 0.112268171296627, 0.0400231481481481,  0.0341140393526466, 0.225503703703483, 0.0834027777777778, 0.929739918981989,  0.403400393517481, 0.0825652893522271, 0.458994571759745, 0.07600548611195,  0.0985681712958548, 0.0385900578703041, 0.359117986110074, 0.922757222221957,  186.031066087962, 2.39154376157456, 0.499594907407407, 0.0130671296296296,  2.86927083333333, 0.00584490740740741, 0.619270625001302, 0.0142964004642434,  0.0854832175925926, 1.39854887731373, 1.51077546296296, 0.00819540509195239,  0.750400266203063, 233.781311967594, 0.340315266204653, 0.879955011573103,  2.82027777777778)), row.names = 
c(NA, -100L), class = "data.frame")

library(ggplot2)
library(dplyr)

# Cumulative-share histogram of `df$col` with a red line meant to trace the
# same cumulative share.
#
# NOTE: the two layers are binned differently on purpose here. The histogram
# uses explicit unit-width breaks, while the line falls back to stat_bin's
# default of 30 bins. That mismatch is what this question is about, so it is
# left in place.
df %>%
  ggplot(aes(x = col)) +
  geom_histogram(
    aes(y = after_stat(cumsum(count / sum(count)))),
    # `0:max(x)` stops at floor(max(x)), which leaves the largest observation
    # above the last break; round the upper limit up so every value is binned.
    # `binwidth` is omitted because explicit `breaks` already define the bins.
    breaks = seq(0, ceiling(max(df$col, na.rm = TRUE)), by = 1),
    fill = "blue",
    color = "black"
  ) +
  geom_line(
    stat = "bin",
    aes(y = after_stat(cumsum(count / sum(count)))),
    color = "red"
  ) +
  # Zoom without dropping data, so the cumulative sums still use all rows.
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 1)) +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  # A single y scale: adding a second one would just replace the first.
  scale_y_continuous(breaks = seq(0, 1, by = 0.1), labels = scales::percent)

enter image description here


Solution

  • The issue is that geom_line uses the default number of bins (= 30), i.e. its binwidth is computed as diff(range(x)) / 30, whereas for geom_histogram you have set unit-width bins (binwidth = 1).

    If you want the same counts, then you have to use the same binning for both layers.

    Additionally note that for your case there is no need to set the breaks. You could use boundary= or center= to set the starting position for the bins. For geom_line it's a bit more involved. Here I use stage to shift the x position of the line after the stat has been applied. But you could also use position = position_nudge(x = -.5) to achieve the same result.

    library(ggplot2)

    # Cumulative-share histogram plus a matching line. Both layers get the
    # same `binwidth` and `boundary`, so they are computed from identical bins.
    ggplot(data = df, mapping = aes(x = col)) +
      geom_histogram(
        mapping = aes(y = after_stat(cumsum(count / sum(count)))),
        binwidth = 1,
        boundary = 0,
        fill = "blue",
        color = "black"
      ) +
      geom_line(
        mapping = aes(
          # Bin on `col`, then move the computed x by half a bin width.
          x = stage(start = col, after_stat = x - 0.5),
          y = after_stat(cumsum(count / sum(count)))
        ),
        stat = "bin",
        binwidth = 1,
        boundary = 0,
        color = "red"
      ) +
      # Zoom only; the data outside the window still feeds the cumulative sum.
      coord_cartesian(xlim = c(0, 10), ylim = c(0, 1)) +
      scale_x_continuous(breaks = 0:10) +
      scale_y_continuous(
        labels = scales::percent,
        breaks = seq(0, 1, by = 0.1)
      )
    

    EDIT: If you want the line to start at (0, 0), then the easiest way would be to switch to geom_freqpoly, which by default extends the line:

    library(ggplot2)
    # Spell out `warn.conflicts`; `warn = FALSE` only works through partial
    # argument matching.
    library(dplyr, warn.conflicts = FALSE)

    # Cumulative-share histogram with a frequency polygon on top. Both layers
    # share `binwidth = 1` and `boundary = 0`, so they count the same bins.
    df |>
      ggplot(aes(x = col)) +
      geom_histogram(
        aes(y = after_stat(cumsum(count / sum(count)))),
        binwidth = 1, fill = "blue", color = "black",
        boundary = 0
      ) +
      # geom_freqpoly is used instead of geom_line(stat = "bin") because it
      # extends the line by default, which is what lets it start at (0, 0).
      geom_freqpoly(
        aes(
          # Bin on `col`, then shift the computed x by half a bin width so
          # each vertex lines up with the right edge of its bar.
          x = stage(col, after_stat = x + .5),
          y = after_stat(cumsum(count / sum(count)))
        ),
        binwidth = 1, color = "red",
        boundary = 0
      ) +
      # Zoom only; rows outside the window still count towards the totals.
      coord_cartesian(xlim = c(0, 10), ylim = c(0, 1)) +
      scale_x_continuous(breaks = seq(0, 10, by = 1)) +
      scale_y_continuous(
        breaks = seq(0, 1, by = 0.1),
        labels = scales::percent
      )
    

    enter image description here