Search code examples
rdplyrsum

{dplyr} Summarise not working as expected


I am trying to get the sum of all values for each year. Using this code, the sum is NA for all years. I don't understand why.

-code

df %>%
  group_by(`@TIME_PERIOD`) %>%
  summarise(sum = sum(as.numeric(`@OBS_VALUE`)))

Consider this data:

df <- structure(list(`@TIME_PERIOD` = c("2009", "2010", "2011", "2012", 
"2013", "2014", "2015", "2016", "2017", "2018", "2019", "2009", 
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", 
"2018", "2019", "2009", "2010", "2011", "2012", "2013", "2014", 
"2015", "2016", "2017", "2018", "2019", "2009", "2010", "2011", 
"2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", 
"2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", 
"2017", "2018", "2019", "2010", "2011", "2012", "2013", "2014", 
"2015", "2016", "2017", "2018", "2019", "2009", "2010", "2011", 
"2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", 
"2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", 
"2017", "2018", "2019", "2009", "2010", "2011", "2012", "2013", 
"2014", "2015", "2016", "2017", "2018", "2019", "2009", "2010", 
"2011", "2012", "2013", "2014", "2009", "2010", "2011", "2012", 
"2013", "2014", "2015", "2016", "2017", "2018", "2019", "2009", 
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", 
"2018", "2019", "2009", "2010", "2011", "2012", "2013", "2014", 
"2015", "2016", "2017", "2018", "2019", "2009", "2010", "2011", 
"2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", 
"2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", 
"2017", "2018", "2019", "2009", "2010", "2011", "2012", "2013", 
"2014", "2015", "2016", "2017", "2018", "2019", "2009", "2010", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2014", "2015", "2016", "2017", "2018", "2019"), `@OBS_VALUE` = c("199.5231", 
"812.4096", "1165.8684449076", "928.150124526", "1445.84844", 
"1296.41598", "1514.59944", "1334.4906", "1647.35848", "1558.9175", 
"1388.63474", "80.38548", "121.1078232", "92.9589516", "127.3524462", 
NA, NA, NA, NA, NA, NA, "36.432936031836", "91.7402892000001", 
"94.1165832", "81.9491565", "90.90666", "102.32922", NA, "168.7485", 
"279.0792996", "417.2796448", "258.4036", "253.9805188", "17.43126", 
"22.44816", "21.73752", "20.84652", "28.8259482", "19.2811221", 
"43.6274751", "50.8128905", "39.2830715", "25.688075", "34.5703882", 
"704.800746172167", "770.576143217993", "642.352476546557", "220.123395925455", 
"136.5309", "291.384", "371.2467", "441.6679", "687.1989", "588.53", 
"527.998", "1248.1427978532", "2236.4275546323", "1059.3077546304", 
"6419.170982289", "5615.471576799", "4760.2082595648", "5757.9532837402", 
"5711.6219394629", "5889.541005575", "6443.9571653108", "20514.72024", 
"20516.0148", "28807.13082", "21230.33346", "24691.54431", "19212.64686", 
"21936.54291", "16742.37571", "19168.29197", "16405.56", "24145.01152", 
"11.5248", "5.3448", "12.939", "11.8746", "39.9939", "13.3551", 
"8.7096", "10.541", "7.1958", "21.755", "46.0594", "8413.31810629381", 
"7354.3811714922", "8309.5456112583", "8820.1613664864", "2307.2343", 
"2272.7952", "2100.1023", "2031.2507", "2988.6556", "4650.99", 
"4731.7608", "196.778757", "979.6497282", "88.205163", "2587.87116", 
"909.93018", "817.93917", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, "1002.6576", "1098.3564", "5739.86979", "9012.96160010525", 
"10630.3934965252", "7990.07264989088", "7300.22114094787", "7230.71730579791", 
"6725.87228037132", "7531.87672425818", "6769.05856033655", "2.8812", 
"60.129", "50.33271", "69.40044", "70.4488011261", "80.7924921111", 
"67.280691057", "93.6984124631", "142.3116927921", "128.028766965", 
"128.9077672686", "74.7972190797301", "184.378184891178", "179.521916167317", 
"98.8463058489", "86.3206272", "102.3911235", "96.0864846", "105.0379027", 
"114.4432025", "138.2473", "124.1121086", "1168.90284", "1003.61982", 
"1103.17914", "1494.35244", "13179.64497", "12590.82405", "12312.1083", 
"13312.86136", "18590.3493", "19435.8025", "24003.6878", "217.31451", 
"429.05382", "337.203279", "436.906116", "249.299907", "338.903874", 
"242.192202", "162.900614", "193.387125", "297.7229", "196.055768", 
NA, NA, NA, "6252.6366", NA, NA, NA, NA, NA, NA, NA, "43161.3172724031", 
"51812.2550911566", "49162.8700131921", "65072.7542580131", "59170.898252275", 
"31336.1517258698"), `@REF_AREA` = c("AT", "AT", "AT", "AT", 
"AT", "AT", "AT", "AT", "AT", "AT", "AT", "MT", "MT", "MT", "MT", 
"MT", "MT", "MT", "MT", "MT", "MT", "MT", "SK", "SK", "SK", "SK", 
"SK", "SK", "SK", "SK", "SK", "SK", "SK", "EE", "EE", "EE", "EE", 
"EE", "EE", "EE", "EE", "EE", "EE", "EE", "FI", "FI", "FI", "FI", 
"FI", "FI", "FI", "FI", "FI", "FI", "FI", "IT", "IT", "IT", "IT", 
"IT", "IT", "IT", "IT", "IT", "IT", "BE", "BE", "BE", "BE", "BE", 
"BE", "BE", "BE", "BE", "BE", "BE", "LV", "LV", "LV", "LV", "LV", 
"LV", "LV", "LV", "LV", "LV", "LV", "ES", "ES", "ES", "ES", "ES", 
"ES", "ES", "ES", "ES", "ES", "ES", "NL", "NL", "NL", "NL", "NL", 
"NL", "CY", "CY", "CY", "CY", "CY", "CY", "CY", "CY", "CY", "CY", 
"CY", "FR", "FR", "FR", "FR", "FR", "FR", "FR", "FR", "FR", "FR", 
"FR", "GR", "GR", "GR", "GR", "GR", "GR", "GR", "GR", "GR", "GR", 
"GR", "SI", "SI", "SI", "SI", "SI", "SI", "SI", "SI", "SI", "SI", 
"SI", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", 
"DE", "PT", "PT", "PT", "PT", "PT", "PT", "PT", "PT", "PT", "PT", 
"PT", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", 
"IE", "LU", "LU", "LU", "LU", "LU", "LU")), row.names = c(NA, 
-187L), class = "data.frame")

Solution

  • There are some NA elements already in the dataset

    which(is.na(df$`@OBS_VALUE`))
    #[1]  16  17  18  19  20  21  28 105 106 107 108 109 110 111 112 113 114 115 171 172 173 175 176 177 178 179 180 181
    

    It can be solved by na.rm = TRUE in sum

    library(dplyr)
    df %>%
        group_by(`@TIME_PERIOD`) %>%
        summarise(sum = sum(as.numeric(`@OBS_VALUE`), na.rm = TRUE))
    

    -output

    # A tibble: 11 x 2
    #   `@TIME_PERIOD`     sum
    # * <chr>            <dbl>
    # 1 2009            32697.
    # 2 2010            34700.
    # 3 2011            48869.
    # 4 2012            52462.
    # 5 2013            60298.
    # 6 2014            93804.
    # 7 2015           102734.
    # 8 2016            96716.
    # 9 2017           121506.
    #10 2018           116102.
    #11 2019           100166.
    

    Conversion to numeric can be done before the grouping as well

    df %>%
        na.omit %>%
        type.convert(as.is = TRUE) %>%
        group_by(`@TIME_PERIOD`) %>%
        summarise(Sum = sum(`@OBS_VALUE`))
    

    Some suggestions to follow (not in any order) when receiving a dataset

    1. check the structure - str(data) - gives info about the data types
    2. check for missing values - colSums(is.na(data))
    3. check for documentation of proposed function - help(sum) or ?sum- give an idea about all the arguments and expected input data types etc.
    4. check for examples provided in documentation - to find if there are any differences in the format of input data with respect to the one showed in examples (not relevant in this case)
    5. Finally, make sure that the packages and R version are uptodate