Search code examples
rdplyr

how to subset a varible in R inside summarize


for some reason this very basic subset is not working. Can anyone replicate the error that I find?

I simply trying to summarize a variable that meets a condition.

library(tidyverse)

dat |> 
    summarise(
    invoice_usd=sum(invoice_usd,na.rm=TRUE)
    ,invoice_usd_overdue = sum(invoice_usd[overdue_indicator==0],na.rm = TRUE)
  )

I get the following results

Invoice_usd
<dbl>
invoice_usd_overdue
<dbl>
3525924 3525924 

Both are equal to eachother?

here is the data

structure(list(invoice_usd = c(768.3, 33116.65, 29801.35, 15548.67, 
748.8, 43506.0981401951, 34126.59, 110.88, 32519.93, 3814.10450994275, 
123.910066653647, 9056.81686889635, 5280.35381654576, 1497.6, 
25694.7, 34762.41, 30708.78, 30266.71, 5642.53296230612, 23046.73, 
29233.7, 31234.7372533846, 38239.4645517168, 1191.25, 23819.49, 
32331.53, 30703.51, 13821.3265705642, 62096.6980502484, 32225.3, 
26647.2826958931, 35083.8675795201, 74097.9979256714, 34446.39, 
43377.5, 33340.4, 31485.24, 46739.5144071148, 31732.517590235, 
22900.47, 32635.43, 20213.4230960573, 1884.68, 296.25, 1001.52, 
31112.93, 27884.8, 31015.68, 27337.04, 33645.83, 955, 218.85, 
14113.8, 18407.59, 29783.1196862457, 44048.0748905081, 64022.9197862025, 
32710.63, 32199.65, 21611.697431022, 1014229.17, 11063.370500554, 
8283.50645227789, 6189.65, 823.67, 61586.7668701702, 70256.7541546641, 
89136.0099085984, 8735.09297677894, 17974.67, 3552, 7684.67, 
28476.61, 35574.43, 16096.93, 6199.98162780545, 44263.8256750224, 
12617.6797342677, 23698.2896894991, 32507.4, 12675.12, 16298.58, 
30472, 35333.2, 33603.44, 29568.23, 31927.3, 43898.4205715362, 
25534.61, 34515.19, 29304.43, 33724.18, 25941.2, 171.697042400774, 
29855.3747983556, 1225.08, 31576.81, 30970.71, 19705.21, 30684.18
), overdue_indicator = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
0, 0, 0, 0)), row.names = c(NA, -100L), class = c("tbl_df", "tbl", 
"data.frame"))

Solution

  • Well, the problem is updated value of invoice_usd in summarise call is used which is a single value. If you calculate invoice_usd_overdue first it would work as expected.

    library(dplyr)
    
    dat |> 
      summarise(
        invoice_usd_overdue = sum(invoice_usd[overdue_indicator == 0],na.rm = TRUE), 
        invoice_usd =sum(invoice_usd,na.rm=TRUE)
      )
    
    # A tibble: 1 × 2
    #  invoice_usd_overdue invoice_usd
    #                <dbl>       <dbl>
    #1            3239980.    3525924.
    

    Or change the name of first value.

    dat |> 
      summarise(
        invoice_usd_new =sum(invoice_usd,na.rm=TRUE),
        invoice_usd_overdue = sum(invoice_usd[overdue_indicator == 0],na.rm = TRUE)
      )