Search code examples
rdplyrmediandate-conversiondbplyr

Calculating median value for months for several years


I have recently started my adventure with R and trying to solve the following problem. I have data.frame including arrivals and departures for particular months of the year. I have to find what is the median for every month through all this years. My results should be saved in .csv. Below is just the sample, the whole observations include dates up to 2017 (1548 obs. in total):

#dput output assigned to the flights variable
flights <- structure(list(X = 1:163, ReportPeriod = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 
15L, 15L, 15L, 15L, 15L, 15L, 15L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 6L), .Label = c("01/01/2006 12:00:00 AM", "01/01/2007 12:00:00 AM", 
"02/01/2006 12:00:00 AM", "02/01/2007 12:00:00 AM", "03/01/2006 12:00:00 AM", 
"03/01/2007 12:00:00 AM", "04/01/2006 12:00:00 AM", "05/01/2006 12:00:00 AM", 
"06/01/2006 12:00:00 AM", "07/01/2006 12:00:00 AM", "08/01/2006 12:00:00 AM", 
"09/01/2006 12:00:00 AM", "10/01/2006 12:00:00 AM", "11/01/2006 12:00:00 AM", 
"12/01/2006 12:00:00 AM"), class = "factor"), FlightType = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 1L), .Label = c("Charter", "Commuter", "Scheduled"), class = "factor"), 
 Arrival_Departure = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 
 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 
 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 
 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 
 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 
 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
 2L, 1L, 1L, 2L, 2L, 1L), .Label = c("Arrival", "Departure"
 ), class = "factor"), Domestic_International = structure(c(1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("Domestic", 
 "International"), class = "factor"), FlightOpsCount = c(57L, 
 299L, 62L, 5L, 5996L, 31L, 5995L, 31L, 13695L, 4026L, 13666L, 
 4085L, 22L, 101L, 21L, 100L, 5525L, 28L, 5525L, 28L, 12308L, 
 3381L, 12322L, 3398L, 51L, 4L, 47L, 5L, 6217L, 31L, 6216L, 
 30L, 13925L, 3952L, 13963L, 3961L, 57L, 2L, 52L, 5885L, 31L, 
 5886L, 31L, 13420L, 3884L, 13461L, 3897L, 26L, 5L, 24L, 3L, 
 6089L, 41L, 6089L, 41L, 14126L, 3968L, 14138L, 3984L, 18L, 
 1L, 18L, 5980L, 39L, 5980L, 39L, 14208L, 4030L, 14211L, 4028L, 
 16L, 2L, 14L, 6092L, 39L, 6092L, 39L, 14866L, 4483L, 14883L, 
 4506L, 15L, 1L, 16L, 6134L, 40L, 6134L, 40L, 15243L, 4303L, 
 15272L, 4328L, 24L, 1L, 18L, 5903L, 37L, 5903L, 37L, 13832L, 
 3823L, 13815L, 3865L, 48L, 285L, 50L, 6135L, 40L, 6135L, 
 40L, 14292L, 3605L, 14332L, 3898L, 48L, 3L, 50L, 2L, 5772L, 
 39L, 5772L, 39L, 13855L, 3706L, 13855L, 3718L, 46L, 6L, 44L, 
 3L, 6144L, 40L, 6144L, 40L, 14417L, 4086L, 14474L, 4103L, 
 38L, 3L, 42L, 3L, 6081L, 94L, 6081L, 94L, 14140L, 4301L, 
 14165L, 4308L, 14L, 7L, 16L, 5L, 5470L, 83L, 5470L, 83L, 
 12734L, 3779L, 12768L, 3783L, 33L)), .Names = c("X", "ReportPeriod", 
"FlightType", "Arrival_Departure", "Domestic_International", 
"FlightOpsCount"), class = "data.frame", row.names = c(NA, -163L
))

And here is the expected output I should get:

month,Arrival_Departure,FlightOpsCount
January,Arrival,23152
January,Departure,23173
February,Arrival,20849
February,Departure,20878
March,Arrival,23981.5
March,Departure,24005
April,Arrival,23393
April,Departure,23451.5
May,Arrival,24342.5
May,Departure,24376
June,Arrival,24622.5
June,Departure,24667
July,Arrival,25795.5
July,Departure,25837
August,Arrival,25573.5
August,Departure,25600
September,Arrival,23306
September,Departure,23315
October,Arrival,23965
October,Departure,23990
November,Arrival,22379
November,Departure,22361
December,Arrival,23594
December,Departure,23579

I decided to do it in several steps, first thing I try to do is to receive correct date format from character string:

library(dbplyr)
step_1 = as_tibble(flights)

step_2 = step_1 %>%
  transmute(
    date_format = as.POSIXct(strptime(ReportPeriod, format = "%m/%d/%Y")),
    even_new_date = as.Date(date_format, format = "%Y"),
    Arrival_Departure, 
    FlightOpsCount)

And this is actually tricky for me.. I don't understand how to do this correctly and why there are two ways to get date format eg 2006-01-01 vs 2005-12-31? Which one is correct in this case?

Now, suppose 2006-01-01 is correct, I could use as.POSIXct within months() function to obtain month:

step2 = step_1 %>%
transmute(
month = months(as.POSIXct(strptime(ReportPeriod, format = "%m/%d/%Y"))), 
Arrival_Departure, 
FlightOpsCount)

Next step would require grouping operation:

step_3 = step_2 %>%
  group_by(month, Arrival_Departure) %>% 
  summarize(median = median(FlightOpsCount))

And when writing it to csv I'm getting ridiculously small values..

"","month","Arrival_Departure","median"
"1","April","Arrival",102.5
"2","April","Departure",3061
"3","August","Arrival",1412.5
"4","August","Departure",3667.5
"5","December","Arrival",102
"6","December","Departure",1738
"7","February","Arrival",116
"8","February","Departure",116
"9","January","Arrival",284
"10","January","Departure",1708
"11","July","Arrival",95.5
"12","July","Departure",3571
"13","June","Arrival",119
"14","June","Departure",3292
"15","March","Arrival",115
"16","March","Departure",1759
"17","May","Arrival",1609.5
"18","May","Departure",3121
"19","November","Arrival",93.5
"20","November","Departure",93.5
"21","October","Arrival",2359
"22","October","Departure",2756
"23","September","Arrival",1228
"24","September","Departure",3187.5

Can somebody guide me through it and show me the proper way to solve the problem?

I would appreciate any help.


Solution

  • You're almost there, though I would recommend using dplyr:

    # Step 1: Convert dates using as.Date function
    flights$ReportPeriod <- as.Date(flights$ReportPeriod, "%m/%d/%Y")
    
    # Step 2: Use dplyr to summarize information
    require(dplyr)
    flights <- flights %>% 
                 group_by(ReportPeriod, Arrival_Departure) %>%
                 summarise(FlightOpsCount = median(FlightOpsCount)) %>% 
                 as.data.frame() 
    
    # Step 3: Convert date to string for month name
    flights <- flights %>%
                 mutate(ReportPeriod = months(ReportPeriod)) %>%
                 rename(month = ReportPeriod) # If you need to rename the column to be "months"
    
    
    # Alternate Step 3: If you want to add in year as well
    require(lubridate)
    flights <- flights %>%
                 mutate(ReportPeriod = paste(months(ReportPeriod), 
                                             year(ReportPeriod), 
                                             sep = " ")) %>%
                 rename(month = ReportPeriod) # If you need to rename the column to be "months"
    
    # Step 4: Write to csv
    write.csv(flights, "file_name.csv", row.names = FALSE)
    

    Cheers.