Search code examples
rdplyroutliers

How to Calculate mean, median, max and min values for group by 15 days without outlier effect in R


I have data of various machines which looks something like the one in the pic given below Input Sample Data

Input Sample

What I need to do is calculate mean, median, max, and minimum based on groups created by every 15 days eg: group 1 = July 1-15 , group 2 = July 15-31, etc. But while calculating the data it should not consider the outlier values

the output should look like this Output Sample

i have used the following code to aggregate minute wise data to Day level first

DipDay = aggregate(Dipminute2[,c(4:18)], by=list(Dipminute2$Date), sum)
DipDay[,c(2:9)] = round(DipDay[,c(2:9)]*9*38/100000,2)
DipDay[,c(10:16)] = round(DipDay[,c(10:16)]*12*38/100000,2)
DipDay<-DipDay%>%rename("Date"="Group.1")
DipDay$Month = month(ymd(DipDay$Date) ,label = TRUE) 
DipDay$MonthNumber = month(ymd(DipDay$Date)) 
DipDay$Day = day(ymd(DipDay$Date)) 
DipDay$Biweek = as.factor(ifelse(DipDay$Day < 16, "1st_15_days", "last_15_days"))
DipDay$MonthBiweek = paste(DipDay$Month, "_", DipDay$Biweek)
Dip = DipDay[c(21,2:16)]
BiweekDip = aggregate(DipDay[c(2:16,18)], by=list(Biweek=DipDay$MonthBiweek), FUN=median)
BiweekDip = BiweekDip%>%arrange((BiweekDip$MonthNumber))
BiweekDipT= as.data.frame(t(BiweekDip[,c(1:16)]))
BiweekDipT= janitor::row_to_names(BiweekDipT,row_number = 1)
write.csv(BiweekDipT,"Fornightly Dipping Median Count.csv")

Sample input data looks like this for first 10 rows

    structure(list(Timestamp = structure(c(1593714765.03054, 1593714824.99918, 
1593714884.96776, 1593714945.01468, 1593715005.01454, 1593715064.98314, 
1593715125.03002, 1593715185.18608, 1593715244.99822, 1593715304.96657
), class = c("POSIXct", "POSIXt"), tzone = "Asia/Kolkata"), Date = structure(c(18446, 
18446, 18446, 18446, 18446, 18446, 18446, 18446, 18446, 18446
), class = "Date"), Time = c("00:02", "00:03", "00:04", "00:05", 
"00:06", "00:07", "00:08", "00:09", "00:10", "00:11"), `HCM 18` = c(7, 
8, 7, 8, 7, 7, 8, 7, 8, 7), `HCM 19` = c(8, 8, 8, 8, 8, 7, 8, 
8, 8, 8), `HCM 20` = c(7, 7, 7, 7, 8, 7, 4, 8, 7, 7), `HCM 21` = c(8, 
7, 8, 8, 8, 7, 8, 8, 8, 8), `HCM 22` = c(8, 7, 8, 7, 7, 8, 7, 
8, 7, 7), `HCM 23` = c(8, 7, 8, 8, 8, 7, 8, 8, 8, 8), `HCM 24` = c(7, 
7, 7, 6, 7, 7, 7, 6, 4, 0), `HCM 25` = c(8, 8, 8, 7, 8, 8, 8, 
7, 8, 8), `HCM 54` = c(7, 8, 8, 7, 6, 8, 7, 8, 8, 7), `HCM 55` = c(8, 
7, 8, 7, 8, 8, 7, 8, 8, 7), `HCM 56` = c(8, 8, 8, 7, 8, 8, 7, 
8, 8, 7), `HCM 57` = c(8, 8, 7, 8, 8, 7, 8, 8, 7, 8), `HCM 58` = c(8, 
8, 7, 8, 7, 8, 7, 8, 7, 8), `HCM 59` = c(8, 1, 6, 8, 8, 7, 8, 
8, 7, 8), `HCM 60` = c(8, 7, 8, 8, 7, 8, 8, 7, 8, 8)), row.names = c(NA, 
10L), class = "data.frame")

Solution

  • Try :

    library(dplyr)
    
    result <- df %>%
                       #Get month name
                mutate(month = format(Timestamp, '%b'), 
                       #get day of the month
                       day = as.integer(format(Timestamp, '%d')), 
                       #assign labels based on dat of the month
                       day = ifelse(day > 15, 'last_15', 'first_15')) %>%
      #combine month name and labels
      tidyr::unite(Biweek, month, day) %>%
      #For each 15 days
      group_by(Biweek) %>%
      #Change outliers to NA
      mutate(across(`HCM 18`:`HCM 60`, ~replace(., . < quantile(., 0.01) | 
                                               . > quantile(., 0.99), NA))) %>%
      #Calculate min, median, mean, max for each column for every 15 days.
      summarise(across(`HCM 18`:`HCM 60`, list(min = ~min(., na.rm = TRUE), 
                                               median = ~median(., na.rm = TRUE), 
                                               mean = ~mean(., na.rm = TRUE), 
                                               max = ~max(., na.rm = TRUE))))