Search code examples
rdplyrreshapeplyrreshape2

Looping to get outliers data using dplyr in R


I have task to find outliers data, here it is my data:

# combination 1
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1")
product <- rep(0, 120)
detail <- rep(0, 120)
status <- rep(0, 120)
channel <- rep(0, 120)
transaction <-c(5664,4797,2515,1744,2166,2164,3513,6548,7620,8662,11295,11372,12094,14064,15412,13042,12779,14653,13586,12922,11321,9709,7899,5916,5791,5544,3567,1783,2900,4488,1830,4946,6735,16673,12024,8614,16545,11628,8856,13660,10913,11928,12359,9267,7672,6487,10677,4271,3351,4264,3764,3313,1492,4324,4277,4928,7752,8940,10545,10488,13766,11594,8317,12139,14274,11617,7513,8215,7687,4374,5465,4548,3419,2136,2679,2714,3072,2984,3203,6689,6113,8923,6755,6968,7711,5305,3827,4341,5915,6554,7376,6707,3685,4366,3086,1277,2218,1089,282 ,156 ,907,1691,2786,5229,6081,7133,8617,9759,12984,15060,11906,15909,21934,14993,9776,9721,8707,8080,2245,4702)
mycomb1 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

# combination 2
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5")
product <- rep(0, 120)
detail <- rep(1, 120)
status <- rep(0, 120)
channel <- rep(1, 120)
transaction <-c(5564,4588,3256,1034,2479,3678,5454,6104,8199,9261,10115,13665,12030,11996,12610,15061,15957,19130,15086,11779,14274,10614,7442,10216,4937,9178,5871,6702,3150,6505,4855,4744,10661,10485,10805,9321,14260,9831,15602,10599,14739,14117,8549,9638,9161,8282,7877,2060,2492,2816,3983,2053,4758,5717,2816,6141,8322,9745,9677,14478,11905,9580,8742,11012,5775,6773,8583,9261,10890,11950,5248,3579,3176,7268,605 ,1642,1122,6046,3241,4189,6534,7445,8518,7585,9574,5453,5467,4302,6664,8297,6801,5637,4323,2963,1872,1466,1472,1129,581 ,275 ,716 ,1963,2333,5507,7601,7478,7760,16975,11986,15282,12122,10815,16060,21552,11587,11873,7778,7058,6153,3423)
mycomb2 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

# combination 3
datex <- c(rep("07/01/2021", 22), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9")
product <- rep(1, 118)
detail <- rep(2, 118)
status <- rep(1, 118)
channel <- rep(2, 118)
transaction <- c(12,120 ,120 ,120 ,140 ,144 ,120 ,112 ,106 ,120 ,150 ,120 ,116 ,120 ,96,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,143 ,120 ,120 ,102 ,96,120 ,120 ,120 ,120 ,125 ,120 ,94,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,200 ,118 ,120 ,120 ,120 ,180 ,120 ,100 ,92,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,140 ,120 ,120 ,165 ,120 ,120 ,120 ,120 ,120 ,120 ,100 ,110 ,120 ,120 ,88,66,120 ,118 ,120 ,120 ,118 ,120 ,120 ,120 ,120 ,120 ,120 ,120 )
mycomb3 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

# my data
mydata <- rbind(mycomb1, mycomb2, mycomb3)
mydata

# A tibble: 358 x 8
#   datex      hourx seller product detail status channel transaction
#   <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
# 1 2021-07-01     0 do1          0      0      0       0        5664
# 2 2021-07-01     1 do1          0      0      0       0        4797
# 3 2021-07-01     2 do1          0      0      0       0        2515
# 4 2021-07-01     3 do1          0      0      0       0        1744
# 5 2021-07-01     4 do1          0      0      0       0        2166
# 6 2021-07-01     5 do1          0      0      0       0        2164
# 7 2021-07-01     6 do1          0      0      0       0        3513
# 8 2021-07-01     7 do1          0      0      0       0        6548
# 9 2021-07-01     8 do1          0      0      0       0        7620
#10 2021-07-01     9 do1          0      0      0       0        8662
# … with 348 more rows

this is addition function to reorder column, to make result become better.

# Function
moveme <- function (invec, movecommand){
  movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\\s+"), function(x) x[x != ""])
  movelist <- lapply(movecommand, function(x){
    Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
    ToMove <- setdiff(x, Where)
    list(ToMove, Where)
  })
  myVec <- invec
  for (i in seq_along(movelist)){
    temp <- setdiff(myVec, movelist[[i]][[1]])
    A <- movelist[[i]][[2]][1]
    if (A %in% c("before", "after")){
      ba <- movelist[[i]][[2]][2]
      if (A == "before"){
        after <- match(ba, temp)-1
      }
      else if (A == "after"){
        after <- match(ba, temp)
      }
    }
    else if (A == "first"){
      after <- 0
    }
    else if (A == "last"){
      after <- length(myVec)
    }
    myVec <- append(temp, values = movelist[[i]][[1]], after = after)
  }
  myVec
}

I want to use looping to get outliers data from many combination data that i have in mydata, this is manual procedure to get outliers data.

  • First, i subset data from mydata.
  • Second, i check seasonality of the data.
  • Third, If it's TRUE, then We use "timetk" package in R to get outliers. But, If it's FALSE, then We use "qcc" package in R instead.
  • collect the outliers data when looping is done.

This is i do manually to get outliers for 1st & 2nd looping.

1ST LOOPING

  • First, i subset data from mydata.
## Looping 1
mydata.comb1 <- subset(mydata, seller == "do1" & product == 0 & detail == 0 & status == 0 & channel == 0)
  • Second, i check seasonality of the data.
# Checking Seasonality
library(seastests)
isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)
#TRUE
  • Third, it's TRUE, then we use then We use "timetk"
library(dplyr)
library(timetk)
mydata.comb1 %>%
  group_by(across(seller:channel)) %>%
  tk_anomaly_diagnostics(datex, transaction) %>%
  ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>% 
  left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5

looping1
# A tibble: 6 x 8
#  datex      hourx seller product detail status channel transaction
#  <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
#1 2021-07-01    14 do1          0      0      0       0       15412
#2 2021-07-02     9 do1          0      0      0       0       16673
#3 2021-07-02    12 do1          0      0      0       0       16545
#4 2021-07-02    22 do1          0      0      0       0       10677
#5 2021-07-05    16 do1          0      0      0       0       21934
#6 2021-07-05    22 do1          0      0      0       0        2245

2ND LOOPING

  • First, i subset data from mydata.
mydata.comb2 <- subset(mydata, seller == "do9" & product == 1 & detail == 2 & status == 1 & channel == 2)
  • Second, i check seasonality of the data.
# Checking Seasonality
library(seastests)
y <- mydata.comb2$transaction
isSeasonal(as.ts(y), test = "wo", freq = 24)
#FALSE
  • Third, it's FALSE, then we use then We use "qcc"
library(dplyr)
library(qcc)
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb2[model.anomaly.non.seasonal.indeks,]
looping2 <- model.anomaly.non.seasonal.result

looping2
#        datex hourx seller product detail status channel transaction
#6  2021-07-01     7    do9       1      2      1       2         144
#11 2021-07-01    12    do9       1      2      1       2         150
#48 2021-07-03     1    do9       1      2      1       2         200
#53 2021-07-03     6    do9       1      2      1       2         180
#94 2021-07-04    23    do9       1      2      1       2         165

Then we collect the result from the looping. Here it is the result:

myresult <- rbind(looping1, looping2)
myresult
# A tibble: 11 x 8
#   datex      hourx seller product detail status channel transaction
# * <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
# 1 2021-07-01    14 do1          0      0      0       0       15412
# 2 2021-07-02     9 do1          0      0      0       0       16673
# 3 2021-07-02    12 do1          0      0      0       0       16545
# 4 2021-07-02    22 do1          0      0      0       0       10677
# 5 2021-07-05    16 do1          0      0      0       0       21934
# 6 2021-07-05    22 do1          0      0      0       0        2245
# 7 2021-07-01     7 do9          1      2      1       2         144
# 8 2021-07-01    12 do9          1      2      1       2         150
# 9 2021-07-03     1 do9          1      2      1       2         200
#10 2021-07-03     6 do9          1      2      1       2         180
#11 2021-07-04    23 do9          1      2      1       2         165

I have 120K looping, it's going to be painfull if i do manually. So, how do i use dplyr to make this looping procedure more simple? Many Thank You.


Solution

  • It might be possible to optimise the code further but putting the code that you already have in a function you can do -

    library(dplyr)
    library(seastests)
    library(timetk)
    library(qcc)
    library(purrr)
    
    custom_fn <- function(mydata.comb1) {
      if(isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)) {
        mydata.comb1 %>%
          group_by(across(seller:channel)) %>%
          tk_anomaly_diagnostics(datex, transaction) %>%
          ungroup -> model.anomaly.seasonal
        model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
        model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
        model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
        colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
        model.anomaly.seasonal.data3 %>% 
          left_join(mydata.comb1) -> model.anomaly.seasonal.data4
        model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
        looping1 <- model.anomaly.seasonal.data5
      } else {
        y <- mydata.comb1$transaction
        model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
        model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
        model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
        model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
        model.anomaly.non.seasonal.result <- mydata.comb1[model.anomaly.non.seasonal.indeks,]
        looping1 <- model.anomaly.non.seasonal.result
      }
      return(looping1)
    }
    

    Split the data into groups and apply this functions to every subset.

    mydata %>%
      group_split(seller, product, detail, status, channel) %>%
      map_df(custom_fn)
    
    #   datex      hourx seller product detail status channel transaction
    #   <date>     <dbl> <chr>    <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
    # 1 2020-07-01    14 do1          0      0      0       0       15412
    # 2 2020-07-02     9 do1          0      0      0       0       16673
    # 3 2020-07-02    12 do1          0      0      0       0       16545
    # 4 2020-07-02    22 do1          0      0      0       0       10677
    # 5 2020-07-05    16 do1          0      0      0       0       21934
    # 6 2020-07-05    22 do1          0      0      0       0        2245
    # 7 2020-07-03    16 do5          0      1      0       1        5775
    # 8 2020-07-05    11 do5          0      1      0       1       16975
    # 9 2020-07-05    17 do5          0      1      0       1       21552
    #10 2020-07-01     7 do9          1      2      1       2         144
    #11 2020-07-01    12 do9          1      2      1       2         150
    #12 2020-07-03     1 do9          1      2      1       2         200
    #13 2020-07-03     6 do9          1      2      1       2         180
    #14 2020-07-04    23 do9          1      2      1       2         165