Search code examples
rdata.tableapplylapplyany

R data.table divide set of columns and flag using any


I am working on a data set which is large and having many columns. I am using data.table to speed up the calculations. However at certain points I am not sure how to go about and convert my data.table back to data.frame and do the calculation. This slows up the process. It would help a lot to have suggestions on how I can write the below in data.table. Below is a snap of my code on a dummy data -

library(data.table)

#### set the seed value
set.seed(9901)

#### create the sample variables for creating the data
p01 <- sample(1:100,1000,replace = T)
p02 <- sample(1:100,1000,replace = T)
p03 <- sample(1:100,1000,replace = T)
p04 <- sample(1:100,1000,replace = T)
p05 <- sample(1:100,1000,replace = T)
p06 <- sample(1:100,1000,replace = T)
p07 <- sample(1:100,1000,replace = T)

#### create the data.table
data <- data.table(cbind(p01,p02,p03,p04,p05,p06,p07))

###user input for last column
lcol <- 6

###calculate start column as last - 3
scol <- lcol-3

###calculate average for scol:lcol
data <- data[,avg:= apply(.SD,1,mean,na.rm=T),.SDcols=scol:lcol]

###converting to data.frame since do not know the solution in data.table
data <- as.data.frame(data)

###calculate the trend in percentage
data$t01 <- data[,lcol-00]/data[,"avg"]-1
data$t02 <- data[,lcol-01]/data[,"avg"]-1
data$t03 <- data[,lcol-02]/data[,"avg"]-1
data$t04 <- data[,lcol-03]/data[,"avg"]-1
data$t05 <- data[,lcol-04]/data[,"avg"]-1

###converting back to data.table
data <- as.data.table(data)

###calculate the min and max for the trend
data1 <- data[,`:=` (trend_min = apply(.SD,1,min,na.rm=T),
                     trend_max = apply(.SD,1,max,na.rm=T)),.SDcols=c(scol:lcol)]

###calculate flag if any of t04 OR t05 is an outlier for min and max values. This would be many columns in actual data

data1$flag1 <- ifelse(data1$t04 < data1$trend_min | data1$t04 > data1$trend_max,1,0)
data1$flag2 <- ifelse(data1$t05 < data1$trend_min | data1$t05 > data1$trend_max,1,0)

data1$flag <- ifelse(data1$flag1 == 1 | data1$flag2 == 1,1,0)

So basically, how can I -

  1. calculate the percentages based on user input of column index. Note it is not simple divide but percentage

  2. How can I create the flag variable....I think I need to use any function but not sure how....


Solution

  • Some steps can be made more efficient, i.e. instead of using the apply with MARGIN = 1, the mean, min, max can be replaced with rowMeans, pmin, pmax

    library(data.table)
    data[ , avg:= rowMeans(.SD, na.rm = TRUE) ,.SDcols=scol:lcol]
    data[,   sprintf('t%02d', 1:5) := lapply(.SD, function(x) x/avg -1), 
              .SDcol = patterns("^p0[1-5]")]
    data[,`:=` (trend_min = do.call(pmin, c(.SD,na.rm=TRUE)),
                trend_max =  do.call(pmax, c(.SD,na.rm=TRUE)) ),.SDcols=c(scol:lcol)]
    data
    #      p01 p02 p03 p04 p05 p06 p07   avg         t01         t02        t03         t04        t05 trend_min trend_max
    #   1:  35  53  22  82 100  59  69 65.75 -0.46768061 -0.19391635 -0.6653992  0.24714829  0.5209125        22       100
    #   2:  78  75  15  65  70  69  66 54.75  0.42465753  0.36986301 -0.7260274  0.18721461  0.2785388        15        70
    #   3:  15  45  27  61  63  75  99 56.50 -0.73451327 -0.20353982 -0.5221239  0.07964602  0.1150442        27        75
    #   4:  41  80  13  22  63  84  17 45.50 -0.09890110  0.75824176 -0.7142857 -0.51648352  0.3846154        13        84
    #   5:  53   9  75  47  25  75  66 55.50 -0.04504505 -0.83783784  0.3513514 -0.15315315 -0.5495495        25        75
    #  ---                                                                                                                
    # 996:  33  75   9  61  74  55  57 49.75 -0.33668342  0.50753769 -0.8190955  0.22613065  0.4874372         9        74
    # 997:  24  68  74  11  43  75  37 50.75 -0.52709360  0.33990148  0.4581281 -0.78325123 -0.1527094        11        75
    # 998:  62  78  82  97  56  50  74 71.25 -0.12982456  0.09473684  0.1508772  0.36140351 -0.2140351        50        97
    # 999:  70  88  93   4  39  75  93 52.75  0.32701422  0.66824645  0.7630332 -0.92417062 -0.2606635         4        93
    #1000:  20  50  99  94  62  66  98 80.25 -0.75077882 -0.37694704  0.2336449  0.17133956 -0.2274143        62        99
    

    and then create the 'flag'

    data[,  flag := +(Reduce(`|`, lapply(.SD, function(x) 
          x < trend_min| x > trend_max))), .SDcols = t04:t05]