Search code examples
rbigdatarevolution-r

How to cut a variable to 20 equal segments (for example) for several columns in a dataset in R


I know how to do it for one single variable. We can use equal.count() or a combination of quantile() and cut(). anyone knows an aggregate function to do this for 100 columns at the same time?

I know I can write a loop but it is slow. Is there a faster way? Because I am looking at a big data issue and possible solution using revolution R is welcome too. Thanks advance!

To clarify: I was trying to break every single column to 20 ranges not just by the first column. I am not trying to split the dataset but trying to transform variables to different ranges . Hope it clarifies. Thank you a lot


Solution

  • Use the g argument in cut2() to choose the breaks your will cut the variable into.

    require(data.table)
    require(Hmisc)
    
    set.seed(123)
    DT <- data.table(x1 = rnorm(10e5, 50, 50),
                     x2 = rnorm(10e5, 30, 50),
                     x3 = rnorm(10e5, 20, 50),
                     x4 = rnorm(10e5, 10, 50),
                     x5 = rnorm(10e5, 10, 50)
    )
    
    
    cut_qt <- DT[,sapply(.SD, function(x) if(is.numeric(x)) cut2(x, g = 4)), ]
    
    
    print(cut_qt)
    
    head(cut_qt)
    x1               x2               x3               x4               x5                
    [1,] "[  16.3, 50.0)" "[-199.6, -3.8)" "[ -13.7, 20.0)" "[ -23.8, 10.0)" "[ -23.74,  9.97)"
    [2,] "[  16.3, 50.0)" "[  63.6,257.4]" "[  20.0, 53.7)" "[-218.7,-23.8)" "[-222.34,-23.74)"
    [3,] "[  83.7,292.5]" "[  -3.8, 29.9)" "[ -13.7, 20.0)" "[  43.7,247.6]" "[ -23.74,  9.97)"
    [4,] "[  50.0, 83.7)" "[  63.6,257.4]" "[ -13.7, 20.0)" "[  10.0, 43.7)" "[-222.34,-23.74)"
    [5,] "[  50.0, 83.7)" "[  29.9, 63.6)" "[-232.5,-13.7)" "[  10.0, 43.7)" "[-222.34,-23.74)"
    [6,] "[  83.7,292.5]" "[  29.9, 63.6)" "[-232.5,-13.7)" "[  43.7,247.6]" "[ -23.74,  9.97)"
    

    As this is slow considering the OP is dealing with a large dataset:

    > system.time(DT[,lapply(.SD, function(x) if(is.numeric(x)) cut2(x, g = 4)), ])
       user  system elapsed 
      37.66    0.00   38.70 
    

    ALTERNATIVE METHOD USING set()

    # 1) Calculate Quantiles
    q <- DT[,sapply(.SD, function(x) if(is.numeric(x)) quantile(x)), ]
    q
    x1          x2         x3         x4          x5
    0%   -189.95953 -199.574605 -232.54139 -218.74362 -222.343247
    25%    16.28067   -3.797748  -13.72424  -23.76578  -23.736187
    50%    49.98701   29.938932   20.01473   10.03740    9.967671
    75%    83.66663   63.614604   53.74529   43.73047   43.676887
    100%  292.53835  257.368361  280.64704  247.64500  277.418083
    
    
    # 2) Modify the existing DT with the categorical variables using set
    
    cols_to_fix <- names(DT)
    
    for (j in 1:length(cols_to_fix)){
      column <- cols_to_fix[j] 
      brk = q[,j]
      val = cut2(DT[[column]], cuts = brk)
      set(DT, i=NULL, j=j, value = val)
    }
    
    system.time(for (j in 1:length(cols_to_fix)){
      column <- cols_to_fix[j] 
      brk = q[,j]
      val = cut2(DT[[column]], cuts = brk)
      set(DT, i=NULL, j=j, value = val)
    }
      )
    user  system elapsed 
    4.71    0.00    4.83