Search code examples
rdataframeindexingadditioncol

(Update) Add index column to data.frame based on two columns


Example data.frame:

df = read.table(text = 'colA colB
                2 7
                2 7
                2 7
                2 7
                1 7
                1 7
                1 7
                89 5
                89 5
                89 5
                88 5
                88 5
                70 5
                70 5
                70 5
                69 5
                69 5
                44 4
                44 4
                44 4
                43 4
                42 4
                42 4
                41 4
                41 4
                120 1
                100 1', header = TRUE)

I need to add an index col based on colA and colB where colB shows the exact number of rows to group but it can be duplicated. colB groups rows based on colA and colA -1.

Expected output:

 colA colB index_col
    2 7       1
    2 7       1
    2 7       1
    2 7       1
    1 7       1
    1 7       1
    1 7       1
    89 5      2
    89 5      2
    89 5      2
    88 5      2
    88 5      2
    70 5      3
    70 5      3
    70 5      3
    69 5      3
    69 5      3
    44 4      4
    44 4      4 
    44 4      4
    43 4      4
    42 4      5
    42 4      5
    41 4      5
    41 4      5
   120 1      6
   100 1      7

UPDATE

How can I adapt the code that works for the above df for the same purpose but by looking at colB values grouped based on colA, colA -1 and colA -2? i.e. (instead of 2 days considering 3 days)

new_df = read.table(text = 'colA colB
        3 10
        3 10
        3 10      
        2 10
        2 10
        2 10
        2 10
        1 10
        1 10
        1 10
        90 7
        90 7
        89 7
        89 7
        89 7
        88 7
        88 7
        71 7
        71 7
        70 7
        70 7
        70 7
        69 7
        69 7
        44 5
        44 5
        44 5
        43 5
        42 5
        41 5
        41 5
        41 5
        40 5
        40 5
       120 1
       100 1', header = TRUE)

Expected output:

     colA colB index_col
        3 10      1
        3 10      1
        3 10      1
        2 10      1
        2 10      1
        2 10      1
        2 10      1
        1 10      1
        1 10      1
        1 10      1
        90 7      2
        90 7      2
        89 7      2
        89 7      2
        89 7      2
        88 7      2
        88 7      2
        71 7      3
        71 7      3
        70 7      3
        70 7      3
        70 7      3
        69 7      3
        69 7      3
        44 5      4
        44 5      4
        44 5      4
        43 5      4
        42 5      4
        41 5      5
        41 5      5
        41 5      5
        40 5      5
        40 5      5
       120 1      6
       100 1      7

Thanks


Solution

  • We can use rleid

    library(data.table)
    index_col <-setDT(df)[, if(colB[1L] < .N) ((seq_len(.N)-1) %/% colB[1L])+1  
           else as.numeric(colB), rleid(colB)][, rleid(V1)]
    df[, index_col := index_col]
    df
    #    colA colB index_col
    # 1:    2    7         1
    # 2:    2    7         1
    # 3:    2    7         1
    # 4:    2    7         1
    # 5:    1    7         1
    # 6:    1    7         1
    # 7:    1    7         1
    # 8:   70    5         2
    # 9:   70    5         2
    #10:   70    5         2
    #11:   69    5         2
    #12:   69    5         2
    #13:   89    5         3
    #14:   89    5         3
    #15:   89    5         3
    #16:   88    5         3
    #17:   88    5         3
    #18:  120    1         4
    #19:  100    1         5
    

    Or a one-liner would be

    setDT(df)[, index_col := df[, ((seq_len(.N)-1) %/% colB[1L])+1, rleid(colB)][, as.integer(interaction(.SD, drop = TRUE, lex.order = TRUE))]]
    

    Update

    Based on the new update in the OP's post

    setDT(new_df)[, index_col :=  cumsum(c(TRUE, abs(diff(colA))> 1))
              ][, colB := .N , index_col]
    new_df
    #    colA colB index_col
    # 1:    3   10         1
    # 2:    3   10         1
    # 3:    3   10         1
    # 4:    2   10         1
    # 5:    2   10         1
    # 6:    2   10         1
    # 7:    2   10         1
    # 8:    1   10         1
    # 9:    1   10         1
    #10:    1   10         1
    #11:   71    7         2
    #12:   71    7         2
    #13:   70    7         2
    #14:   70    7         2
    #15:   70    7         2
    #16:   69    7         2
    #17:   69    7         2
    #18:   90    7         3
    #19:   90    7         3
    #20:   89    7         3
    #21:   89    7         3
    #22:   89    7         3
    #23:   88    7         3
    #24:   88    7         3
    #25:   44    2         4
    #26:   43    2         4
    #27:  120    1         5
    #28:  100    1         6