Search code examples
rduplicatesintervals

Remove duplicates based on column values in specific intervals in R


I have multi-column data as follows. I want to remove rows having duplicate values in depth column.

      Date     Levels values depth
1   2005-12-31      1 182.80     0
2   2005-12-31      2 182.80     0
3   2005-12-31      5 182.80     2
4   2005-12-31      6 182.80     2
5   2005-12-31      7 182.80     2
6   2005-12-31      8 182.80     3
7   2005-12-31      9 182.80     4
8   2005-12-31     10 182.80     4
9   2005-12-31     11 182.80     5
10  2005-12-31     13 182.70     7
11  2005-12-31     14 182.70     8
12  2005-12-31     16 182.60    10
13  2005-12-31     17 182.50    12
14  2005-12-31     20 181.50    17
15  2005-12-31     23 177.50    23
16  2005-12-31     26 165.90    31
17  2005-12-31     28 155.00    36
18  2005-12-31     29 149.20    40
19  2005-12-31     31 136.90    46
20  2005-12-31     33 126.10    53
21  2005-12-31     35 112.70    60
22  2005-12-31     38  88.23    70
23  2005-12-31     41  67.99    79
24  2005-12-31     44  54.63    87
25  2005-12-31     49  45.40    98
26  2006-12-31      1 182.80     0
27  2006-12-31      2 182.80     0
28  2006-12-31      5 182.80     2
29  2006-12-31      6 182.80     2
30  2006-12-31      7 182.80     2
31  2006-12-31      8 182.80     3
32  2006-12-31      9 182.80     4
33  2006-12-31     10 182.80     4
34  2006-12-31     11 182.70     5
35  2006-12-31     13 182.70     7
36  2006-12-31     14 182.70     8
37  2006-12-31     16 182.60    10
38  2006-12-31     17 182.50    12
39  2006-12-31     20 181.50    17
40  2006-12-31     23 178.60    23
41  2006-12-31     26 168.70    31
42  2006-12-31     28 156.90    36
43  2006-12-31     29 150.40    40
44  2006-12-31     31 137.10    46
45  2006-12-31     33 126.00    53
46  2006-12-31     35 112.70    60
47  2006-12-31     38  91.80    70
48  2006-12-31     41  75.91    79
49  2006-12-31     44  65.17    87
50  2006-12-31     49  58.33    98

I know how to remove duplicates based on a column as follows;

nodup<- distinct(df, column, .keep_all = TRUE)

But how can I do this code for every 25 rows interval?


Solution

  • base R

    do.call(rbind, by(dat, (seq_len(nrow(dat))-1) %/% 25,
                      function(z) z[!duplicated(z$depth),]))
    #            Date Levels values depth
    # 0.1  2005-12-31      1  182.8     0
    # 0.3  2005-12-31      5  182.8     2
    # 0.6  2005-12-31      8  182.8     3
    # 0.7  2005-12-31      9  182.8     4
    # 0.9  2005-12-31     11  182.8     5
    # 0.10 2005-12-31     13  182.7     7
    # 0.11 2005-12-31     14  182.7     8
    # 0.12 2005-12-31     16  182.6    10
    # 0.13 2005-12-31     17  182.5    12
    # 0.14 2005-12-31     20  181.5    17
    # 0.15 2005-12-31     23  177.5    23
    # 0.16 2005-12-31     26  165.9    31
    # 0.17 2005-12-31     28  155.0    36
    # 0.18 2005-12-31     29  149.2    40
    # 0.19 2005-12-31     31  136.9    46
    # 0.20 2005-12-31     33  126.1    53
    # 0.21 2005-12-31     35  112.7    60
    # 0.22 2005-12-31     38   88.2    70
    # 0.23 2005-12-31     41   68.0    79
    # 0.24 2005-12-31     44   54.6    87
    # 0.25 2005-12-31     49   45.4    98
    # 1.26 2006-12-31      1  182.8     0
    # 1.28 2006-12-31      5  182.8     2
    # 1.31 2006-12-31      8  182.8     3
    # 1.32 2006-12-31      9  182.8     4
    # 1.34 2006-12-31     11  182.7     5
    # 1.35 2006-12-31     13  182.7     7
    # 1.36 2006-12-31     14  182.7     8
    # 1.37 2006-12-31     16  182.6    10
    # 1.38 2006-12-31     17  182.5    12
    # 1.39 2006-12-31     20  181.5    17
    # 1.40 2006-12-31     23  178.6    23
    # 1.41 2006-12-31     26  168.7    31
    # 1.42 2006-12-31     28  156.9    36
    # 1.43 2006-12-31     29  150.4    40
    # 1.44 2006-12-31     31  137.1    46
    # 1.45 2006-12-31     33  126.0    53
    # 1.46 2006-12-31     35  112.7    60
    # 1.47 2006-12-31     38   91.8    70
    # 1.48 2006-12-31     41   75.9    79
    # 1.49 2006-12-31     44   65.2    87
    # 1.50 2006-12-31     49   58.3    98
    

    or

    dat[!ave(dat$depth, (seq_len(nrow(dat))-1) %/% 25, FUN = duplicated),]
    

    dplyr

    library(dplyr)
    dat %>%
      group_by(grp = (seq_len(n())-1) %/% 25) %>%
      distinct(depth, .keep_all = TRUE) %>%
      ungroup() %>%
      select(-grp)
    # # A tibble: 42 x 4
    #    Date       Levels values depth
    #    <chr>       <int>  <dbl> <int>
    #  1 2005-12-31      1   183.     0
    #  2 2005-12-31      5   183.     2
    #  3 2005-12-31      8   183.     3
    #  4 2005-12-31      9   183.     4
    #  5 2005-12-31     11   183.     5
    #  6 2005-12-31     13   183.     7
    #  7 2005-12-31     14   183.     8
    #  8 2005-12-31     16   183.    10
    #  9 2005-12-31     17   182.    12
    # 10 2005-12-31     20   182.    17
    # # ... with 32 more rows
    

    data.table

    library(data.table)
    as.data.table(dat)[, .SD[!duplicated(depth),], by=.( (seq_len(nrow(dat))-1) %/% 25 ) ][,-1]
    

    (The [,-1] on the end is because the by= grouping operation implicitly prepends the seq_len(.)... counter as its first column.)

    (Notice a theme? :-)


    Data

    dat <- structure(list(Date = c("2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2005-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31", "2006-12-31"), Levels = c(1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L, 23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L, 1L, 2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L, 20L, 23L, 26L, 28L, 29L, 31L, 33L, 35L, 38L, 41L, 44L, 49L), values = c(182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.7, 182.7, 182.6, 182.5, 181.5, 177.5, 165.9, 155, 149.2, 136.9, 126.1, 112.7, 88.23, 67.99, 54.63, 45.4, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.8, 182.7, 182.7, 182.7, 182.6, 182.5, 181.5, 178.6, 168.7, 156.9, 150.4, 137.1, 126, 112.7, 91.8, 75.91, 65.17, 58.33), depth = c(0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L, 23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L, 0L, 0L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 7L, 8L, 10L, 12L, 17L, 23L, 31L, 36L, 40L, 46L, 53L, 60L, 70L, 79L, 87L, 98L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50"))