Search code examples
rdataframesumduplicatesaggregate

Sum values of a single column in duplicated rows in R


I have the following data in R. I need to aggregate the Num values when other values of two (or more) rows are the same. I know the answer for data with two columns but how should I have it for more than two columns.

Appreciate!

Example

Original Data:

User    Date    Sex    Age  Ethic   Med    Num  Diag   Dis  Doc   Month
27168   1/1/2002    1   49  NULL    42506   1   2000    0   13545   1
27168   1/1/2002    1   49  NULL    42506   9   2000    0   13545   1
27168   1/1/2002    1   49  NULL    42506   5   2000    0   13545   1
27168   1/1/2002    1   49  NULL    34533   1   2000    0   13545   1
12335   2/1/2002    0   87  2       42506   2   8654    2   34568   1
12335   2/1/2002    0   87  2       65873   4   8654    2   34568   1
12335   2/1/2002    0   87  2       65873   14  8654    2   34568   1
12335   2/1/2002    0   87  2       65825   6   8654    2   34568   1
12335   2/1/2002    0   87  2       65873   4   8654    2   34568   1 

My Goal:

User    Date    Sex    Age  Ethic   Med    Num  Diag   Dis  Doc   Month
27168   1/1/2002    1   49  NULL    42506   15  2000    0   13545   1
27168   1/1/2002    1   49  NULL    34533   1   2000    0   13545   1
12335   2/1/2002    0   87  2       42506   2   8654    2   34568   1
12335   2/1/2002    0   87  2       65873   18  8654    2   34568   1
12335   2/1/2002    0   87  2       65825   6   8654    2   34568   1

Real data:

structure(list(Adm.Code = c(2716882L, 2716882L, 2716884L, 2716884L, 
2716884L, 2716885L, 2716885L, 2716885L, 2716885L, 2716886L, 2716886L, 
2716886L, 2716886L, 2716889L, 2716889L, 2716889L, 2716889L, 2716889L, 
2716889L, 2716889L, 2716896L, 2716896L, 2716896L, 2716896L, 2716896L, 
2716896L, 2716896L, 2716896L, 2716896L, 2716899L), Date = c("1397-01-01", 
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", 
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", 
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", 
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", 
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", 
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01"), Sex = c(1L, 
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L), Old = c(0L, 
0L, 23L, 23L, 23L, 49L, 49L, 49L, 49L, 24L, 24L, 24L, 24L, 22L, 
22L, 22L, 22L, 22L, 22L, 22L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 
35L, 35L, 29L), BloodGroup = c("0", "0", "NULL", "NULL", "NULL", 
"NULL", "NULL", "NULL", "NULL", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0", "NULL", "NULL", "NULL", "NULL", "NULL", 
"NULL", "NULL", "NULL", "NULL", "0"), Service.Int.Code = c(32603L, 
32602L, 42506L, 32603L, 32602L, 42506L, 32603L, 32602L, 34533L, 
32603L, 32602L, 42506L, 34533L, 42506L, 32603L, 32602L, 42506L, 
42506L, 32603L, 32602L, 34533L, 42506L, 32603L, 32602L, 34533L, 
32603L, 32602L, 32603L, 32602L, 42506L), Num = c(1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1, 4, 3, 1, 1, 8, 
1, 1, 1, 1, 3), DiagCode = c(2000L, 2000L, 2000L, 2000L, 2000L, 
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L), DiseaseTag = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Doctor.Code = c(13545L, 
13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 
13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 
13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 
13545L, 13545L, 13545L, 13545L, 13545L), month = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(2L, 
3L, 6L, 11L, 12L, 23L, 28L, 29L, 44L, 48L, 49L, 53L, 61L, 73L, 
80L, 81L, 82L, 84L, 88L, 89L, 94L, 96L, 107L, 108L, 117L, 128L, 
129L, 135L, 136L, 139L), class = "data.frame")

Solution

  • With dplyr you could group_by all but Num and then summarise.

    library(dplyr)
    
    df |> 
      group_by(across(-Num)) |>
      summarise(Num = sum(Num)) |>
      ungroup()
    

    Output:

    # A tibble: 5 × 11
       User Date       Sex   Age Ethic   Med  Diag   Dis   Doc Month   Num
      <dbl> <chr>    <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    1 12335 2/1/2002     0    87 2     42506  8654     2 34568     1     2
    2 12335 2/1/2002     0    87 2     65825  8654     2 34568     1     6
    3 12335 2/1/2002     0    87 2     65873  8654     2 34568     1    22
    4 27168 1/1/2002     1    49 NULL  34533  2000     0 13545     1     1
    5 27168 1/1/2002     1    49 NULL  42506  2000     0 13545     1    15
    

    Data:

    library(readr)
    
    df <- read_table("User    Date    Sex    Age  Ethic   Med    Num  Diag   Dis  Doc   Month
    27168   1/1/2002    1   49  NULL    42506   1   2000    0   13545   1
    27168   1/1/2002    1   49  NULL    42506   9   2000    0   13545   1
    27168   1/1/2002    1   49  NULL    42506   5   2000    0   13545   1
    27168   1/1/2002    1   49  NULL    34533   1   2000    0   13545   1
    12335   2/1/2002    0   87  2       42506   2   8654    2   34568   1
    12335   2/1/2002    0   87  2       65873   4   8654    2   34568   1
    12335   2/1/2002    0   87  2       65873   14  8654    2   34568   1
    12335   2/1/2002    0   87  2       65825   6   8654    2   34568   1
    12335   2/1/2002    0   87  2       65873   4   8654    2   34568   1")