I have the following data in R
. I need to aggregate the Num values when other values of two (or more) rows are the same. I know the answer for data with two columns but how should I have it for more than two columns.
Appreciate!
Example
Original Data:
User Date Sex Age Ethic Med Num Diag Dis Doc Month
27168 1/1/2002 1 49 NULL 42506 1 2000 0 13545 1
27168 1/1/2002 1 49 NULL 42506 9 2000 0 13545 1
27168 1/1/2002 1 49 NULL 42506 5 2000 0 13545 1
27168 1/1/2002 1 49 NULL 34533 1 2000 0 13545 1
12335 2/1/2002 0 87 2 42506 2 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 4 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 14 8654 2 34568 1
12335 2/1/2002 0 87 2 65825 6 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 4 8654 2 34568 1
My Goal:
User Date Sex Age Ethic Med Num Diag Dis Doc Month
27168 1/1/2002 1 49 NULL 42506 15 2000 0 13545 1
27168 1/1/2002 1 49 NULL 34533 1 2000 0 13545 1
12335 2/1/2002 0 87 2 42506 2 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 18 8654 2 34568 1
12335 2/1/2002 0 87 2 65825 6 8654 2 34568 1
Real data:
structure(list(Adm.Code = c(2716882L, 2716882L, 2716884L, 2716884L,
2716884L, 2716885L, 2716885L, 2716885L, 2716885L, 2716886L, 2716886L,
2716886L, 2716886L, 2716889L, 2716889L, 2716889L, 2716889L, 2716889L,
2716889L, 2716889L, 2716896L, 2716896L, 2716896L, 2716896L, 2716896L,
2716896L, 2716896L, 2716896L, 2716896L, 2716899L), Date = c("1397-01-01",
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01",
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01",
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01",
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01",
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01",
"1397-01-01", "1397-01-01", "1397-01-01", "1397-01-01"), Sex = c(1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L), Old = c(0L,
0L, 23L, 23L, 23L, 49L, 49L, 49L, 49L, 24L, 24L, 24L, 24L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 35L, 35L, 35L, 35L, 35L, 35L, 35L,
35L, 35L, 29L), BloodGroup = c("0", "0", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "0"), Service.Int.Code = c(32603L,
32602L, 42506L, 32603L, 32602L, 42506L, 32603L, 32602L, 34533L,
32603L, 32602L, 42506L, 34533L, 42506L, 32603L, 32602L, 42506L,
42506L, 32603L, 32602L, 34533L, 42506L, 32603L, 32602L, 34533L,
32603L, 32602L, 32603L, 32602L, 42506L), Num = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1, 4, 3, 1, 1, 8,
1, 1, 1, 1, 3), DiagCode = c(2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L), DiseaseTag = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Doctor.Code = c(13545L,
13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L,
13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L,
13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L, 13545L,
13545L, 13545L, 13545L, 13545L, 13545L), month = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(2L,
3L, 6L, 11L, 12L, 23L, 28L, 29L, 44L, 48L, 49L, 53L, 61L, 73L,
80L, 81L, 82L, 84L, 88L, 89L, 94L, 96L, 107L, 108L, 117L, 128L,
129L, 135L, 136L, 139L), class = "data.frame")
With dplyr
you could group_by
all but Num
and then summarise
.
library(dplyr)
df |>
group_by(across(-Num)) |>
summarise(Num = sum(Num)) |>
ungroup()
Output:
# A tibble: 5 × 11
User Date Sex Age Ethic Med Diag Dis Doc Month Num
<dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 12335 2/1/2002 0 87 2 42506 8654 2 34568 1 2
2 12335 2/1/2002 0 87 2 65825 8654 2 34568 1 6
3 12335 2/1/2002 0 87 2 65873 8654 2 34568 1 22
4 27168 1/1/2002 1 49 NULL 34533 2000 0 13545 1 1
5 27168 1/1/2002 1 49 NULL 42506 2000 0 13545 1 15
Data:
library(readr)
df <- read_table("User Date Sex Age Ethic Med Num Diag Dis Doc Month
27168 1/1/2002 1 49 NULL 42506 1 2000 0 13545 1
27168 1/1/2002 1 49 NULL 42506 9 2000 0 13545 1
27168 1/1/2002 1 49 NULL 42506 5 2000 0 13545 1
27168 1/1/2002 1 49 NULL 34533 1 2000 0 13545 1
12335 2/1/2002 0 87 2 42506 2 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 4 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 14 8654 2 34568 1
12335 2/1/2002 0 87 2 65825 6 8654 2 34568 1
12335 2/1/2002 0 87 2 65873 4 8654 2 34568 1")