Search code examples
rdategroupingcounter

How to count exactly the same number of values for different groups over different years, starting from the earliest common month-day in r?


I have values of 2 biomarkers ('analyte'), measured with 2 different 'methods' over 2 different periods ('old' methods over 4 days from 2022-06-03 to 2022-06-06, and 'new' methods over 7 days from 2023-06-01 to 2023-06-07).

Question: How to count exactly the same (maximal) number of values for 'old' and 'new' methods in a new column, by analyte, starting from the earliest common period, considering month-day regardless of the year?
For example, in the desired output data below, the new 'count' column shows:
n=16 values for old and new methods of the first analyte 'chol', and
n=18 values for old and new methods of the second analyte 'ldh'.

Input data:

dat0 <-
structure(list(analyte = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L), .Label = c("amy", "auu", "chol", "ggt", 
"iron", "ldh", "pal", "prot", "trig", "uree", "ureeu", "uric"
), class = "factor"), method = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("new", "old"), class = "factor"), 
    date = structure(c(1654214400, 1654214400, 1654214400, 1654214400, 
    1654214400, 1654300800, 1654300800, 1654300800, 1654300800, 
    1654387200, 1654387200, 1654387200, 1654473600, 1654473600, 
    1654473600, 1654473600, 1685577600, 1685577600, 1685577600, 
    1685577600, 1685577600, 1685577600, 1685664000, 1685664000, 
    1685664000, 1685664000, 1685664000, 1685664000, 1685750400, 
    1685750400, 1685750400, 1685750400, 1685750400, 1685836800, 
    1685836800, 1685836800, 1685836800, 1685923200, 1685923200, 
    1685923200, 1685923200, 1685923200, 1685923200, 1686009600, 
    1686009600, 1686009600, 1686009600, 1686009600, 1686009600, 
    1686096000, 1686096000, 1686096000, 1686096000, 1686096000, 
    1686096000, 1654214400, 1654214400, 1654214400, 1654214400, 
    1654214400, 1654300800, 1654300800, 1654300800, 1654300800, 
    1654300800, 1654300800, 1654387200, 1654387200, 1654387200, 
    1654387200, 1654387200, 1654473600, 1654473600, 1654473600, 
    1654473600, 1654473600, 1654473600, 1654473600, 1654473600, 
    1685577600, 1685577600, 1685577600, 1685577600, 1685577600, 
    1685664000, 1685664000, 1685664000, 1685664000, 1685664000, 
    1685750400, 1685750400, 1685750400, 1685836800, 1685836800, 
    1685836800, 1685923200, 1685923200, 1685923200, 1685923200, 
    1686009600, 1686009600, 1686009600, 1686009600, 1686096000, 
    1686096000, 1686096000, 1686096000), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), value = c(2.84, 2.79, 2.82, 2.84, 2.84, 2.81, 
    2.8, 2.83, 2.82, 2.82, 2.8, 2.82, 2.81, 2.79, 2.82, 2.84, 
    2.74, 2.76, 2.77, 2.79, 2.76, 2.81, 2.73, 2.75, 2.77, 2.78, 
    2.76, 2.86, 2.73, 2.78, 2.84, 2.82, 2.8, 2.74, 2.76, 2.79, 
    2.82, 2.8, 2.76, 2.77, 2.83, 2.85, 2.83, 2.78, 2.74, 2.8, 
    2.8, 2.79, 2.85, 2.75, 2.76, 2.78, 2.82, 2.76, 2.86, 121, 
    119, 113, 117, 119, 123, 120, 117, 118, 125, 121, 121, 124, 
    123, 117, 115, 113, 120, 113, 113, 115, 117, 119, 115, 111, 
    111, 113, 116, 115, 111, 115, 112, 118, 115, 113, 106, 118, 
    113, 113, 113, 110, 114, 113, 117, 111, 111, 115, 120, 112, 
    113, 114, 108)), row.names = c(NA, -107L), class = c("tbl_df", 
"tbl", "data.frame"))  

Desired output data:

dat1 <-
structure(list(analyte = c("chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol", 
"chol", "chol", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", 
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", 
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", 
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", 
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", 
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh"
), method = c("old", "old", "old", "old", "old", "old", "old", 
"old", "old", "old", "old", "old", "old", "old", "old", "old", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new", "new", "new", "old", "old", "old", "old", "old", "old", 
"old", "old", "old", "old", "old", "old", "old", "old", "old", 
"old", "old", "old", "old", "old", "old", "old", "old", "old", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new", "new", "new", "new", "new", "new", "new", "new", "new", 
"new"), date = structure(c(1654214400, 1654214400, 1654214400, 
1654214400, 1654214400, 1654300800, 1654300800, 1654300800, 1654300800, 
1654387200, 1654387200, 1654387200, 1654473600, 1654473600, 1654473600, 
1654473600, 1685577600, 1685577600, 1685577600, 1685577600, 1685577600, 
1685577600, 1685664000, 1685664000, 1685664000, 1685664000, 1685664000, 
1685664000, 1685750400, 1685750400, 1685750400, 1685750400, 1685750400, 
1685836800, 1685836800, 1685836800, 1685836800, 1685923200, 1685923200, 
1685923200, 1685923200, 1685923200, 1685923200, 1686009600, 1686009600, 
1686009600, 1686009600, 1686009600, 1686009600, 1686096000, 1686096000, 
1686096000, 1686096000, 1686096000, 1686096000, 1654214400, 1654214400, 
1654214400, 1654214400, 1654214400, 1654300800, 1654300800, 1654300800, 
1654300800, 1654300800, 1654300800, 1654387200, 1654387200, 1654387200, 
1654387200, 1654387200, 1654473600, 1654473600, 1654473600, 1654473600, 
1654473600, 1654473600, 1654473600, 1654473600, 1685577600, 1685577600, 
1685577600, 1685577600, 1685577600, 1685664000, 1685664000, 1685664000, 
1685664000, 1685664000, 1685750400, 1685750400, 1685750400, 1685836800, 
1685836800, 1685836800, 1685923200, 1685923200, 1685923200, 1685923200, 
1686009600, 1686009600, 1686009600, 1686009600, 1686096000, 1686096000, 
1686096000, 1686096000), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    value = c(2.84, 2.79, 2.82, 2.84, 2.84, 2.81, 2.8, 2.83, 
    2.82, 2.82, 2.8, 2.82, 2.81, 2.79, 2.82, 2.84, 2.74, 2.76, 
    2.77, 2.79, 2.76, 2.81, 2.73, 2.75, 2.77, 2.78, 2.76, 2.86, 
    2.73, 2.78, 2.84, 2.82, 2.8, 2.74, 2.76, 2.79, 2.82, 2.8, 
    2.76, 2.77, 2.83, 2.85, 2.83, 2.78, 2.74, 2.8, 2.8, 2.79, 
    2.85, 2.75, 2.76, 2.78, 2.82, 2.76, 2.86, 121, 119, 113, 
    117, 119, 123, 120, 117, 118, 125, 121, 121, 124, 123, 117, 
    115, 113, 120, 113, 113, 115, 117, 119, 115, 111, 111, 113, 
    116, 115, 111, 115, 112, 118, 115, 113, 106, 118, 113, 113, 
    113, 110, 114, 113, 117, 111, 111, 115, 120, 112, 113, 114, 
    108), count = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
    14, 15, 16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3, 4, 5, 6, 
    7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -107L))

Thanks for help


Solution

  • I'm not 100% sure, but I think this might help in moving forward. I get a bit of a difference in output, but it may be justifiable. This assumes that sequences will not wrap around years or have more complex occurrences.

    First, you will need to separate your date to consider month and day without year. You can sort or arrange the data for each analyte using month and day. The first day of the year where there is overlap (both "new" and "old" available for a given day and analyte) will be indicated with TRUE in a new column called overlap.

    A count will be computed for each method and analyte where there is overlap.

    Next part, you can arrange by analyte and count, and filter or keep rows where there are both methods available for a given count and analyte.

    I believe the difference in output is that the following only includes exactly 14 tests for "ldh" for both methods, even though one ends on June 5 and the other June 6. If you wanted them to continue with the same end date, this could be modified.

    library(tidyverse)
    
    dat0 |>
      separate(date, c('year', 'month', 'day'), sep = "-", remove = FALSE, convert = TRUE) |>
      arrange(analyte, month, day) |>
      mutate(overlap = all(c("old", "new") %in% method), .by = c(analyte, month, day)) |>
      mutate(count = cumsum(overlap), .by = c(analyte, method)) |>
      filter(overlap & all(c("old", "new") %in% method), .by = c(analyte, count)) |>
      arrange(analyte, method, date)
    

    Output

       analyte method date                 year month   day  value overlap count
       <fct>   <fct>  <dttm>              <int> <int> <int>  <dbl> <lgl>   <int>
     1 chol    new    2023-06-03 00:00:00  2023     6     3   2.73 TRUE        1
     2 chol    new    2023-06-03 00:00:00  2023     6     3   2.78 TRUE        2
     3 chol    new    2023-06-03 00:00:00  2023     6     3   2.84 TRUE        3
     4 chol    new    2023-06-03 00:00:00  2023     6     3   2.82 TRUE        4
     5 chol    new    2023-06-03 00:00:00  2023     6     3   2.8  TRUE        5
     6 chol    new    2023-06-04 00:00:00  2023     6     4   2.74 TRUE        6
     7 chol    new    2023-06-04 00:00:00  2023     6     4   2.76 TRUE        7
     8 chol    new    2023-06-04 00:00:00  2023     6     4   2.79 TRUE        8
     9 chol    new    2023-06-04 00:00:00  2023     6     4   2.82 TRUE        9
    10 chol    new    2023-06-05 00:00:00  2023     6     5   2.8  TRUE       10
    11 chol    new    2023-06-05 00:00:00  2023     6     5   2.76 TRUE       11
    12 chol    new    2023-06-05 00:00:00  2023     6     5   2.77 TRUE       12
    13 chol    new    2023-06-05 00:00:00  2023     6     5   2.83 TRUE       13
    14 chol    new    2023-06-05 00:00:00  2023     6     5   2.85 TRUE       14
    15 chol    new    2023-06-05 00:00:00  2023     6     5   2.83 TRUE       15
    16 chol    new    2023-06-06 00:00:00  2023     6     6   2.78 TRUE       16
    17 chol    old    2022-06-03 00:00:00  2022     6     3   2.84 TRUE        1
    18 chol    old    2022-06-03 00:00:00  2022     6     3   2.79 TRUE        2
    19 chol    old    2022-06-03 00:00:00  2022     6     3   2.82 TRUE        3
    20 chol    old    2022-06-03 00:00:00  2022     6     3   2.84 TRUE        4
    21 chol    old    2022-06-03 00:00:00  2022     6     3   2.84 TRUE        5
    22 chol    old    2022-06-04 00:00:00  2022     6     4   2.81 TRUE        6
    23 chol    old    2022-06-04 00:00:00  2022     6     4   2.8  TRUE        7
    24 chol    old    2022-06-04 00:00:00  2022     6     4   2.83 TRUE        8
    25 chol    old    2022-06-04 00:00:00  2022     6     4   2.82 TRUE        9
    26 chol    old    2022-06-05 00:00:00  2022     6     5   2.82 TRUE       10
    27 chol    old    2022-06-05 00:00:00  2022     6     5   2.8  TRUE       11
    28 chol    old    2022-06-05 00:00:00  2022     6     5   2.82 TRUE       12
    29 chol    old    2022-06-06 00:00:00  2022     6     6   2.81 TRUE       13
    30 chol    old    2022-06-06 00:00:00  2022     6     6   2.79 TRUE       14
    31 chol    old    2022-06-06 00:00:00  2022     6     6   2.82 TRUE       15
    32 chol    old    2022-06-06 00:00:00  2022     6     6   2.84 TRUE       16
    33 ldh     new    2023-06-03 00:00:00  2023     6     3 113    TRUE        1
    34 ldh     new    2023-06-03 00:00:00  2023     6     3 106    TRUE        2
    35 ldh     new    2023-06-03 00:00:00  2023     6     3 118    TRUE        3
    36 ldh     new    2023-06-04 00:00:00  2023     6     4 113    TRUE        4
    37 ldh     new    2023-06-04 00:00:00  2023     6     4 113    TRUE        5
    38 ldh     new    2023-06-04 00:00:00  2023     6     4 113    TRUE        6
    39 ldh     new    2023-06-05 00:00:00  2023     6     5 110    TRUE        7
    40 ldh     new    2023-06-05 00:00:00  2023     6     5 114    TRUE        8
    41 ldh     new    2023-06-05 00:00:00  2023     6     5 113    TRUE        9
    42 ldh     new    2023-06-05 00:00:00  2023     6     5 117    TRUE       10
    43 ldh     new    2023-06-06 00:00:00  2023     6     6 111    TRUE       11
    44 ldh     new    2023-06-06 00:00:00  2023     6     6 111    TRUE       12
    45 ldh     new    2023-06-06 00:00:00  2023     6     6 115    TRUE       13
    46 ldh     new    2023-06-06 00:00:00  2023     6     6 120    TRUE       14
    47 ldh     old    2022-06-03 00:00:00  2022     6     3 121    TRUE        1
    48 ldh     old    2022-06-03 00:00:00  2022     6     3 119    TRUE        2
    49 ldh     old    2022-06-03 00:00:00  2022     6     3 113    TRUE        3
    50 ldh     old    2022-06-03 00:00:00  2022     6     3 117    TRUE        4
    51 ldh     old    2022-06-03 00:00:00  2022     6     3 119    TRUE        5
    52 ldh     old    2022-06-04 00:00:00  2022     6     4 123    TRUE        6
    53 ldh     old    2022-06-04 00:00:00  2022     6     4 120    TRUE        7
    54 ldh     old    2022-06-04 00:00:00  2022     6     4 117    TRUE        8
    55 ldh     old    2022-06-04 00:00:00  2022     6     4 118    TRUE        9
    56 ldh     old    2022-06-04 00:00:00  2022     6     4 125    TRUE       10
    57 ldh     old    2022-06-04 00:00:00  2022     6     4 121    TRUE       11
    58 ldh     old    2022-06-05 00:00:00  2022     6     5 121    TRUE       12
    59 ldh     old    2022-06-05 00:00:00  2022     6     5 124    TRUE       13
    60 ldh     old    2022-06-05 00:00:00  2022     6     5 123    TRUE       14