I have values for 2 biomarkers ('analyte'), measured with 2 different 'methods' over 2 different periods: the 'old' method over 4 days (2022-06-03 to 2022-06-06) and the 'new' method over 7 days (2023-06-01 to 2023-06-07).
Question: how can I number, in a new 'count' column, exactly the same (maximal) number of values for the 'old' and 'new' methods, by analyte, starting from the earliest common period, where "common" is judged on month-day regardless of the year?
For example, in the desired output data below, the new 'count' column numbers:
n = 16 values for the old and new methods of the first analyte, 'chol', and
n = 18 values for the old and new methods of the second analyte, 'ldh',
in each case the smaller of the two methods' totals counted from June 3, the earliest common month-day (see the quick check after the desired output below).
Input data:
dat0 <-
structure(list(analyte = structure(c(3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L), .Label = c("amy", "auu", "chol", "ggt",
"iron", "ldh", "pal", "prot", "trig", "uree", "ureeu", "uric"
), class = "factor"), method = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("new", "old"), class = "factor"),
date = structure(c(1654214400, 1654214400, 1654214400, 1654214400,
1654214400, 1654300800, 1654300800, 1654300800, 1654300800,
1654387200, 1654387200, 1654387200, 1654473600, 1654473600,
1654473600, 1654473600, 1685577600, 1685577600, 1685577600,
1685577600, 1685577600, 1685577600, 1685664000, 1685664000,
1685664000, 1685664000, 1685664000, 1685664000, 1685750400,
1685750400, 1685750400, 1685750400, 1685750400, 1685836800,
1685836800, 1685836800, 1685836800, 1685923200, 1685923200,
1685923200, 1685923200, 1685923200, 1685923200, 1686009600,
1686009600, 1686009600, 1686009600, 1686009600, 1686009600,
1686096000, 1686096000, 1686096000, 1686096000, 1686096000,
1686096000, 1654214400, 1654214400, 1654214400, 1654214400,
1654214400, 1654300800, 1654300800, 1654300800, 1654300800,
1654300800, 1654300800, 1654387200, 1654387200, 1654387200,
1654387200, 1654387200, 1654473600, 1654473600, 1654473600,
1654473600, 1654473600, 1654473600, 1654473600, 1654473600,
1685577600, 1685577600, 1685577600, 1685577600, 1685577600,
1685664000, 1685664000, 1685664000, 1685664000, 1685664000,
1685750400, 1685750400, 1685750400, 1685836800, 1685836800,
1685836800, 1685923200, 1685923200, 1685923200, 1685923200,
1686009600, 1686009600, 1686009600, 1686009600, 1686096000,
1686096000, 1686096000, 1686096000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), value = c(2.84, 2.79, 2.82, 2.84, 2.84, 2.81,
2.8, 2.83, 2.82, 2.82, 2.8, 2.82, 2.81, 2.79, 2.82, 2.84,
2.74, 2.76, 2.77, 2.79, 2.76, 2.81, 2.73, 2.75, 2.77, 2.78,
2.76, 2.86, 2.73, 2.78, 2.84, 2.82, 2.8, 2.74, 2.76, 2.79,
2.82, 2.8, 2.76, 2.77, 2.83, 2.85, 2.83, 2.78, 2.74, 2.8,
2.8, 2.79, 2.85, 2.75, 2.76, 2.78, 2.82, 2.76, 2.86, 121,
119, 113, 117, 119, 123, 120, 117, 118, 125, 121, 121, 124,
123, 117, 115, 113, 120, 113, 113, 115, 117, 119, 115, 111,
111, 113, 116, 115, 111, 115, 112, 118, 115, 113, 106, 118,
113, 113, 113, 110, 114, 113, 117, 111, 111, 115, 120, 112,
113, 114, 108)), row.names = c(NA, -107L), class = c("tbl_df",
"tbl", "data.frame"))
Desired output data:
dat1 <-
structure(list(analyte = c("chol", "chol", "chol", "chol", "chol",
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol",
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol",
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol",
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol",
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol",
"chol", "chol", "chol", "chol", "chol", "chol", "chol", "chol",
"chol", "chol", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh",
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh",
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh",
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh",
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh",
"ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh", "ldh"
), method = c("old", "old", "old", "old", "old", "old", "old",
"old", "old", "old", "old", "old", "old", "old", "old", "old",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new", "new", "new", "old", "old", "old", "old", "old", "old",
"old", "old", "old", "old", "old", "old", "old", "old", "old",
"old", "old", "old", "old", "old", "old", "old", "old", "old",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new", "new", "new", "new", "new", "new", "new", "new", "new",
"new"), date = structure(c(1654214400, 1654214400, 1654214400,
1654214400, 1654214400, 1654300800, 1654300800, 1654300800, 1654300800,
1654387200, 1654387200, 1654387200, 1654473600, 1654473600, 1654473600,
1654473600, 1685577600, 1685577600, 1685577600, 1685577600, 1685577600,
1685577600, 1685664000, 1685664000, 1685664000, 1685664000, 1685664000,
1685664000, 1685750400, 1685750400, 1685750400, 1685750400, 1685750400,
1685836800, 1685836800, 1685836800, 1685836800, 1685923200, 1685923200,
1685923200, 1685923200, 1685923200, 1685923200, 1686009600, 1686009600,
1686009600, 1686009600, 1686009600, 1686009600, 1686096000, 1686096000,
1686096000, 1686096000, 1686096000, 1686096000, 1654214400, 1654214400,
1654214400, 1654214400, 1654214400, 1654300800, 1654300800, 1654300800,
1654300800, 1654300800, 1654300800, 1654387200, 1654387200, 1654387200,
1654387200, 1654387200, 1654473600, 1654473600, 1654473600, 1654473600,
1654473600, 1654473600, 1654473600, 1654473600, 1685577600, 1685577600,
1685577600, 1685577600, 1685577600, 1685664000, 1685664000, 1685664000,
1685664000, 1685664000, 1685750400, 1685750400, 1685750400, 1685836800,
1685836800, 1685836800, 1685923200, 1685923200, 1685923200, 1685923200,
1686009600, 1686009600, 1686009600, 1686009600, 1686096000, 1686096000,
1686096000, 1686096000), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
value = c(2.84, 2.79, 2.82, 2.84, 2.84, 2.81, 2.8, 2.83,
2.82, 2.82, 2.8, 2.82, 2.81, 2.79, 2.82, 2.84, 2.74, 2.76,
2.77, 2.79, 2.76, 2.81, 2.73, 2.75, 2.77, 2.78, 2.76, 2.86,
2.73, 2.78, 2.84, 2.82, 2.8, 2.74, 2.76, 2.79, 2.82, 2.8,
2.76, 2.77, 2.83, 2.85, 2.83, 2.78, 2.74, 2.8, 2.8, 2.79,
2.85, 2.75, 2.76, 2.78, 2.82, 2.76, 2.86, 121, 119, 113,
117, 119, 123, 120, 117, 118, 125, 121, 121, 124, 123, 117,
115, 113, 120, 113, 113, 115, 117, 119, 115, 111, 111, 113,
116, 115, 111, 115, 112, 118, 115, 113, 106, 118, 113, 113,
113, 110, 114, 113, 117, 111, 111, 115, 120, 112, 113, 114,
108), count = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -107L))
Thanks for your help.
I'm not 100% sure, but I think this might help you move forward. I get a slightly different output, but the difference may be justifiable. This assumes that the measurement sequences do not wrap around a year boundary or have more complex patterns.
First, you will need to separate your date to consider month and day without the year. You can then sort or arrange the data for each analyte by month and day. Each month-day where there is overlap (both "new" and "old" values available for a given day and analyte) is flagged TRUE in a new column called overlap, and a running count is computed for each method and analyte over those overlapping days.

Finally, you can keep (filter) only the rows where both methods are available for a given analyte and count, and arrange the result by analyte, method, and date.
I believe the difference in output is that the following only includes exactly 14 tests for "ldh" for both methods (versus 18 in your desired output), even though one series ends on June 5 and the other on June 6. If you wanted the counts to continue up to the same maximal total, as in your desired output where the new "ldh" series runs through June 7, this could be modified; one possible modification is sketched after the output below.
library(tidyverse)
dat0 |>
  # split the date into year / month / day so that the year can be ignored
  separate(date, c('year', 'month', 'day'), sep = "-", remove = FALSE, convert = TRUE) |>
  arrange(analyte, month, day) |>
  # flag month-days where both methods have values for the analyte
  mutate(overlap = all(c("old", "new") %in% method), .by = c(analyte, month, day)) |>
  # running count of values on overlapping month-days, per analyte and method
  mutate(count = cumsum(overlap), .by = c(analyte, method)) |>
  # keep only the counts that exist for both methods
  filter(overlap & all(c("old", "new") %in% method), .by = c(analyte, count)) |>
  arrange(analyte, method, date)
Output
analyte method date year month day value overlap count
<fct> <fct> <dttm> <int> <int> <int> <dbl> <lgl> <int>
1 chol new 2023-06-03 00:00:00 2023 6 3 2.73 TRUE 1
2 chol new 2023-06-03 00:00:00 2023 6 3 2.78 TRUE 2
3 chol new 2023-06-03 00:00:00 2023 6 3 2.84 TRUE 3
4 chol new 2023-06-03 00:00:00 2023 6 3 2.82 TRUE 4
5 chol new 2023-06-03 00:00:00 2023 6 3 2.8 TRUE 5
6 chol new 2023-06-04 00:00:00 2023 6 4 2.74 TRUE 6
7 chol new 2023-06-04 00:00:00 2023 6 4 2.76 TRUE 7
8 chol new 2023-06-04 00:00:00 2023 6 4 2.79 TRUE 8
9 chol new 2023-06-04 00:00:00 2023 6 4 2.82 TRUE 9
10 chol new 2023-06-05 00:00:00 2023 6 5 2.8 TRUE 10
11 chol new 2023-06-05 00:00:00 2023 6 5 2.76 TRUE 11
12 chol new 2023-06-05 00:00:00 2023 6 5 2.77 TRUE 12
13 chol new 2023-06-05 00:00:00 2023 6 5 2.83 TRUE 13
14 chol new 2023-06-05 00:00:00 2023 6 5 2.85 TRUE 14
15 chol new 2023-06-05 00:00:00 2023 6 5 2.83 TRUE 15
16 chol new 2023-06-06 00:00:00 2023 6 6 2.78 TRUE 16
17 chol old 2022-06-03 00:00:00 2022 6 3 2.84 TRUE 1
18 chol old 2022-06-03 00:00:00 2022 6 3 2.79 TRUE 2
19 chol old 2022-06-03 00:00:00 2022 6 3 2.82 TRUE 3
20 chol old 2022-06-03 00:00:00 2022 6 3 2.84 TRUE 4
21 chol old 2022-06-03 00:00:00 2022 6 3 2.84 TRUE 5
22 chol old 2022-06-04 00:00:00 2022 6 4 2.81 TRUE 6
23 chol old 2022-06-04 00:00:00 2022 6 4 2.8 TRUE 7
24 chol old 2022-06-04 00:00:00 2022 6 4 2.83 TRUE 8
25 chol old 2022-06-04 00:00:00 2022 6 4 2.82 TRUE 9
26 chol old 2022-06-05 00:00:00 2022 6 5 2.82 TRUE 10
27 chol old 2022-06-05 00:00:00 2022 6 5 2.8 TRUE 11
28 chol old 2022-06-05 00:00:00 2022 6 5 2.82 TRUE 12
29 chol old 2022-06-06 00:00:00 2022 6 6 2.81 TRUE 13
30 chol old 2022-06-06 00:00:00 2022 6 6 2.79 TRUE 14
31 chol old 2022-06-06 00:00:00 2022 6 6 2.82 TRUE 15
32 chol old 2022-06-06 00:00:00 2022 6 6 2.84 TRUE 16
33 ldh new 2023-06-03 00:00:00 2023 6 3 113 TRUE 1
34 ldh new 2023-06-03 00:00:00 2023 6 3 106 TRUE 2
35 ldh new 2023-06-03 00:00:00 2023 6 3 118 TRUE 3
36 ldh new 2023-06-04 00:00:00 2023 6 4 113 TRUE 4
37 ldh new 2023-06-04 00:00:00 2023 6 4 113 TRUE 5
38 ldh new 2023-06-04 00:00:00 2023 6 4 113 TRUE 6
39 ldh new 2023-06-05 00:00:00 2023 6 5 110 TRUE 7
40 ldh new 2023-06-05 00:00:00 2023 6 5 114 TRUE 8
41 ldh new 2023-06-05 00:00:00 2023 6 5 113 TRUE 9
42 ldh new 2023-06-05 00:00:00 2023 6 5 117 TRUE 10
43 ldh new 2023-06-06 00:00:00 2023 6 6 111 TRUE 11
44 ldh new 2023-06-06 00:00:00 2023 6 6 111 TRUE 12
45 ldh new 2023-06-06 00:00:00 2023 6 6 115 TRUE 13
46 ldh new 2023-06-06 00:00:00 2023 6 6 120 TRUE 14
47 ldh old 2022-06-03 00:00:00 2022 6 3 121 TRUE 1
48 ldh old 2022-06-03 00:00:00 2022 6 3 119 TRUE 2
49 ldh old 2022-06-03 00:00:00 2022 6 3 113 TRUE 3
50 ldh old 2022-06-03 00:00:00 2022 6 3 117 TRUE 4
51 ldh old 2022-06-03 00:00:00 2022 6 3 119 TRUE 5
52 ldh old 2022-06-04 00:00:00 2022 6 4 123 TRUE 6
53 ldh old 2022-06-04 00:00:00 2022 6 4 120 TRUE 7
54 ldh old 2022-06-04 00:00:00 2022 6 4 117 TRUE 8
55 ldh old 2022-06-04 00:00:00 2022 6 4 118 TRUE 9
56 ldh old 2022-06-04 00:00:00 2022 6 4 125 TRUE 10
57 ldh old 2022-06-04 00:00:00 2022 6 4 121 TRUE 11
58 ldh old 2022-06-05 00:00:00 2022 6 5 121 TRUE 12
59 ldh old 2022-06-05 00:00:00 2022 6 5 124 TRUE 13
60 ldh old 2022-06-05 00:00:00 2022 6 5 123 TRUE 14
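If you instead want both methods to keep counting up to the same maximal total regardless of whether each later day overlaps, which gives n = 18 for "ldh" as in the desired dat1 (the new series then runs through June 7), one option is to index each method's values from the earliest common month-day and cap both methods at the smaller of the two totals. A minimal sketch, assuming dplyr >= 1.1.0, that rows are already in date order within each analyte and method, and that both methods have values on or after the common start (md, start_md and idx are just illustrative helper names):

library(dplyr)
dat0 |>
  # "%m-%d" is zero-padded, so string comparison respects date order
  mutate(md = format(date, "%m-%d")) |>
  # earliest month-day available for both methods = the later of the two methods' first month-days
  mutate(start_md = max(tapply(md, method, min)), .by = analyte) |>
  # number the values on/after that start, separately per analyte and method
  mutate(idx = if_else(md >= start_md, cumsum(md >= start_md), NA),
         .by = c(analyte, method)) |>
  # cap both methods at the smaller of the two totals; everything else stays NA
  mutate(count = if_else(idx <= min(tapply(idx, method, max, na.rm = TRUE)), idx, NA),
         .by = analyte) |>
  select(-md, -start_md, -idx)

With the example data this reproduces the count column of dat1 (1 to 16 for both "chol" methods, 1 to 18 for both "ldh" methods, NA outside the matched window), except that analyte and method remain factors and count is integer rather than double.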