Search code examples
rdatasetleft-join

How to join two datasets based on the id, date and approximate time?


I need to merge datasets A and B based on the date, id and nearest time.

The time in the datasets are not matched. The time in B is always 0 to 10 minutes greater than the time in A.

I tried left_join with within, between, overlaps, etc. but couldn't manage it.

A

DATETIME            | ID | W
--------------------------------
2020-12-02 18:02:01 | 1  | 0.25
2020-12-02 19:06:21 | 1  | 0.35
2020-12-02 18:12:08 | 2  | 0.44
2020-12-03 10:03:03 | 3  | 0.98

B

DATETIME            | ID | X1  | X3
--------------------------------------
2020-12-02 18:08:01 | 1  | 1.3 | 99.3
2020-12-02 18:21:11 | 2  | 4.2 | 33.2
2020-12-03 10:09:22 | 3  | 7.1 | 39.9

Desired merge

DATETIME.x          | ID.x | W    | DATETIME.y          | ID.y | X1  | X3 
----------------------------------------------------------------------------
2020-12-02 18:02:01 | 1    | 0.25 | 2020-12-02 18:08:01 | 1    | 1.3 | 99.3
2020-12-02 19:06:21 | 1    | 0.35 |                     |      |     |
2020-12-02 18:12:08 | 2    | 0.44 | 2020-12-02 18:21:11 | 2    | 4.2 | 33.2
2020-12-03 10:03:03 | 3    | 0.98 | 2020-12-03 10:09:22 | 3    | 7.1 | 39.9

Edit: I just added a part of original datasets. The DATASET B always have less number of rows than DATASET A

DATASET A
structure(list(DateTime = structure(c(1598636287, 1598636622, 
1598638754, 1598639954, 1598641154, 1598642353, 1598643551, 1598692415, 
1598693716, 1598694315, 1598694914, 1598695514, 1598696414, 1598697012, 
1598697312, 1598697912, 1598698512, 1598699111, 1598699710, 1598700309, 
1598700910, 1598701209, 1598701809, 1598702408, 1598703007, 1598703608, 
1598704207, 1598704806, 1598705105, 1598705705, 1598706713, 1598707914, 
1598709112, 1598710313, 1598711511, 1598712711, 1598713910, 1598715110, 
1598715710, 1598716909, 1598718109, 1598719309, 1598720508, 1598721708, 
1598722906, 1598724105, 1598724706, 1598725906, 1598727104, 1598728304, 
1598729502, 1598730704, 1598731903, 1598733101, 1598733703, 1598734903, 
1598736102, 1598737301, 1598738500, 1598739700, 1598740900, 1598742099, 
1598742697, 1598743898, 1598745098, 1598746297, 1598747497, 1598748696, 
1598749895, 1598751096, 1598751695, 1598752893, 1598754094, 1598755293, 
1598756493, 1598757692, 1598758891, 1598760092, 1598760690, 1598784123, 
1598784723, 1598785324, 1598785922, 1598786522, 1598787121, 1598787722, 
1598788921, 1598789521, 1598790122, 1598790722, 1598791321, 1598791920, 
1598792520, 1598793120, 1598794320, 1598794920, 1598795518, 1598796118, 
1598796719, 1598797319, 1598797918, 1598798516, 1598799716, 1598800317, 
1598800917, 1598801518, 1598802115, 1598802717, 1598803315, 1598803915, 
1598805115, 1598805714, 1598806313, 1598806915, 1598807514, 1598808114, 
1598808714, 1598809313, 1598810511, 1598811113, 1598811711, 1598812311, 
1598812911, 1598813511, 1598814112, 1598814710, 1598815910, 1598816510, 
1598817110, 1598817709, 1598818309, 1598818910, 1598819508, 1598820108, 
1598821308, 1598821908, 1598822508, 1598823107, 1598823707, 1598824306, 
1598824907, 1598825506, 1598826706, 1598827305, 1598827905, 1598828505, 
1598829104, 1598829704, 1598830305, 1598830905, 1598832103, 1598832703, 
1598833303, 1598833903, 1598834502, 1598835101, 1598835701, 1598836300, 
1598837501, 1598838100, 1598838700, 1598839300, 1598839900, 1598840499, 
1598841100, 1598841699, 1598842898, 1598843497, 1598844098, 1598844697, 
1598845298, 1598845897, 1598846497, 1598847098, 1598848297, 1598848896, 
1598849496, 1598850096, 1598850696, 1598851296, 1598851895, 1598852494, 
1598853695, 1598854294, 1598854893, 1598855494, 1598856093, 1598856693, 
1598857292, 1598857891, 1598859093, 1598859692, 1598860291, 1598860891, 
1598861491, 1598862091, 1598862690, 1598863290, 1598864490, 1598865089, 
1598865689, 1598866289, 1598866889, 1598867490, 1598868088, 1598868689, 
1598869888, 1598870486, 1598871086, 1598871686, 1598872287, 1598872886, 
1598873486, 1598874085, 1598875285, 1598875884, 1598876485, 1598877084, 
1598877684, 1598878284, 1598878885, 1598879484, 1598880683, 1598881284, 
1598881883, 1598882482, 1598883082, 1598883682, 1598884283, 1598884881, 
1598886081, 1598886681, 1598887280, 1598887879, 1598888479, 1598889079, 
1598889678, 1598890280, 1598891478, 1598892078, 1598892680, 1598893278, 
1598893878, 1598894477, 1598895076, 1598895677, 1598896876, 1598897476, 
1598898076, 1598898676, 1598899275, 1598899875, 1598900475, 1598901075, 
1598902273, 1598902873, 1598903473, 1598904074, 1598904673, 1598905272, 
1598905872, 1598906473, 1598907671, 1598908272, 1598908871, 1598909471, 
1598910071, 1598910672, 1598911270, 1598911870, 1598913070, 1598913669, 
1598914268, 1598914870, 1598915469, 1598916067, 1598916667, 1598917268
), tzone = "UTC", class = c("POSIXct", "POSIXt")), ID = c("8", 
"1", "1", "2", "3", "4", "6", "1", "2", "3", "4", "6", "7", "8", 
"1", "2", "3", "4", "6", "7", "8", "1", "2", "3", "4", "6", "7", 
"8", "1", "2", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", 
"3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", 
"8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", 
"5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", 
"2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", 
"7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", 
"4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", 
"1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", 
"6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", 
"3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", 
"8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", 
"5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", 
"2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", 
"7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", 
"4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", 
"1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", 
"6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", 
"3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", 
"8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", 
"5", "6", "7", "8"), W = c(-315.330562689608, -2.52416919062039, 
-6.75916991234354, -4.9925923458724, -2.68077945324944, -2.59352725163617, 
2.02704542198996, -115.650578709976, -27.2982158256338, -37.8537391639228, 
43.2574538530341, 1.03630362417363, 1.27128035515822, -111.453375637402, 
-16.1498545551005, -32.0567779488567, -14.2158062039774, -12.7265408441033, 
14.4552340030648, -0.585924470701273, -29.2725817416404, -14.4761930694506, 
-10.5411967306746, 38.1106281789558, 29.9561186051226, 5.80726901038645, 
-0.980361745751139, 27.7640143766151, -11.4539988350286, -7.87890259507465, 
-11.0164165425268, -14.671496103194, -21.8761126812378, -22.4177243924256, 
-38.1473733135052, 10.9434594904922, 45.3757341781314, -18.9774126098487, 
-8.66568056481869, -3.89551896276334, 2.34305751190445, 21.1184797201658, 
-2.80106266579382, -2.13127112875359, -1.30488761540419, 3.52433963912613, 
3.59774311813275, -1.57072913521284, 0.17549951904551, 4.49101332311171, 
4.14251221995872, 9.44852080922671, -18.5351726039937, -7.42350363641328, 
-4.62564402194394, -4.5136677142908, 3.50669871332958, 7.83229254342291, 
1.97803316530571, -109.966915087567, 0.249080462434332, 0.411500902934364, 
-4.32842271743313, -35.3580246389245, 0.471211838052437, -1.86781930165719, 
-0.237138120660447, -12.3918391824992, 9.57167516936909, -2.15079530394908, 
-0.15681768087188, 0.148139265555683, 0.65012500297816, 0.82970239607941, 
0.98475483572049, 3.48468701581947, 3.45516644446294, 8.28283805412418, 
9.5546080107533, 106.566326082132, -8.4059856673066, 136.702102750715, 
-7.97509319346756, -9.65644617841705, -66.4363930169088, -72.0098875866613, 
-24.3158796139066, -6.1988529506953, -13.7044921103796, 2.98196655360075, 
27.705159582663, -2.6774142696737, -2.64104407255444, -3.01266615508847, 
-15.4414872227932, -11.4932157142113, -3.05245054086814, 0.912643312835983, 
0.376129346746173, 11.1387845048052, -7.11691433937843, 9.61060027882159, 
-12.9765287694212, -5.67396777577145, -0.362964911940461, 7.15437079406064, 
-0.809097105496422, 1.35224187333621, -1.23075264056478, -2.97134675299893, 
-4.50835807234897, 4.36839894967037, -1.10797564355453, 2.42223049178652, 
-1.67111895593451, 2.02613961608675, 12.1764092996962, 0.149218747547633, 
-2.62695129351176, -1.44788286325, -0.869156660324555, -0.628698207215824, 
-2.97110159051323, 1.89612173055238, -2.23996469444859, -0.24492418965564, 
-14.7940925475231, -3.16254513667387, -1.52445292135167, -3.23100699247941, 
1.55341327394996, -1.87035442833437, -10.550723496796, -1.59032323370544, 
3.89321488983673, 0.90104711689204, 26.9477421102614, 0.346026397414036, 
0.350734294086829, 0.114371620042446, -1.11520830232016, 0.371307471846682, 
0.514956010112907, 3.49970995671813, 2.69870761352848, 2.69456927840329, 
0.279003230202838, -0.0225711009518213, -1.63067119943349, -0.599693559777168, 
-2.56052035234386, -0.846365082204427, -5.42273142828007, -0.607372870656234, 
7.42844641587937, -3.50660448466716, -13.5314318519484, 41.6795464524761, 
-33.5119663802369, -3.67599936282413, -13.0496154516734, 7.2316432610428, 
3.30908108695209, 5.55275503969583, 3.85099158300275, -1.75597753662526, 
9.07123637878036, -3.71450440697352, -2.82876372575196, 26.9334541462943, 
-2.16901098952482, -4.80863436490251, -4.91493961086461, -3.63857345204952, 
-3.63608261370314, -4.80503757527894, -3.55857146168661, -0.566768973352482, 
-1.09179105417579, -4.75020279883772, -3.96107100582678, -3.8185226284532, 
2.46998106156591, 3.79867882937243, -8.89003812162545, 14.6773858053186, 
4.51965724318235, 0.680551662459277, -7.79647123577452, -1.84316898195118, 
-1.77996273644294, -6.68347148060333, -1.68132994983378, 5.59757907996578, 
-12.2857215270019, -15.5450216523343, -8.88175096187388, -15.1518675705718, 
26.9069022226759, 5.81328310693763, -3.18165833285836, -4.13183220797559, 
-2.29209372974387, -3.5717355365426, -3.96494332315338, -7.62459016232794, 
-14.1184744077418, -8.80744232458462, -0.558188273327847, -2.15736663831821, 
-0.302670424368334, -5.9666731908502, -2.91143679799658, 0.78193344751675, 
-12.3730563990833, -11.2345797439574, 0.580719551998759, 5.50192512168596, 
3.14362071385508, 4.42954409963619, -2.63594117852702, -4.67529303597675, 
-8.49801360194862, -0.530431046698544, 3.62897613236058, -7.97511967956899, 
0.77288784156167, -22.1871534520521, 17.46379541586, -2.81617618157617, 
-4.81896806636922, -4.6609249091753, -11.3676003160432, -4.38548871151348, 
-6.0696655207222, 3.42114071095143, 10.0298776712577, -3.7323480151313, 
4.06029043013707, -0.774765238927718, 3.03849716672757, -9.58655960203042, 
0.577044533244037, 0.912588696849156, -1.13372337503903, 0.0488029971462183, 
0.201812215030437, -0.75545074368071, -12.3827058554518, -3.43598697198347, 
3.26610839122928, 0.986772864305677, 6.32443180317462, 2.0181742152896, 
-0.616998876325967, 1.98941690678587, 0.796388406110153, 3.39213613292779, 
4.19845521046369, -1.63041961396707, 1.35354928613412, 1.33456998177465, 
-2.19345310582715, -1.27023292659084, 0.458779324945803, 2.35649297440622, 
0.472567239466579, -0.134201165275529, 0.422020168259809, -0.0774992104338274, 
-5.10174927347748, -2.11045995214029, -2.91046480470787, -2.22008940343392, 
-1.64820685253815, -1.13278568364395, -2.99297085126001, -0.541262894155119
)), row.names = c(NA, -278L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x000001a408b982e0>)

DATASET B
structure(list(DateTime = structure(c(1598861250, 1598866650, 
1598872050, 1598877450, 1598861850, 1598867250, 1598872650, 1598878050, 
1598862450, 1598867850, 1598873250, 1598878650, 1598857800, 1598863050, 
1598868450, 1598873850, 1598879250, 1598859450, 1598864850, 1598870250, 
1598875650, 1598881050, 1598860050, 1598865450, 1598870850, 1598876250, 
1598881650), tzone = "", class = c("POSIXct", "POSIXt")), ID = c(1, 
1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 7, 7, 7, 7, 7, 
8, 8, 8, 8, 8), X1 = c(0.198140995, 0.19808281, 0.198024625, 
0.19761733, 0.10382311, 0.1050256, 0.105277735, 0.105044995, 
0.343487125, 0.343176805, 0.343215595, 0.34342894, -0.6956, 0.322579315, 
0.32228839, 0.32201686, 0.32151259, 0.19816039, 0.1981216, 0.19788886, 
0.1977337, 0.19738459, 0.104307985, 0.10405585, 0.105394105, 
0.104967415, 0.104967415), X2 = c(15.62, 14.8, 14.6, 14.1, 13.07, 
13, 12.9, 12.46, 14.23, 13.78, 13.6, 13.22, -50, 13.04, 13.1, 
12.87, 12.49, 16.31, 14.99, 14.6, 14.29, 13.75, 13.98, 12.82, 
13, 12.6, 11.97)), row.names = c(NA, -27L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x000001a408b982e0>)

Solution

  • I've used fuzzyjoin for similar joins for time variables, which allows you to not only define what variables you want to join by, but apply a function to how you want to join on that variable:

    fuzzyjoin::fuzzy_left_join(a, b,
                               by = c("ID" = "ID", "DATETIME" = "DATETIME"),
                               match_fun = list(`==`, \(x, y) difftime(x, y, units = "mins") <= 10))
    

    Output:

               DATETIME.x ID.x    W          DATETIME.y ID.y  X1   X3
    1 2020-12-02 18:02:01    1 0.25 2020-12-02 18:08:01    1 1.3 99.3
    2 2020-12-02 19:06:21    1 0.35                <NA>   NA  NA   NA
    3 2020-12-02 18:12:08    2 0.44 2020-12-02 18:21:11    2 4.2 33.2
    4 2020-12-03 10:03:03    3 0.98 2020-12-03 10:09:22    3 7.1 39.9
    

    You could also use dplyr's join_by() within full_join but you cant use functions in that so will need to create a temporary variable. Same result, just a bit more cumbersome:

    a %>%
      mutate(high = DATETIME + 10) %>%
      full_join(b, join_by(ID == ID, high <= DATETIME)) %>% 
      select(-high)
    

    Data:

    a <- structure(list(DATETIME = structure(c(1606932121, 1606935981, 
                                               1606932728, 1606989783), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                        ID = c(1L, 1L, 2L, 3L), W = c(0.25, 0.35, 0.44, 0.98)), row.names = c("1", 
                                                                                              "2", "3", "4"), class = "data.frame")
    b <- structure(list(DATETIME = structure(c(1606932481, 1606933271, 
                                               1606990162), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                        ID = 1:3, X1 = c(1.3, 4.2, 7.1), X3 = c(99.3, 33.2, 39.9)), row.names = c(NA, 
                                                                                                  -3L), class = "data.frame")
    
    merged <- structure(list(DATETIME.x = structure(c(1606932121, 1606935981, 
                                                      1606932728, 1606989783), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                             ID.x = c(1L, 1L, 2L, 3L), W = c(0.25, 0.35, 0.44, 0.98), 
                             DATETIME.y = structure(c(1606932481, NA, 1606933271, 1606990162
                             ), class = c("POSIXct", "POSIXt"), tzone = "UTC"), ID.y = c(1L, 
                                                                                         NA, 2L, 3L), X1 = c(1.3, NA, 4.2, 7.1), X3 = c(99.3, NA, 
                                                                                                                                        33.2, 39.9)), row.names = c(NA, -4L), class = "data.frame")