Search code examples
rmemorydata.tablefuzzyjoin

Rewrite Time interval Fuzzy join to be less memory intensive


This question is expanding on this post: Pairing Time series Data with Batch Data in R

A good solution was given to me that worked for the dputs I provided but the problem is that my dataset is quite large and I guess the combinatorial explosion caused by the way I needed to join one table on another based on time intervals was overloading my memory. The apexdata set is 78592 obs of 17 variables. The ibadata set is 175200 obs of 9 variables. When the join is happening, I would think its possible that for every 1 row of apexdata, potentially up to 40 rows of ibadata could be joined... Maybe more but this is really highly variable.

I tried some measurements of memory being used using pryr and got the below.


> object_size(apexdata)
15.09 MB
> object_size(ibadata)
12.62 MB
> mem_used()
140 MB

I would like help figuring out a pathforward towards efficiently getting a result. One non-coding solution that occurs could be to chunk out the dataset into multiple smaller datasets and run the same code multiple times. If this is the best way, would decreasing the row count be the best way to do that?

Looking for similar solutions, it seems like data.table might have less memory intensive ways of doing this using foverlaps or non equi joins. Would that be appropriate?

library(dplyr)
library(fuzzyjoin)

# Use fuzzy_left_join() to define join fields
result <- ibadata %>%
  fuzzy_left_join(apexdata,
                  by = c("time" = "starttime",
                         "time" = "stoptime"),
                  match_fun = list(`>=`, `<`)) %>%
  filter(!is.na(Unit)) %>%
  group_by(Unit) %>%
  summarise_at(vars(a:w), mean, na.rm = TRUE)

# Result (truncated for readability)
data.frame(result[,1:8])

Dputs:

dput(head(apexdata,20))
structure(list(Unit = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20), starttime = structure(c(1705190403, 
1705190406, 1705190410, 1705195920, 1705195920, 1705195920, 1705196580, 
1705196880, 1705197120, 1705197420, 1705197660, 1705197960, 1705198200, 
1705198500, 1705198740, 1705199040, 1705199280, 1705199580, 1705199820, 
1705200120), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
    stoptime = structure(c(1705190408, 1705190412, 1705190420, 
    1705200540, 1705200900, 1705201140, 1705201440, 1705201860, 
    1705202100, 1705202400, 1705202640, 1705202940, 1705203180, 
    1705203480, 1705203720, 1705204020, 1705204260, 1705204560, 
    1705204800, 1705205100), tzone = "UTC", class = c("POSIXct", 
    "POSIXt"))), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))
dput(head(ibadata,30))
structure(list(time = structure(c(1705190400, 1705190401, 1705190402, 
1705190403, 1705190404, 1705190405, 1705190406, 1705190407, 1705190408, 
1705190409, 1705190410, 1705190411, 1705190412, 1705190413, 1705190414, 
1705190415, 1705190416, 1705190417, 1705190418, 1705190419, 1705190420, 
1705190421, 1705190422, 1705190423, 1705190424, 1705190425, 1705190426, 
1705190427, 1705190428, 1705190429), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), a = c(21839, 21839, 21839, 2184, 21844, 218453, 2185, 
2185, 218606, 21862, 218674, 21868, 218625, 21862, 218691, 21873, 
21877, 21879, 218831, 21885, 218825, 21879, 218812, 21884, 218867, 
2189, 218902, 21891, 218922, 21897), b = c(2205, 220572, 22062, 
220687, 22073, 220743, 22079, 220816, 2209, 220947, 22108, 22108, 
221126, 22113, 221185, 22119, 221247, 22125, 221373, 22143, 221465, 
22148, 221532, 2216, 221649, 22171, 221738, 22177, 221782, 22183
), c = c(23435, 23435, 23435, 234374, 23447, 23447, 23447, 23447, 
23447, 23447, 234416, 23441, 234465, 23447, 234509, 23453, 23453, 
23453, 234605, 23464, 234615, 23458, 234606, 23464, 234667, 2347, 
234689, 23464, 234662, 23475), d = c(23308, 23308, 233077, 23307, 
23307, 23307, 23307, 23307, 23307, 23307, 233074, 23313, 23313, 
23313, 23313, 23313, 23313, 23313, 23313, 233152, 23319, 233213, 
23325, 23325, 23325, 233251, 23336, 23336, 23337, 233373), e = c(22611, 
22612, 22612, 22612, 226171, 22618, 226138, 22611, 226146, 22617, 
226207, 22623, 22623, 22623, 22623, 22623, 22623, 22623, 22623, 
22623, 22623, 22623, 226239, 22629, 22629, 226341, 22635, 226298, 
22629, 226343), f = c(2278, 22781, 227759, 22775, 22775, 22775, 
227786, 22781, 22781, 22781, 22781, 22781, 22781, 22781, 227816, 
22787, 22787, 22787, 22787, 22787, 22787, 22787, 22787, 227911, 
22792, 227869, 22786, 22786, 22786, 227888), g = c(18867, 18879, 
188788, 18867, 18867, 188632, 18861, 188649, 18867, 188629, 18861, 
18861, 18861, 188629, 18867, 188631, 18855, 18855, 18855, 18855, 
188492, 18849, 18849, 18849, 188533, 18855, 188594, 18861, 18852, 
18849), h = c(1773, 17731, 17736, 177348, 1773, 1773, 1773, 1773, 
1773, 1773, 177192, 17718, 177218, 17724, 17724, 17724, 17724, 
17724, 177211, 17717, 17717, 17717, 17717, 17717, 177159, 17711, 
177121, 17717, 177184, 17724), i = c(17, 17, 17, 17, 17, 17, 
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 
17, 17, 17, 17, 17, 17, 17, 17), j = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0), k = c(104835, 104835, 104835, 104835, 104835, 104835, 104835, 
104835, 104835, 104835, 104835, 104835, 104835, 104835, 104835, 
104835, 104835, 104835, 104835, 104835, 104835, 104835, 104835, 
104835, 104835, 104835, 104835, 104835, 104835, 104835), l = c(196186, 
198153, 198652, 199944, 198159, 197963, 199065, 195913, 196702, 
196708, 196879, 194930, 197112, 197875, 200390, 199174, 196824, 
197605, 197172, 200082, 196605, 196841, 198612, 197962, 197621, 
197014, 197729, 197984, 197198, 196946), m = c(193727, 194671, 
194557, 194623, 195706, 195879, 195672, 195781, 196027, 195989, 
195104, 194442, 194804, 195251, 194984, 195815, 196762, 197694, 
197746, 197808, 197274, 195542, 195577, 194481, 194785, 194834, 
195128, 195177, 196748, 196293), n = c(257690, 258295, 253273, 
263207, 287060, 291882, 293853, 294140, 291764, 292896, 295990, 
288650, 286251, 285801, 286529, 289491, 288419, 283850, 284192, 
286041, 285978, 287937, 286857, 292262, 292865, 280934, 285851, 
283258, 276197, 279833), o = c(261511, 261773, 261767, 259751, 
258722, 259089, 259701, 259877, 25959, 259962, 260708, 262357, 
261321, 260341, 261406, 261216, 260288, 260779, 261716, 262594, 
260852, 259573, 26066, 262512, 261615, 259187, 260353, 260503, 
258265, 257562), p = c(626936, 627071, 632219, 630111, 622312, 
622462, 620979, 620358, 613787, 611566, 620765, 620913, 621307, 
619567, 619049, 618754, 616256, 61397, 616649, 619968, 627465, 
628215, 623739, 627658, 62656, 621474, 623952, 622797, 624502, 
627092), q = c(624144, 6228, 624444, 624776, 623206, 62223, 623977, 
622446, 620586, 622345, 622479, 621403, 621453, 622914, 622879, 
623082, 625422, 62669, 627363, 628626, 62905, 628104, 627644, 
627727, 628124, 627282, 6289, 629243, 628948, 627812), r = c(930035, 
931337, 923942, 924188, 918944, 914098, 906264, 918388, 932029, 
931316, 94113, 949525, 95893, 960039, 957767, 94860, 950414, 
955721, 952335, 945857, 94316, 94025, 936534, 93450, 93505, 936947, 
932007, 938411, 942931, 944234), s = c(913782, 914331, 917071, 
925487, 947097, 956014, 957963, 957825, 959429, 959406, 95762, 
958156, 959762, 960561, 959082, 959443, 959142, 957543, 959185, 
958198, 958171, 959972, 959491, 961616, 95935, 959322, 958079, 
958469, 957539, 956692), t = c(908173, 90750, 911236, 904482, 
893898, 891823, 897302, 902655, 896257, 899124, 901424, 894936, 
898939, 91048, 904791, 900456, 899448, 892944, 898304, 912246, 
913413, 919507, 917222, 918116, 915059, 90899, 903138, 892807, 
889867, 900289), u = c(890908, 890406, 890249, 888641, 884488, 
881455, 881968, 880658, 88288, 884088, 880681, 878949, 880532, 
880853, 881552, 879144, 880398, 882238, 880577, 882038, 882487, 
881405, 881321, 882045, 882263, 888436, 898413, 900874, 900381, 
901677), v = c(113177, 114547, 114681, 115676, 113546, 113653, 
113833, 114229, 113681, 113456, 114761, 114993, 113711, 114364, 
114130, 113361, 113949, 114468, 115709, 114681, 114922, 114943, 
115193, 114952, 115059, 115273, 115799, 115822, 116976, 116562
), w = c(112682, 112565, 112509, 112642, 112719, 112711, 113057, 
11336, 113202, 113054, 113018, 113358, 113719, 113886, 113915, 
113897, 113863, 113981, 114042, 114342, 114658, 114683, 114728, 
114684, 114748, 114561, 114442, 114754, 115002, 115298)), row.names = c(NA, 
-30L), class = c("tbl_df", "tbl", "data.frame"))

What I got when running with the whole dataset:

> 
> ibadata$time <- dmy_hms(ibadata$time)
> apexdata <-mutate(apexdata,heatSC = paste(`MLH_HEAT_NO`,`BLI_STRAND_NO`,`CUT`,sep=","))
> apexdata$CHARGE_TIME<- as.POSIXct(apexdata$CHARGE_TIME, "%m/%d/%Y %H:%M:%S", tz = "UTC")
> apexdata$MILL_ENTER_TIME <- as.POSIXct(apexdata$MILL_ENTER_TIME, "%m/%d/%Y %H:%M:%S", tz = "UTC")
> 
> object_size(apexdata)
15.09 MB
> object_size(ibadata)
12.62 MB
> mem_used()
140 MB
> gc()
          used (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 1417250 75.7    3291638 175.8  3228184 172.5
Vcells 7634496 58.3   13642305 104.1 11301568  86.3
> # Use fuzzy_left_join() to define join fields
> result <- ibadata %>%
+   fuzzy_left_join(apexdata,
+                   by = c("time" = "CHARGE_TIME",
+                          "time" = "MILL_ENTER_TIME"),
+                   match_fun = list(`>=`, `<`)) %>%
+   filter(!is.na(heatSC)) %>%
+   group_by(heatSC) %>%
+   summarise_at(vars(Z1Temp:Z6Temp), mean, na.rm = TRUE)
Error: cannot allocate vector of size 102.3 Gb

Solution

  • How about this:

    library(data.table)
    setDT(apexdata)
    setDT(ibadata)
    apexdata[ibadata, on = .(starttime <= time, stoptime > time), nomatch = NULL
      ][, lapply(.SD, mean, na.rm = TRUE), by = "Unit", .SDcols = a:w]
    #     Unit         a        b        c        d        e       f        g       h     i     j      k        l        m        n        o        p        q
    #    <num>     <num>    <num>    <num>    <num>    <num>   <num>    <num>   <num> <num> <num>  <num>    <num>    <num>    <num>    <num>    <num>    <num>
    # 1:     1  49370.20 141279.6  65632.4 23307.00 104030.0 63778.4 86775.20 36888.0    17     0 104835 198208.8 195532.2 286028.4 259428.0 623244.4 511325.6
    # 2:     2  80896.67  85044.5  58607.5 58269.17 124390.3 56948.5 75454.67 33667.0    17     0 104835 196699.5 195502.5 292882.2 221427.3 618061.3 622206.0
    # 3:     3 100606.50 101764.7 107870.6 65273.00  42981.4 43287.5 52813.00 65567.6    17     0 104835 197804.3 196041.0 287521.4 261272.6 563462.5 567829.0
    # 6 variables not shown: [r <num>, s <num>, t <num>, u <num>, v <num>, w <num>]
    

    If you're still running into memory problems, here's a more brute-force method that may be a little more frugal (at the cost of speed).

    Map(function(fm, to) ibadata[fm <= time & time < to, lapply(.SD, mean, na.rm = TRUE), .SDcols = a:w],
        apexdata$starttime, apexdata$stoptime) |>
      rbindlist()
    #            a        b        c        d        e       f        g       h     i     j      k        l        m        n        o        p        q        r
    #        <num>    <num>    <num>    <num>    <num>   <num>    <num>   <num> <num> <num>  <num>    <num>    <num>    <num>    <num>    <num>    <num>    <num>
    # 1:  49370.20 141279.6  65632.4 23307.00 104030.0 63778.4 86775.20 36888.0    17     0 104835 198208.8 195532.2 286028.4 259428.0 623244.4 511325.6 916376.4
    # 2:  80896.67  85044.5  58607.5 58269.17 124390.3 56948.5 75454.67 33667.0    17     0 104835 196699.5 195502.5 292882.2 221427.3 618061.3 622206.0 788605.8
    # 3: 100606.50 101764.7 107870.6 65273.00  42981.4 43287.5 52813.00 65567.6    17     0 104835 197804.3 196041.0 287521.4 261272.6 563462.5 567829.0 695652.4
    # 5 variables not shown: [s <num>, t <num>, u <num>, v <num>, w <num>]