Join data by interval from another dataset (non-equi joins) [R]

I am trying to aggregate (FUN = mean) the dataset df1 by intervals of df1$depth from the dataset df2 (df2$minDepth & df2$Depth) and that by group (Station and Transect) in order to join them by the column depth, Station and Transect. The problem is the max depth depends of the station, therefore from one group of data to another the interval can be 100-200 or 100-175, which make it quite complex.

e.g. if for df2 the interval from a given station in a given transect is 400-1000, I would like the temperature from df1 being aggregate from depth >400 to <= 1000.

Another possibilities would be to replicate the rows of df2 with a df2$minDepth + 0.5 until reaching df2$maxDepth value and then join by depth.

In both case I do not really know how to proceed.

df1 <- structure(list(Transect = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L), .Label = c("1", "2", "3", "4"), class = "factor"), Station = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("", "1", "2", "3", "4", 
"5", "6", "7", "8"), class = "factor"), temperature = c(0.0904, 
0.15, 0.7691, 0.0146, -0.3466, 1.9339, -0.0583, 0.208, -0.0755, 
1.9752, 2.0051, 1.9778, 1.9825, 3.3611, 2.3119, 0.2727, 0.0849, 
2.4664, 2.6563, 1.2256, 1.4938, 1.2118, 3.561, 2.0338, 1.8384, 
1.2348, 1.9761, 0.0472, 1.9731, 1.967, 0.0832, 1.9593, 1.6035, 
-0.5551, 2.0336, 0.2156, -0.1609, 0.1558, 0.2346, -0.4225, 1.9736, 
-0.0274, 1.9396, 1.1732, 2.0147, 1.4887, -0.0349, 1.4741, -0.1417, 
1.0949, 2.6406, 1.0877, 1.8169, 2.5692, 2.4662, -0.2685, 1.9774, 
1.9705, 2.0462, -0.1153, -0.6441, -0.8629, 1.9899, 1.6423, 1.541, 
-0.2768, 2.4161, 1.9753, 2.6955, 0.5983, -0.2354, -0.8856, -0.923, 
0.2718, 1.9774, 1.1851, 2.7037, -0.8477, 2.5275, -0.6093, 1.3051, 
2.0382, 2.4863, 1.3012, 1.9462, 2.0294, 1.9392, 1.421, 1.9744, 
1.9761, -1.0952, 1.5835, 2.6097, -0.3529, 1.9355, 1.9773, -0.743, 
2.603, 1.5249, -0.8462, 0.4651, 1.5508, -1.2618, -1.5018, -1.686, 
1.3767, 1.7305, 1.4991, 1.5003, 0.8662, 1.6691, -0.2488, 1.3982, 
1.6882, -1.6824, -1.4222, 0.9335, -1.5617, 1.3818, 0.2459, 1.7215, 
-1.677, -1.0255, 1.3761, 1.1095, 0.5204, 1.7848, -1.5796, 0.424, 
-1.5652, 1.3739, 3.9624, 1.5481, 1.1437, -1.5125, -1.6832, 1.4667, 
0.6127, 1.475, 1.4825, -0.6471, 1.0726, -1.6796, 1.1914, -1.2973, 
-1.6244, 0.6191, 1.1524, 1.6302, -1.4496, 0.7461, 0.6422, 2.7357, 
1.0529, 0.6229, 0.8845, 1.0181, -1.5218, 1.0376, -0.1418, 1.005, 
0.98, 2.7239, -1.5238, 0.9606, 0.8511, 0.7578, 1.013, 1.0136, 
6.0901, 0.2868, -1.5776, -0.2276, 2.6903, -1.3518, 1.0335, 0.7362, 
0.4402, 0.8468, 0.2621, -0.0475, 0.9451, -1.4984, 0.7847, 2.598, 
0.6331, 0.9527, 0.6488, 0.6407, 0.6129, -1.5152, 0.3581, 1.0337, 
0.9938, 0.9142, 1.0187, 0.85, 0.6053, -0.8318, 0.7135), depth = c(161.9, 
176.7, 30.5, 66.3, 128.3, 420.5, 61.1, 180.7, 77.1, 934.5, 474.5, 
500.5, 768.5, 6.7, 8.5, 187.5, 157.9, 4.5, 32.7, 272.5, 37.9, 
40.7, 13.1, 446.5, 406.5, 276.5, 794.5, 59.1, 818.5, 722.5, 152.3, 
526.5, 362.5, 117.1, 444.5, 56.5, 95.1, 177.1, 46.5, 87.1, 908.5, 
143.1, 666.5, 242.5, 432.5, 338.5, 141.9, 342.5, 79.1, 208.5, 
29.1, 204, 408, 37.4, 39, 170.7, 922, 702, 438, 178.6, 146.2, 
108.4, 484, 366, 348, 168.7, 44.7, 890, 22.6, 18, 171.9, 126.9, 
100.1, 205.7, 938, 264, 23.6, 109.2, 43.9, 147.4, 280, 436, 41.1, 
276, 550, 454, 574, 312, 876, 902, 85, 354, 35.6, 165.8, 658, 
920, 81.3, 31.5, 320, 132.5, 8.8, 242.1, 101.2, 30.5, 64.4, 321.8, 
205.9, 273.1, 270.3, 179, 197.1, 145.2, 304.9, 200.7, 68.4, 94.8, 
170.6, 82.4, 317.4, 151.8, 227.7, 66.8, 115.6, 321, 176.3, 67, 
213.2, 35.5, 153, 34.7, 323, 25, 246.5, 176.7, 88.8, 64.8, 292.7, 
161.8, 287.9, 279.5, 15.9, 53, 59.6, 180.3, 98.4, 70.4, 162.6, 
177.9, 195.5, 93.2, 298.2, 238.2, 365.5, 475.4, 223.4, 346.6, 
422.9, 78.8, 453.7, 126.6, 395.8, 377.8, 387.5, 75.6, 372.2, 
334.6, 307, 405.6, 413.2, 4.2, 143.8, 42.2, 97.5, 333.5, 90.4, 
446.1, 294.2, 171.4, 333, 157.5, 129.8, 361.8, 56, 313.8, 299.5, 
229.8, 367.8, 240.6, 235, 11.4, 77.6, 159.4, 445.7, 388.2, 357.4, 
427.3, 334.2, 211.8, 109.4, 285.4)), row.names = c(NA, -200L), class = c("tbl_df", 
"tbl", "data.frame"))

df2 <- structure(list(Transect = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3", "4", "6", 
"7"), class = "factor"), Station = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 9L, 9L, 9L, 9L, 9L), .Label = c("1", "10", "11", 
"12", "14", "16", "17", "18", "2", "20", "22", "23", "24", "3", 
"4", "5", "7", "8", "9"), class = "factor"), minDepth = c(100L, 
100L, 50L, 50L, 50L, 25L, 25L, 25L, 200L, 100L, 100L, 100L, 100L, 
100L, 100L, 50L, 100L, 100L, 100L, 100L, 50L, 50L, 0L, 0L, 0L, 
200L, 200L, 200L, 200L, 200L, 200L, 200L, 200L, 50L, 50L, 25L, 
25L, 0L, 0L, 0L, 200L, 100L, 50L, 50L, 50L, 25L, 0L, 0L, 50L, 
100L, 200L, 200L, 200L, 400L, 400L, 0L, 0L, 100L, 100L, 100L), 
    maxDepth = c(175L, 175L, 100L, 100L, 100L, 50L, 50L, 50L, 
    225L, 200L, 200L, 200L, 200L, 200L, 200L, 100L, 200L, 200L, 
    200L, 200L, 100L, 100L, 25L, 25L, 25L, 300L, 300L, 300L, 
    300L, 300L, 300L, 300L, 300L, 100L, 100L, 50L, 50L, 25L, 
    25L, 25L, 480L, 200L, 100L, 100L, 100L, 50L, 25L, 50L, 100L, 
    200L, 400L, 400L, 400L, 850L, 850L, 50L, 50L, 200L, 200L, 
    200L), `Average length(µm)` = c(2925, 4233.33333333333, 
    4367.5, 4367.5, 4367.5, 3360, 3360, 3360, 4404.16666666667, 
    2497.22222222222, 2497.22222222222, 2497.22222222222, 3277.5, 
    3277.5, 6593.75, 4525, 2822.5, 2822.5, 2822.5, 4393.75, 1992.5, 
    1992.5, 2012.5, 2012.5, 2012.5, 2902.5, 2902.5, 2902.5, 3232.5, 
    3232.5, 6675, 6675, 6675, 4325, 4325, 2064.28571428571, 5391.66666666667, 
    1925, 1925, 1925, 4162.5, 4216.66666666667, 2432.5, 2432.5, 
    3475, 2500, 3250, 3283.75, 3962.5, 4329.16666666667, 4412.5, 
    4412.5, 4412.5, 5252.5, 5252.5, 3550, 6400, 2345, 2345, 2345
    )), row.names = c(NA, -60L), class = c("tbl_df", "tbl", "data.frame"
))

Solution

I think what you need are non-equi joins to match the depth in df1 with the depth ranges in df2. I can think of 2 approaches which can handle this.

The sqldf package allows you to use SQL syntax to manipulate data frames.

library(sqldf)

# get the unique intervals from df2
intervals <- unique(df2[,1:4])

# join the intervals to df1
new_df1 <- sqldf('select df1.*, i.minDepth, i.maxDepth from df1 left join intervals i 
                 on (df1.Transect=i.Transect and df1.Station=i.Station 
                        and df1.depth > i.minDepth and df1.depth <= i.maxDepth)')


# calculate mean temperature using the intervals to group
sqldf('select Transect,Station,minDepth,maxDepth,avg(temperature) as mean_temp
        from new_df1
        group by Transect, Station, minDepth, maxDepth
        order by Transect, Station, minDepth, maxDepth')

The data.table package also allows non-equi joins.

library(data.table)

# convert data frames to data.table
dt1 <- as.data.table(df1)
dt2 <- as.data.table(df2)

# get unique intervals rom dt2
intervalsdt <- unique(dt2[,1:4])

# join the intervals on to dt1
new_dt1 <- intervalsdt[dt1,  
                       .(Transect, Station, minDepth=x.minDepth, maxDepth=x.maxDepth, temperature), # the minDepth=x.minDepth and maxDepth=x.maxDepth are needed to take these values from the left data table (dt1), otherwise you will get the matched data (both equal to depth from intervalsdt)
                       on=.(Transect==Transect, Station==Station, minDepth<depth, maxDepth>=depth), # this is equivalent to the ON ... part of a SQL join
                       nomatch=NA] # nomatch=NA makes this into a right join so it keeps the rows which dont fall within an interval

# calculate mean temperature
new_dt1[, .(mean_temp=mean(temperature)), by=.(Transect, Station, minDepth, maxDepth)
        ][order(Transect, Station, minDepth, maxDepth)]