Search code examples
rnon-equi-join

Join data by interval from another dataset (non-equi joins) [R]


I am trying to aggregate (FUN = mean) the dataset df1 by intervals of df1$depth from the dataset df2 (df2$minDepth & df2$Depth) and that by group (Station and Transect) in order to join them by the column depth, Station and Transect. The problem is the max depth depends of the station, therefore from one group of data to another the interval can be 100-200 or 100-175, which make it quite complex.

e.g. if for df2 the interval from a given station in a given transect is 400-1000, I would like the temperature from df1 being aggregate from depth >400 to <= 1000.

Another possibilities would be to replicate the rows of df2 with a df2$minDepth + 0.5 until reaching df2$maxDepth value and then join by depth.

In both case I do not really know how to proceed.

df1 <- structure(list(Transect = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L), .Label = c("1", "2", "3", "4"), class = "factor"), Station = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("", "1", "2", "3", "4", 
"5", "6", "7", "8"), class = "factor"), temperature = c(0.0904, 
0.15, 0.7691, 0.0146, -0.3466, 1.9339, -0.0583, 0.208, -0.0755, 
1.9752, 2.0051, 1.9778, 1.9825, 3.3611, 2.3119, 0.2727, 0.0849, 
2.4664, 2.6563, 1.2256, 1.4938, 1.2118, 3.561, 2.0338, 1.8384, 
1.2348, 1.9761, 0.0472, 1.9731, 1.967, 0.0832, 1.9593, 1.6035, 
-0.5551, 2.0336, 0.2156, -0.1609, 0.1558, 0.2346, -0.4225, 1.9736, 
-0.0274, 1.9396, 1.1732, 2.0147, 1.4887, -0.0349, 1.4741, -0.1417, 
1.0949, 2.6406, 1.0877, 1.8169, 2.5692, 2.4662, -0.2685, 1.9774, 
1.9705, 2.0462, -0.1153, -0.6441, -0.8629, 1.9899, 1.6423, 1.541, 
-0.2768, 2.4161, 1.9753, 2.6955, 0.5983, -0.2354, -0.8856, -0.923, 
0.2718, 1.9774, 1.1851, 2.7037, -0.8477, 2.5275, -0.6093, 1.3051, 
2.0382, 2.4863, 1.3012, 1.9462, 2.0294, 1.9392, 1.421, 1.9744, 
1.9761, -1.0952, 1.5835, 2.6097, -0.3529, 1.9355, 1.9773, -0.743, 
2.603, 1.5249, -0.8462, 0.4651, 1.5508, -1.2618, -1.5018, -1.686, 
1.3767, 1.7305, 1.4991, 1.5003, 0.8662, 1.6691, -0.2488, 1.3982, 
1.6882, -1.6824, -1.4222, 0.9335, -1.5617, 1.3818, 0.2459, 1.7215, 
-1.677, -1.0255, 1.3761, 1.1095, 0.5204, 1.7848, -1.5796, 0.424, 
-1.5652, 1.3739, 3.9624, 1.5481, 1.1437, -1.5125, -1.6832, 1.4667, 
0.6127, 1.475, 1.4825, -0.6471, 1.0726, -1.6796, 1.1914, -1.2973, 
-1.6244, 0.6191, 1.1524, 1.6302, -1.4496, 0.7461, 0.6422, 2.7357, 
1.0529, 0.6229, 0.8845, 1.0181, -1.5218, 1.0376, -0.1418, 1.005, 
0.98, 2.7239, -1.5238, 0.9606, 0.8511, 0.7578, 1.013, 1.0136, 
6.0901, 0.2868, -1.5776, -0.2276, 2.6903, -1.3518, 1.0335, 0.7362, 
0.4402, 0.8468, 0.2621, -0.0475, 0.9451, -1.4984, 0.7847, 2.598, 
0.6331, 0.9527, 0.6488, 0.6407, 0.6129, -1.5152, 0.3581, 1.0337, 
0.9938, 0.9142, 1.0187, 0.85, 0.6053, -0.8318, 0.7135), depth = c(161.9, 
176.7, 30.5, 66.3, 128.3, 420.5, 61.1, 180.7, 77.1, 934.5, 474.5, 
500.5, 768.5, 6.7, 8.5, 187.5, 157.9, 4.5, 32.7, 272.5, 37.9, 
40.7, 13.1, 446.5, 406.5, 276.5, 794.5, 59.1, 818.5, 722.5, 152.3, 
526.5, 362.5, 117.1, 444.5, 56.5, 95.1, 177.1, 46.5, 87.1, 908.5, 
143.1, 666.5, 242.5, 432.5, 338.5, 141.9, 342.5, 79.1, 208.5, 
29.1, 204, 408, 37.4, 39, 170.7, 922, 702, 438, 178.6, 146.2, 
108.4, 484, 366, 348, 168.7, 44.7, 890, 22.6, 18, 171.9, 126.9, 
100.1, 205.7, 938, 264, 23.6, 109.2, 43.9, 147.4, 280, 436, 41.1, 
276, 550, 454, 574, 312, 876, 902, 85, 354, 35.6, 165.8, 658, 
920, 81.3, 31.5, 320, 132.5, 8.8, 242.1, 101.2, 30.5, 64.4, 321.8, 
205.9, 273.1, 270.3, 179, 197.1, 145.2, 304.9, 200.7, 68.4, 94.8, 
170.6, 82.4, 317.4, 151.8, 227.7, 66.8, 115.6, 321, 176.3, 67, 
213.2, 35.5, 153, 34.7, 323, 25, 246.5, 176.7, 88.8, 64.8, 292.7, 
161.8, 287.9, 279.5, 15.9, 53, 59.6, 180.3, 98.4, 70.4, 162.6, 
177.9, 195.5, 93.2, 298.2, 238.2, 365.5, 475.4, 223.4, 346.6, 
422.9, 78.8, 453.7, 126.6, 395.8, 377.8, 387.5, 75.6, 372.2, 
334.6, 307, 405.6, 413.2, 4.2, 143.8, 42.2, 97.5, 333.5, 90.4, 
446.1, 294.2, 171.4, 333, 157.5, 129.8, 361.8, 56, 313.8, 299.5, 
229.8, 367.8, 240.6, 235, 11.4, 77.6, 159.4, 445.7, 388.2, 357.4, 
427.3, 334.2, 211.8, 109.4, 285.4)), row.names = c(NA, -200L), class = c("tbl_df", 
"tbl", "data.frame"))
df2 <- structure(list(Transect = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3", "4", "6", 
"7"), class = "factor"), Station = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 9L, 9L, 9L, 9L, 9L), .Label = c("1", "10", "11", 
"12", "14", "16", "17", "18", "2", "20", "22", "23", "24", "3", 
"4", "5", "7", "8", "9"), class = "factor"), minDepth = c(100L, 
100L, 50L, 50L, 50L, 25L, 25L, 25L, 200L, 100L, 100L, 100L, 100L, 
100L, 100L, 50L, 100L, 100L, 100L, 100L, 50L, 50L, 0L, 0L, 0L, 
200L, 200L, 200L, 200L, 200L, 200L, 200L, 200L, 50L, 50L, 25L, 
25L, 0L, 0L, 0L, 200L, 100L, 50L, 50L, 50L, 25L, 0L, 0L, 50L, 
100L, 200L, 200L, 200L, 400L, 400L, 0L, 0L, 100L, 100L, 100L), 
    maxDepth = c(175L, 175L, 100L, 100L, 100L, 50L, 50L, 50L, 
    225L, 200L, 200L, 200L, 200L, 200L, 200L, 100L, 200L, 200L, 
    200L, 200L, 100L, 100L, 25L, 25L, 25L, 300L, 300L, 300L, 
    300L, 300L, 300L, 300L, 300L, 100L, 100L, 50L, 50L, 25L, 
    25L, 25L, 480L, 200L, 100L, 100L, 100L, 50L, 25L, 50L, 100L, 
    200L, 400L, 400L, 400L, 850L, 850L, 50L, 50L, 200L, 200L, 
    200L), `Average length(µm)` = c(2925, 4233.33333333333, 
    4367.5, 4367.5, 4367.5, 3360, 3360, 3360, 4404.16666666667, 
    2497.22222222222, 2497.22222222222, 2497.22222222222, 3277.5, 
    3277.5, 6593.75, 4525, 2822.5, 2822.5, 2822.5, 4393.75, 1992.5, 
    1992.5, 2012.5, 2012.5, 2012.5, 2902.5, 2902.5, 2902.5, 3232.5, 
    3232.5, 6675, 6675, 6675, 4325, 4325, 2064.28571428571, 5391.66666666667, 
    1925, 1925, 1925, 4162.5, 4216.66666666667, 2432.5, 2432.5, 
    3475, 2500, 3250, 3283.75, 3962.5, 4329.16666666667, 4412.5, 
    4412.5, 4412.5, 5252.5, 5252.5, 3550, 6400, 2345, 2345, 2345
    )), row.names = c(NA, -60L), class = c("tbl_df", "tbl", "data.frame"
))

Solution

  • I think what you need are non-equi joins to match the depth in df1 with the depth ranges in df2. I can think of 2 approaches which can handle this.

    The sqldf package allows you to use SQL syntax to manipulate data frames.

    library(sqldf)
    
    # get the unique intervals from df2
    intervals <- unique(df2[,1:4])
    
    # join the intervals to df1
    new_df1 <- sqldf('select df1.*, i.minDepth, i.maxDepth from df1 left join intervals i 
                     on (df1.Transect=i.Transect and df1.Station=i.Station 
                            and df1.depth > i.minDepth and df1.depth <= i.maxDepth)')
    
    
    # calculate mean temperature using the intervals to group
    sqldf('select Transect,Station,minDepth,maxDepth,avg(temperature) as mean_temp
            from new_df1
            group by Transect, Station, minDepth, maxDepth
            order by Transect, Station, minDepth, maxDepth')
    
    

    The data.table package also allows non-equi joins.

    library(data.table)
    
    # convert data frames to data.table
    dt1 <- as.data.table(df1)
    dt2 <- as.data.table(df2)
    
    # get unique intervals rom dt2
    intervalsdt <- unique(dt2[,1:4])
    
    # join the intervals on to dt1
    new_dt1 <- intervalsdt[dt1,  
                           .(Transect, Station, minDepth=x.minDepth, maxDepth=x.maxDepth, temperature), # the minDepth=x.minDepth and maxDepth=x.maxDepth are needed to take these values from the left data table (dt1), otherwise you will get the matched data (both equal to depth from intervalsdt)
                           on=.(Transect==Transect, Station==Station, minDepth<depth, maxDepth>=depth), # this is equivalent to the ON ... part of a SQL join
                           nomatch=NA] # nomatch=NA makes this into a right join so it keeps the rows which dont fall within an interval
    
    # calculate mean temperature
    new_dt1[, .(mean_temp=mean(temperature)), by=.(Transect, Station, minDepth, maxDepth)
            ][order(Transect, Station, minDepth, maxDepth)]