Search code examples
db2

Combining Inner Joins and Between Statements in the Same Query


I have these two tables:

CREATE TABLE table_a (
  name VARCHAR(255),
  date_1 DATE
);


INSERT INTO table_a (name, date_1) VALUES
('john', '2010-01-01'),
('john', '2012-02-01'),
('john', '2017-08-01'),
('sara', '2008-04-01'),
('sara', '2011-04-01'),
('tim', '2000-01-01'),
('tim', '2001-01-01'),
('alex', '2013-01-01');


CREATE TABLE table_b (
  name VARCHAR(255),
  date_2 DATE,
  date_3 DATE,
  var CHAR(1)
);


INSERT INTO table_b (name, date_2, date_3, var) VALUES
('john', '2001-01-01', '2015-01-01', 'b'),
('sara', '2000-01-01', '2015-01-01', 'c'),
('sara', '2015-01-02', '2022-01-01', 'a'),
('tim', '2020-01-01', '2021-01-01', 'a'),
('john', '1998-01-01', '1999-01-01', 'd');

They look like this:

#table_a
      name     date_1
 john 2010-01-01
 john 2012-02-01
 john 2017-08-01
 sara 2008-04-01
 sara 2011-04-01
  tim 2000-01-01
  tim 2001-01-01
 alex 2013-01-01

 #table_b
  name     date_2     date_3 var
 john 2001-01-01 2015-01-01   b
 sara 2000-01-01 2015-01-01   c
 sara 2015-01-02 2022-01-01   a
  tim 2020-01-01 2021-01-01   a
 john 1998-01-01 1999-01-01   d

Here is what I am trying to accomplish:

Part 1: Exact Join

  • For rows in table_a and table_b (based on name) - see if date_1 falls between a pair of (date_2, date_3) within table_b. If yes,then join

Part 2: Window Join

  • For rows that were not joined in Part 1 (i.e. could include names that were analyzed in part 1), see if these names appear in table_b
  • If yes, then see if the name (table_a) has a row in table_b with a date_2 that occurs BEFORE the date_1
  • If yes, then join on row closest to the earliest date_1
  • Otherwise, no join

Here is the code I am using:

# random ID approach : faster

WITH exact_join AS (
  SELECT a.*, b.var, random() as random_id
  FROM table_a a
  LEFT JOIN table_b b ON a.name = b.name AND a.date_1 BETWEEN b.date_2 AND b.date_3
),
window_join AS (
  SELECT a.*, b.var
  FROM table_a a
  LEFT JOIN (
    SELECT name, var, date_2, ROW_NUMBER() OVER (PARTITION BY name ORDER BY date_2 DESC) as rn
    FROM table_b
  ) b ON a.name = b.name AND a.date_1 > b.date_2
  WHERE b.rn = 1 AND a.random_id NOT IN (SELECT random_id FROM exact_join)
)
SELECT * FROM exact_join
UNION ALL
SELECT * FROM window_join;

# non-random id approach (slower)

WITH exact_join AS (
  SELECT a.*, b.var, ROW_NUMBER() OVER (ORDER BY 1) as id
  FROM table_a a
  LEFT JOIN table_b b ON a.name = b.name AND a.date_1 BETWEEN b.date_2 AND b.date_3
),
window_join AS (
  SELECT a.*, b.var
  FROM (
    SELECT *, ROW_NUMBER() OVER (ORDER BY 1) as id
    FROM table_a
  ) a
  LEFT JOIN (
    SELECT name, var, date_2, ROW_NUMBER() OVER (PARTITION BY name ORDER BY date_2 DESC) as rn
    FROM table_b
  ) b ON a.name = b.name AND a.date_1 > b.date_2
  WHERE b.rn = 1 AND a.id NOT IN (SELECT id FROM exact_join)
)
SELECT * FROM exact_join
UNION ALL
SELECT * FROM window_join;

I think the code is giving the correct output:

  name     date_1  var id
 john 2010-01-01    b  1
 john 2012-02-01    b  2
 john 2017-08-01 <NA>  3
 sara 2008-04-01    c  4
 sara 2011-04-01    c  5
  tim 2000-01-01 <NA>  6
  tim 2001-01-01 <NA>  7
 alex 2013-01-01 <NA>  8

Is this the correct use of the random() function to prevent rows from analyzed twice or skipped?


Solution

  • I think the result you are showing is incorrect according to the rule:

    If yes, then see if the name (table_a) has a row in table_b with a date_2 that occurs BEFORE the date_1

    In that case the third occurrence of John should have matching values instead of empty values. But which one: b or d? I chose the latest one according to dates (b).

    You can do:

    select a.name, a.date_1, b.var
    from table_a a
    left join lateral (
      select * from table_b b where b.name = a.name and b.date_2 < a.date_1
      order by b.date_2 desc fetch next 1 rows only
    ) b on 1 = 1
    

    Result:

     NAME  DATE_1      VAR  
     ----- ----------- ---- 
     john  2010-01-01  b    
     john  2012-02-01  b    
     john  2017-08-01  b    
     sara  2008-04-01  c    
     sara  2011-04-01  c    
     tim   2000-01-01  null 
     tim   2001-01-01  null 
     alex  2013-01-01  null 
    

    See running example at db<>fiddle.

    Now, if you are interested on improving the join performance you can create the index:

    create index ix1 on table_b (name, date_2);
    

    Extra solution without lateral joins (for the same price)

    Without the lateral join the query won't be as performant since you'll need that nasty double join. You can do:

    select a.name, a.date_1, b.var
    from table_a a
    left join table_b b on b.name = a.name and b.date_2 = (
      select max(date_2) from table_b c where c.name = a.name and c.date_2 < a.date_1
    )
    

    The fiddle is updated.