Search code examples
sqljoinconcatenationsql-server-2016common-table-expression

SQL - Create a new table by joining two tables based on Start and End values and then Concatenate rows based on Start and End values?


This is a follow up question of How to extract multiple rows from a table based on values from multiple columns from another table and then concatenate in SQL?. I am new to SQL. I have 2 tables Table 1 and Table 2. Table 1 have columns ID, SSTART, SEND, UPSTART, UPEND, DNSTART, DNSTAND. Table 2 have columns Position and Seq.

Table 1

ID UPSTART UPEND SStart SEnd DNSTART DNEND
1 98 99 100 104 105 106
2 98 99 100 104 105 106
3 100 101 102 106 107 108
4 100 101 102 106 107 108

Table 2

Position Seq
98 M
99 N
100 A
101 T
102 C
103 T
104 G
105 T
106 T
107 G
108 T
109 G

My final table needs Columns ID, SStart, SSEnd, FullSeq, UPStart, UPEnd, UPSeq, DNStart, DNStand, DnSeq as shown below:

ID UpStart UpEnd UpSeq SStart SSEnd FullSeq DNStart DNStand DnSeq
1 98 99 MN 100 104 ATCTG 105 106 TT
2 98 99 MN 100 104 ATCTG 105 106 TT
3 100 101 AT 102 106 CTGTT 107 108 GT
4 100 101 AT 102 106 CTGTT 107 108 GT

I am using SQL server 2016. I tried

; WITH SequenceCTE AS(
     SELECT  
     [ID] , 
     [SStart],
     [SSEnd],
     [UpStart],
     [UpEnd],
     Seq,
    [DnStart],
    [DnEnd],
    [Position]
  FROM Table_1 a
        JOIN Table_2 b
            ON b.Position >= a.[UpStart] AND
              b.Position <= a.[DnEnd]
)


SELECT  DISTINCT
    a.ID,
    a.[UpStart],
    a.[UpEnd],      
    UpSeq = (
        SELECT  STUFF(',' + Seq,1,1,'')
        FROM    SequenceCTE b
        WHERE   
                 a.ID = b.ID AND
                 a.[Position] > = b.[UpStart] AND
                 a.[Position] < = b.[UpEnd] AND
                 
                order by b.Position
        FOR XML PATH ('') 
        
        
    ) , 
    
    a.[SStart], 
    a.[SSEnd],
    FullSeq =  (
        SELECT  STUFF(',' + Seq,1,1,'')
        FROM    SequenceCTE b
        WHERE   
                 a.ID = b.ID AND
                 a.[SStart] = b.[SStart] AND
                 a.[SSEnd] = b.[SSEnd]
                order by b.Position
        FOR XML PATH ('') 
        
        
    ) , 
   
    a.[DnStart],
    a.[DnEnd],
    DownSeq = (
        SELECT  STUFF(',' + Seq,1,1,'')
        FROM    SequenceCTE b
        WHERE   
                 a.ID = b.ID AND
                a.[Position] > = b.[DnStart] AND
                a.[Position] < =  b.[DnEnd]                 
                order by b.Position
                
        FOR XML PATH ('') 
        
    )
FROM    SequenceCTE a 

But it didn't work. Then I tried

With FullSeqCTE as(
 Select * ,
 b1.Seq as FullSeq 
 from Table_1 a 
 join Table_2 b1 ON b1.Position >= a.SStart and b1.Position <= a.SSEnd
 ), UpperSeqCTE as(
 Select * ,
 b2.Seq as UpSeq
 from Table_1 a 
join Table_2 b2 ON b2.Position >= a.UpStart and b2.Position <= a.UpEnd
), LowerSeqCTE as (
Select * ,
b3.Seq as DownSeq
 from Table_1 a 
 join Table_2 b3 ON b3.Position > = a.DnStart and b3.Position < = a.DnEnd
)

But I am not sure how to proceed. Thank you very much for all the help.

Create Statement Table 1

   CREATE TABLE [Table_1](
    [SStart] [int] NULL,
    [SSend] [int] NULL,
    [ID] [int] NULL,
    [UpStart] [int] NULL,
    [UpEnd] [int] NULL,
    [DnStart] [int] NULL,
    [DnEnd] [int] NULL
) ON [PRIMARY]
GO

Insert Statement Table 1

  INSERT INTO [Table_1]
       ([ID]        
       ,[UpStart]
       ,[UpEnd]
        ,[SStart]
         ,[SSend]
       ,[DnStart]
       ,[DnEnd])
   VALUES
       (1,98,99,100,104,105,106),
       (2,98,99,100,104,105,106),
       (3,100,101,102,106,107,108),
       (4,100,101,102,106,107,108)
 GO

Create Statement Table 2

 CREATE TABLE [Table_2](
  [Position] [int] NULL,
  [Seq] [nvarchar](1) NULL
 ) ON [PRIMARY]
Go

Insert Statement for Table 2

 INSERT INTO [dbo].[Table_2]
       ([Position]
       ,[Seq])
 VALUES
       (98,'M'),
       (99,'N'),
       (100,'A'),
       (101,'T'),
       (102,'C'),
       (103,'T'),
       (104,'G'),
       (105,'T'),
       (106,'T'),
       (107,'G'),
       (108,'T'),
       (109,'G')
GO


  

Solution

  • I am using common table expressions (cte_Up, cte_S, cte_Dn) to limit the required groupings.

    Solution 1

    When working with SQL Server 2017 or later, you can use the STRING_AGG() function to concatenate columns.

    with cte_Up as
    (
      select t1.Id, string_agg(t2.Seq, '') within group (order by t2.Position) as UpSeq
      from table_1 t1
      join table_2 t2
        on  t2.Position >= t1.UpStart
        and t2.Position <= t1.UpEnd
      group by t1.Id
    ),
    cte_S as
    (
      select t1.Id, string_agg(t2.Seq, '') within group (order by t2.Position) as SSeq
      from table_1 t1
      join table_2 t2
        on  t2.Position >= t1.SStart
        and t2.Position <= t1.SEnd
      group by t1.Id
    ),
    cte_Dn as
    (
      select t1.Id, string_agg(t2.Seq, '') within group (order by t2.Position) as DnSeq
      from table_1 t1
      join table_2 t2
        on  t2.Position >= t1.DnStart
        and t2.Position <= t1.DnEnd
      group by t1.Id
    )
    select t1.Id,
           t1.UpStart,
           t1.UpEnd,
           u.UpSeq,
           t1.SStart,
           t1.SEnd,
           s.SSeq,
           t1.DnStart,
           t1.DnEnd,
           d.DnSeq
    from table_1 t1
    join cte_Up u
      on u.Id = t1.Id
    join cte_S s
      on s.Id = t1.Id
    join cte_Dn d
      on d.Id = t1.Id;
    

    Fiddle to see things in action.


    Solution 2

    Using the for xml clause to achieve string concatenation when string_agg() is not availabe.

    with cte_Up as
    (
      select t1.Id,
             ( select '' + t2.Seq
               from table_2 t2
               where t2.Position >= t1.UpStart
                 and t2.Position <= t1.UpEnd
               order by t2.Position
               for xml path('') ) as UpSeq
      from table_1 t1
    ),
    cte_S as
    (
      select t1.Id,
             ( select '' + t2.Seq
               from table_2 t2
               where t2.Position >= t1.SStart
                 and t2.Position <= t1.SEnd
               order by t2.Position
               for xml path('') ) as SSeq
      from table_1 t1
    ),
    cte_Dn as
    (
      select t1.Id,
             ( select '' + t2.Seq
               from table_2 t2
               where t2.Position >= t1.DnStart
                 and t2.Position <= t1.DnEnd
               order by t2.Position
               for xml path('') ) as DnSeq
      from table_1 t1
    )
    select t1.Id,
           t1.UpStart,
           t1.UpEnd,
           u.UpSeq,
           t1.SStart,
           t1.SEnd,
           s.SSeq,
           t1.DnStart,
           t1.DnEnd,
           d.DnSeq
    from table_1 t1
    join cte_Up u
      on u.Id = t1.Id
    join cte_S s
      on s.Id = t1.Id
    join cte_Dn d
      on d.Id = t1.Id;
    

    Fiddle to see things in action.


    Result

    Id | UpStart UpEnd UpSeq | SStart SEnd SSeq  | DnStart DnEnd DnSeq
    -- | ------- ----- ----- | ------ ---- ----- | ------- ----- -----
    1  | 98       99   MN    | 100    104  ATCTG | 105     106   TT
    2  | 98       99   MN    | 100    104  ATCTG | 105     106   TT
    3  | 100      101  AT    | 102    106  CTGTT | 107     108   GT
    4  | 100      101  AT    | 102    106  CTGTT | 107     108   GT