Search code examples
sqlsql-serverrecursion

How to write a SQL query to group accounts that share same phone numbers directly or indirectly in recursive way?


I am trying to group a list of accounts if they share same phone numbers directly or indirectly using SQL query in recursive way. I was trying traversing the parent-child hierarchy like most SQL recursive examples do, but it doesn't work because each parent in my case could have multiple children. I am expecting the query to return all three accounts within the same group.

e.g.

AccountID   PhoneNumber
1           1111111111
1           2222222222
2           1111111111
2           4444444444
3           3333333333
3           4444444444

I have tried common approach to this recursive problem in SQL by defining base case and recursive case, but I am getting "The maximum recursion 100 has been exhausted before statement completion." error.

-- Step 1: Define a recursive CTE to find connected components
WITH ConnectedAccounts AS (
    -- Base case: Select each account and its direct connections
    SELECT 
        a.AccountID AS RootAccount,
        a.AccountID,
        a.PhoneNumber,
        level = 0
    FROM 
        AccountPhoneNumbers a
    
    UNION ALL
    
    -- Recursive case: Connect accounts sharing a phone number
    SELECT 
        c.RootAccount,
        b.AccountID,
        b.PhoneNumber,
        c.level + 1
    FROM 
        AccountPhoneNumbers b 
        JOIN ConnectedAccounts c ON b.PhoneNumber = c.PhoneNumber
    WHERE 
        b.AccountID <> c.AccountID  -- Prevent looping back to the same account
),

-- Step 2: Select distinct groupings from the recursive result
DistinctGroups AS (
    SELECT DISTINCT
        RootAccount,
        AccountID
    FROM
        ConnectedAccounts 
)

-- Step 3: Aggregate accounts into groups based on their connected root account
SELECT
    RootAccount,
    STRING_AGG(AccountID, ',') AS ConnectedAccounts
FROM
    DistinctGroups 
GROUP BY
    RootAccount

Solution

  • Here's a couple of versions.

    The graph (might be slow):

    with 
        nodes as (
    v(grp)
        select accountid as id, phonenumber as grp
        from
        (
            VALUES  (1, 1111111111)
            ,   (1, 2222222222)
            ,   (2, 1111111111)
            ,   (2, 4444444444)
            ,   (3, 3333333333)
            ,   (3, 4444444444)
            ,   (4, 11111111113)
            ,   (5, 11111111113)
            ,   (5, 1111111111)
        ) t (AccountID,PhoneNumber)
    
        ),
        edges as (
            select distinct n1.id as id1, n2.id as id2
            from nodes n1
            inner join nodes n2 on n1.grp = n2.grp
        ),
        rec as (
            select id1, id2, cast(id1 as nvarchar(max)) as visited from edges
            union all
            select r.id1, e.id2, concat(r.visited, ',', e.id2)
            from rec r
            inner join edges e on e.id1 = r.id2
            where concat(',', r.visited, ',') not like concat('%,', e.id2, ',%')
        ),
        fin as (
            select id1, min(value) min_id
            from rec r
            cross apply string_split(r.visited, ',')
            group by id1
        )
    select id1 as id, dense_rank() over(order by min_id) grp
    from fin f
    

    The loop (probably needs index on phone):

    SELECT  AccountID AS ID, PhoneNumber AS Number, CAST(NULL AS INT) AS ID_To
    INTO #t
    FROM
    (
            VALUES  (1, 1111111111)
            ,   (1, 2222222222)
            ,   (2, 1111111111)
            ,   (2, 4444444444)
            ,   (3, 3333333333)
            ,   (3, 4444444444)
            ,   (4, 11111111113)
            ,   (5, 11111111113)
            ,   (5, 1111111111)
            ,   (6, 4444444444)
            ,   (7, 3333333333)
            ,   (8, 13)
            ,   (8, 7)
            ,   (8, 3)
            ,   (9, 13)
            ,   (10, 14)
        ) t (AccountID,PhoneNumber)
    
    -- Create index on Number
    
    WHILE @@rowcount > 0
    BEGIN
        UPDATE  t3
        SET ID_TO = x.ID_to
        FROm    (   
            SELECT  t2.ID, ID_to = MIN(ISNULL(t.ID_To, t.ID))
            FROM    #t t
            INNER JOIN #t t2
                ON  ISNULL(t.id_To, t.id) < ISNULL(t2.ID_to, t2.id)
                AND t.Number = t2.Number
                AND t.ID <> t2.ID
            GROUP BY t2.ID
            ) x
        INNER JOIN #t t3 -- Update ALL rows to the lowest value
            ON  t3.ID = x.ID
    END
    
    -- See created groups
    SELECT  ISNULL(ID_TO, ID) AS groups
    FROM    #t
    GROUP BY ISNULL(ID_TO, ID)