I have a table that has a long list of duplicated items. I am working on a stored procedure to consolidate them all into one record. Each one of the duplicated items has a number of child tables that should either be deleted, or rekeyed to point at the resulting record. My table has an Id, but the ReadableIdentifier is the column I need to deduplicate.
Id | ReadableIdentifier | Name | UpdatedOn
1 | ABC1234 | Product X | 2014-04-25 16:00:08.000
2 | ABC1234 | Product X | 2014-04-28 16:00:08.000
3 | ABC1234 | Product X | 2014-04-21 16:00:08.000
4 | ABDD9945 | Widget R | 2014-04-25 16:00:08.000
5 | ABDD9945 | Widget R | 2014-04-25 18:45:08.000
As you can see, records 1-3 are duplicates with different Id's and UpdatedOn dates. Same for 4-5. I need to consolidate these into one record, preferring the one with the most recent UpdatedOn date.
End Goal (not showing children tables):
Id | ReadableIdentifier | Name | UpdatedOn
2 | ABC1234 | Product X | 2014-04-28 16:00:08.000
5 | ABDD9945 | Widget R | 2014-04-25 18:45:08.000
I am using a CURSOR
to do this, but am wondering if there is a better solution.
DECLARE dupeCursor CURSOR
FAST_FORWARD
FOR
WITH Counts AS (
SELECT
COUNT(1) Count,
ReadableIdentifier
FROM dbo.Item WITH (NOLOCK)
WHERE ReadableIdentifier IS NOT NULL
GROUP BY ReadableIdentifier)
SELECT
Counts.Count,
Counts.ReadableIdentifier,
Counts.CompanyId
FROM
Counts
WHERE Counts.Count > 1;
OPEN dupeCursor;
DECLARE @readableId VARCHAR(50);
DECLARE @itemToPersistId INT, @itemToDeleteId INT;
FETCH NEXT FROM dupeCursor INTO @readableId;
WHILE @@FETCH_STATUS = 0
BEGIN
WITH V AS (
SELECT Id, ROW_NUMBER() OVER (PARTITION BY ReadableId ORDER BY UpdatedOn DESC) as Row
FROM dbo.Item WITH (NOLOCK) WHERE ReadableId = @readableId
)
SELECT @itemToPersistId = Id
FROM V
WHERE V.Row = 1
CREATE TABLE #itemsToDelete (Id UNIQUEIDENTIFIER)
INSERT INTO #itemsToDelete
SELECT Id
FROM dbo.Item WITH (NOLOCK)
WHERE ReadableId = @readableId AND Id != @itemToPersistId;
--UPDATE CHILDREN TABLES
DELETE FROM dbo.ItemDetails WHERE ItemId IN (SELECT Id FROM #itemsToDelete);
UPDATE dbo.ItemPurchases SET ItemId = @itemToPersistId
WHERE ItemId IN (SELECT Id FROM #itemsToDelete);
UPDATE dbo.PurchaseOrders SET ItemId = @itemToPersistId
WHERE ItemId IN (SELECT Id FROM #itemsToDelete);
DELETE FROM dbo.ItemMetadata WHERE ItemId IN (SELECT Id FROM #itemsToDelete);
--delete Duplicated Items
DELETE FROM dbo.Item WHERE Id IN (SELECT Id FROM #itemsToDelete);
DROP TABLE #itemsToDelete
FETCH NEXT FROM dupeCursor INTO @readableId;
END
CLOSE dupeCursor;
DEALLOCATE dupeCursor;
I realize the cursor is most likely the issue, but I'm not sure how to go about updating all of the child tables without using one.
Ok I dont have data to test this for the child tables but it should work:
WITH V
AS (SELECT *,
ROW_NUMBER() OVER(PARTITION BY ReadableId ORDER BY UpdatedOn DESC) AS Row
FROM dbo.Item WITH (NOLOCK))
SELECT *
INTO #itemsToDelete
FROM V;
--UPDATE CHILDREN TABLES
DELETE FROM dbo.ItemDetails
WHERE ItemId IN
(
SELECT Id
FROM #itemsToDelete
WHERE Row > 1
);
UPDATE IP
SET
IP.ItemId = itk.ID
FROM dbo.ItemPurchases AS IP
INNER JOIN #itemsToDelete AS itd ON IP.ItemId = itd.ID
AND itd.Row > 1
INNER JOIN #itemsToDelete AS itk ON itk.ReadableIdentifier = itd.ReadableIdentifier
AND itk.Row = 1
AND itd.Row > 1;
UPDATE po
SET
po.ItemId = itk.ID
FROM dbo.PurchaseOrders AS po
INNER JOIN #itemsToDelete AS itd ON po.ItemId = itd.ID
AND itd.Row > 1
INNER JOIN #itemsToDelete AS itk ON itk.ReadableIdentifier = itd.ReadableIdentifier
AND itk.Row = 1
AND itd.Row > 1;
DELETE FROM dbo.ItemMetadata
WHERE ItemId IN
(
SELECT Id
FROM #itemsToDelete
WHERE Row > 1
);
--delete Duplicated Items
DELETE FROM dbo.Item
WHERE Id IN
(
SELECT Id
FROM #itemsToDelete
WHERE Row > 1
);