Search code examples
sql-servertemporal-tables

SQL Server - Updating data with effectiveFrom and EffectiveTo dates - Upsert optimisation


We have a requirement to create a table that maintains temporal validity (i.e. for a given key, in this case Md5 in the table below, there will be no overlapping periods). Users need the ability to set the EffectiveFrom and EffectiveTo dates therefore the temporal tables are not useful as they only appear to allow system generated dates. The use case is that bulk data will be uploaded with the validity date range set and this needs to be applied to the existing data to ensure that there are no period overlaps.

Table definitions:

IF OBJECT_ID('dbo.IngestedData', 'U') IS NOT NULL
    DROP TABLE IngestedData;

CREATE TABLE IngestedData
(   
    ID INT IDENTITY(1,1),
    Md5 VARCHAR(15) NOT NULL,   
    EffectiveFrom DATE NOT NULL,    
    EffectiveTo DATE NOT NULL,
    UpdateUser VARCHAR(50),
    JsonData VARCHAR(MAX),
    CONSTRAINT CK_IngestedData_Start_End CHECK (EffectiveFrom < EffectiveTo),
    CONSTRAINT UK_IngestedData_Md5_Start_End UNIQUE(Md5, EffectiveFrom),
    PRIMARY KEY (Id)
);

CREATE NONCLUSTERED INDEX AK_IngestedData_Md5 
   ON IngestedData (Md5); 
CREATE NONCLUSTERED INDEX AK_IngestedData_EffectiveFrom   
   ON IngestedData (EffectiveFrom);
CREATE NONCLUSTERED INDEX AK_IngestedData_EffectiveTo 
   ON IngestedData (EffectiveTo);

I have written an upsert procedure that works well for single row updates as follows:

Upsert procedure:

CREATE PROCEDURE dbo.usp_UpsertIngestedDataRow 
    @Md5 VARCHAR(20),
    @EffectiveFrom DateTime,
    @EffectiveTo DateTime,
    @UpdateUser VARCHAR(50),
    @JsonData VARCHAR(MAX)
AS
BEGIN
    SET NOCOUNT ON;

    BEGIN TRY;
    BEGIN TRANSACTION;
        --Select the data that needs to be modified along with the action to be taken
        WITH NewRow(ID, Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData, [Action]) AS
        (
            SELECT NULL, @Md5, @EffectiveFrom, @EffectiveTo, @UpdateUser, @JsonData, 'I'
        ),
        OverlappingRows(ID, Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData) AS
        (
            SELECT 
                X.ID, X.Md5, X.EffectiveFrom, X.EffectiveTo, X.UpdateUser, X.JsonData 
            FROM 
                NewRow A 
            JOIN 
                IngestedData X ON (X.EffectiveFrom < A.EffectiveTo
                               AND X.EffectiveTo > A.EffectiveFrom)
                               AND A.Md5 = X.Md5
        ),
        NewStartRows(ID, Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData, [Action]) AS
        (
            SELECT 
                s.ID, s.Md5, s.EffectiveFrom, 
                (SELECT DATEADD(DAY, -1, MIN(EffectiveFrom)) 
                 FROM NewRow), 
                s.UpdateUser, s.JsonData, 'I'
            FROM
                OverlappingRows s
            WHERE 
                EffectiveFrom < (SELECT MIN(EffectiveFrom) FROM NewRow)
        ),
        NewEndRows(ID, Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData, [Action]) AS
        (
            SELECT 
                s.ID, s.Md5, 
                (SELECT DATEADD(DAY, 1, MIN(EffectiveTo)) 
                 FROM NewRow), 
                s.EffectiveTo, s.UpdateUser, s.JsonData, 'I'
            FROM  
                OverlappingRows s
            WHERE 
                EffectiveTo > (SELECT MAX(EffectiveTo) FROM NewRow)
        ),
        DeleteRows(ID, Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData, [Action]) AS
        (
            SELECT 
                del.ID, del.Md5, del.EffectiveFrom, del.EffectiveTo, 
                del.UpdateUser, del.JsonData, 'D'
            FROM
                OverlappingRows del 
            INNER JOIN 
                NewRow n ON n.EffectiveFrom <= del.EffectiveFrom 
                         AND n.EffectiveTo >= del.EffectiveTo
        )
        SELECT *
        INTO #Temp
        FROM
            (SELECT * FROM NewRow
             UNION
             SELECT * FROM NewStartRows
             UNION
             SELECT * FROM NewEndRows
             UNION
             SELECT * FROM DeleteRows) AS Data;

        --Delete any rows that are being replaced
        DELETE FROM IngestedData WHERE ID IN (SELECT DISTINCT ID FROM #Temp)

        --Insert the replacement
        INSERT INTO IngestedData(Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData) 
        SELECT Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData
        FROM #Temp
        WHERE [Action] = 'I'

        --Drop temp table
        IF  OBJECT_ID('tempdb.dbo.#Temp', 'U') IS NOT NULL
            DROP TABLE #Temp

    COMMIT;
    END TRY
    BEGIN CATCH
        ROLLBACK;
        THROW;
    END CATCH


END
GO

The individual call performance is good with a call taking around 7ms even when the table is populated with 10,000,000 records. The issue is making large number of updates. Executing the above stored procedure for 35,000 records via a cursor takes around 5 minutes.

I tried rewriting the procedure to take a table variable which would allow the DML to use set operations but got lost in the logic. Can anyone help to turn the above logic in to a set based update that follows this pattern:

New stored procedure:

CREATE PROCEDURE usp_BulkUpsertIngestedData 
    @UpdateUser VARCHAR(15), 
    @NewRows DataIngestionRecord READONLY
AS
BEGIN

Type definition

CREATE TYPE DataIngestionRecord AS TABLE
            (
                Md5 VARCHAR(15) NOT NULL,   
                EffectiveFrom DATE NOT NULL,    
                EffectiveTo DATE NOT NULL,
                JsonData VARCHAR(MAX)
            )

Solution

  • Having tried both disabling then rebuilding the indexes and removing the CTE in the procedure I found that performance was really not improved at all when using row by row updates.

    I took another tack and decided that I could limit the upsert use case by specifying that in any given update each unique Md5 could only have one new temporal range applied. This simplified the logic required to convert the stored procedure in to a set based operation (and fits with our requirements).

    I did take @Tanner's advice and remove the chained CTE from the stored procedure. The final stored procedure ended up as:

    CREATE PROCEDURE dbo.usp_UpsertIngestedDataSet 
        @NewRows DataIngestionRecord READONLY,
        @UpdateUser VARCHAR(15)
    AS
    BEGIN
    
        SET NOCOUNT ON;
    
        --Ensure that there are not multiple temporal regions in the update data for a given key
        SELECT Md5
        INTO #Duplicates
        FROM @NewRows
        GROUP BY Md5
        HAVING COUNT(*) > 1;
    
        IF(@@ROWCOUNT > 0) BEGIN
            DECLARE @Err VARCHAR(MAX)
            SELECT @Err = COALESCE(@Err + CHAR(13), '') + Md5
            FROM #Duplicates
            ORDER BY Md5;
    
            SET @Err = 'The following Md5 values have multiple temporal ranges in the uploaded data which is not supported: ' + char(13) + @Err;
    
            THROW 50002, @Err, 1;
        END
    
        --Determine all overlapping rows from the existing data set
        SELECT id.ID, id.Md5, id.EffectiveFrom, id.EffectiveTo, id.UpdateUser, id.JsonData
        INTO #OverlappingRecords
        FROM IngestedData id JOIN @NewRows nr ON 
            id.Md5 = nr.Md5 AND
            (id.EffectiveFrom < nr.EffectiveTo 
            AND id.EffectiveTo > nr.EffectiveFrom)
    
        --Calculate truncation of left overlapping rows
        SELECT ol.Id,ol.Md5, ol.EffectiveFrom, DATEADD(DAY,-1, nr.EffectiveFrom) AS EffectiveTo, 'U' AS Action
        INTO #Changes
        FROM #OverlappingRecords ol JOIN @NewRows nr ON 
            ol.Md5 = nr.Md5 
            AND ol.EffectiveFrom < nr.EffectiveFrom
    
        --Calculate truncation of right overlapping rows
        INSERT INTO #Changes
        SELECT ol.ID, ol.Md5, DATEADD(DAY,1,nr.EffectiveTo), ol.EffectiveTo, 'U'
        FROM #OverlappingRecords ol JOIN @NewRows nr ON
            ol.Md5 = nr.Md5 
            AND ol.EffectiveTo > nr.EffectiveTo
            AND ol.EffectiveFrom > nr.EffectiveFrom;
    
        --If any area overlaps both the left and right of a new region we need a new insert for the right overlap
        SELECT ol.ID, ol.Md5, DATEADD(DAY,1,nr.EffectiveTo) AS EffectiveFrom, ol.EffectiveTo, 'I' AS [Action]
        INTO #InsertRecords
        FROM #OverlappingRecords ol JOIN @NewRows nr ON
            ol.Md5 = nr.Md5 
            AND ol.EffectiveTo > nr.EffectiveTo
            AND ol.EffectiveFrom < nr.EffectiveFrom;
    
        BEGIN TRANSACTION;
    
        --Delete all overwritten regions (i.e. existing temporal ranges that are completely replaced by a new range)
        DELETE FROM IngestedData 
        WHERE ID IN (SELECT ol.ID
                     FROM #OverlappingRecords ol JOIN @NewRows nr ON 
                        ol.Md5 = nr.Md5 
                        AND nr.EffectiveFrom <= ol.EffectiveFrom 
                        AND nr.EffectiveTo >= ol.EffectiveTo);
    
        --Insert New Data (both from uploaded data and from existing region splits)
        INSERT INTO IngestedData (Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData)
        SELECT Md5, EffectiveFrom, EffectiveTo, 'user2', JsonData
        FROM @NewRows
        UNION
        SELECT id.Md5,ir.EffectiveFrom, ir.EffectiveTo,id.UpdateUser,id.JsonData
        FROM IngestedData id JOIN #InsertRecords ir
        ON id.ID = ir.ID AND ir.[Action] = 'I';
    
        --Update truncated rows
        Update id
        SET EffectiveFrom = u.EffectiveFrom, EffectiveTo = u.EffectiveTo
        FROM IngestedData id JOIN #Changes u ON id.ID = u.ID AND u.[Action] = 'U';
    
        COMMIT;
    
    END
    GO
    

    Translating this code to set based logic made the difference and this version now completes 20,000 updates against 1,000,000 rows of data in a trifling 7370ms.