Search code examples
sqlsql-serversql-server-2014sql-optimization

Optimizing data load in SQL Server


I have been trying to upload around 18 million records from Oracle to SQL Server. The problem is the loading task which take days to complete even though the data is not so much. I have marked parts of the code relative to their respective time taken to completely execute.

declare @v_sql VARCHAR(MAX)
declare @column_name VARCHAR(MAX) 
declare @column_name2 VARCHAR(MAX) =''
declare @column_name3 VARCHAR(MAX) =''
declare @tablename VARCHAR(MAX) ='TITLE_BLOCK_PARTS' --input parameter
declare @v_sql2 VARCHAR(MAX)=''
declare @temp_table VARCHAR(500) = '##'+@tablename+'_T'
declare @temp_table2    VARCHAR(500)
declare @v_entityid int = 14 --input parameter
declare @vcount INT = 1
declare @v_total_records bigint
DECLARE @CountResults TABLE (CountReturned bigint)
declare @v_total_ei bigint
declare @v_max_ei bigint
declare @v_total_no_col int
declare @col_to_exclude VARCHAR(MAX) = ',ID,CLASS,'
declare @stg_table vARCHAR(500)  = '##'+@tablename+'_TEMP'
declare @v_success int = -1
declare @v_tablename VARCHAR(MAX) = @tablename
declare @itemnumber varchar(500) ='';
declare @rev varchar(500) ='';
declare @v_stg_sql  VARCHAR(MAX) = ''
declare @v_temp_table vARCHAR(500)  = '##'+@tablename+'_T_ORI'
declare @v_sql3 VARCHAR(MAX) = ''
declare @v_countrownum bigint = 1
declare @v_rownum bigint 
DECLARE @v_rownumtab TABLE (ROWCountReturned bigint)

BEGIN

--SET @col_to_exclude =  ''''+REPLACE(@col_to_exclude,',',''',''') +'''' 

--Start: Following code takes maximum 5 to 7 minutes to complete
IF EXISTS (SELECT * FROM sys.tables WHERE name = @v_tablename)
Begin 
set @v_sql = ' DROP TABLE ' + @v_tablename
exec (@v_sql)
END

set @v_sql = 'SELECT *
into ' + @tablename +'
  FROM [ORCLTST]..[AGILE].'+@v_tablename;

exec (@v_sql)

--End: Following code takes maximum 5 to 7 minutes to complete

--Start: Following code takes maximum 4 minutes to complete
IF OBJECT_ID('tempdb..'+@stg_table) IS NOT NULL 
exec ('drop table ' + @stg_table)

SET @v_sql = 'SELECT * 
INTO '+@stg_table+
'
FROM ' + @tablename;

EXEC (@v_sql);

set @v_sql = ''

set @v_total_no_col = (SELECT COUNT(COLUMN_NAME)
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND COLUMN_NAME
NOT IN 
(
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND 
charindex(',' + CAST(COLUMN_NAME as nvarchar(20)) + ',', @col_to_exclude) > 0
)
)


DECLARE get_columns CURSOR FOR  
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND COLUMN_NAME
NOT IN 
(
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @tablename
AND 
charindex(',' + CAST(COLUMN_NAME as nvarchar(20)) + ',', @col_to_exclude) > 0
)

--SELECT COLUMN_NAME
--FROM INFORMATION_SCHEMA.COLUMNS
--WHERE TABLE_NAME = @tablename
--AND COLUMN_NAME NOT IN (@col_to_exclude)
--@tablename
--print @sql3

OPEN get_columns  
FETCH NEXT FROM get_columns INTO @column_name

WHILE @@FETCH_STATUS = 0          
 BEGIN  
--PRINT @column_name        

SET @column_name2 = replace(REPLACE(REPLACE(LTRIM(RTRIM(REPLACE(@column_name,'"',''))),')',''),'(',''),' ','')

--replace(@column_name2
--print 'ALTER TABLE ' + @sql5+ ' DROP constraint ' + @sql3
set @v_sql2+='CAST('+'['+@column_name+']'+' AS VARCHAR(8000))'+ 'AS '+ @column_name2 + ',
'
+
'
'


SET @v_sql3 += 'CAST('+'ISNULL(['+@column_name+'],'''')'+' AS VARCHAR(8000))'+ 'AS '+ @column_name2 + ',
'
+
'
'

SET @column_name3 += @column_name2 + ',
'+

'  
'

FETCH NEXT FROM get_columns INTO  @column_name        

      end  
     CLOSE get_columns          
 DEALLOCATE get_columns 
--Print @v_sql2   
set @v_sql2 = left(@v_sql2, len(@v_sql2)-5)

set @v_sql3 = left(@v_sql3, len(@v_sql3)-5)

SET @column_name3 = left(@column_name3, len(@column_name3)-7)

IF OBJECT_ID('tempdb..'+@temp_table) IS NOT NULL 
exec ('drop table ' + @temp_table)

IF OBJECT_ID('tempdb..'+@temp_table+'2') IS NOT NULL 
exec ('drop table ' + @temp_table+'2')

IF OBJECT_ID('tempdb..'+@v_temp_table) IS NOT NULL 
exec ('drop table ' + @v_temp_table)

IF OBJECT_ID('tempdb..'+@v_temp_table+'2') IS NOT NULL 
exec ('drop table ' + @v_temp_table+'2')

SET @v_sql = 'Select
    ROW_NUMBER() 
        OVER (ORDER BY number, REV) AS Row, 
        '
            +
    @v_sql2
    +
    '
    '
    +   


 '   INTO ' + @v_temp_table + ' 
 from 

 '

 +
 @tablename

PRINT @v_sql
EXEC (@V_SQL)



SET @v_sql = 'Select
    ROW_NUMBER() 
        OVER (ORDER BY number, REV) AS Row, 
        '
            +
    @v_sql3
    +
    '
    '
    +   


 '   INTO ' + @v_temp_table+'2' + ' 
 from 

 '

 +
 @v_temp_table

PRINT @v_sql
EXEC (@V_SQL)

SET @v_sql ='
Select  ROW
 ,ColumnName
       , ColumnValue
       INTO ' + @temp_table + ' 
FROM 
(
    Select
    ROW,
    '
    +
    @v_sql3
    +
    '
    '
    +
'

'
+   

 '   from 

 ' 
 +
 @v_temp_table+'2'
 +
 '
 '
 +
 '
 )t  
   UNPIVOT (ColumnValue for ColumnName 
           IN (
           '
           +
           @column_name3
           +
           '

           '
           +

            '   
               )
            )UP
            ';


PRINT @v_sql
--END;
--/*
EXEC (@V_SQL)

set @v_sql = ''

/*set @v_sql = 'with q1 as  (
SELECT t2.Id, t2.displayorder,t2.DisplayName ,T2.[Group],t1.* 
'
 +
 ' 
FROM '+@temp_table+ ' t1
INNER JOIN PLMColumn t2
ON t1.ColumnName = t2.name
where t2.EntityId = ' + cast(@v_entityid as varchar(100))
+
')
,q2 as(
    select      Id, ColumnName, displayorder,[Group], count(*) as c_nnum
    from q1

group by Id,ColumnName,displayorder,[Group])

select t1.id, t1.displayorder, t1.ColumnName, t2.columnvalue
into   '+ @temp_table+'2'+
'
from q2 t1
INNER JOIN '+@temp_table+ ' t2
ON t1.ColumnName = t2.ColumnName
WHERE T1.[Group] <> ''PAGE THREE''

' */

 --

set @temp_table2 = replace(@temp_table+'2', '##','');

IF EXISTS (SELECT * FROM sys.tables WHERE name = @temp_table2)
Begin 
set @v_sql = ' DROP TABLE ' + @temp_table2
exec (@v_sql)
END

set @v_Sql = 'SELECT t1.Row,t2.Id, t2.displayorder,t2.DisplayName ,T2.[Group],t1.ColumnName, t1.ColumnValue
 into   '+ @temp_table2 +
' FROM '+@temp_table+ ' t1
INNER JOIN PLMColumn t2
ON t1.ColumnName = t2.name
where t2.EntityId = ' + cast(@v_entityid as varchar(100)) + '
and T2.[Group] <> ''PAGE THREE'''

print @v_sql
exec(@v_sql)


set @v_sql = ' CREATE INDEX idx_'+@temp_table2+'_row
ON '+ @temp_table2 + ' (row)';

print @v_sql
exec(@v_sql)
--end
--/*
--set @v_sql = 'select COUNT(*) from ' + @tablename
SELECT @v_sql = N'SELECT COUNT(*) FROM' + QUOTENAME(@tablename);

INSERT @CountResults
EXEC(@v_sql)


set @v_total_records = (SELECT CountReturned FROM @CountResults)
print '@v_total_records '+ cast(@v_total_records as varchar(5000)) 

set @v_max_ei = (SELECT IDENT_CURRENT ('PLMENTITYINSTANCE'))
set @v_max_ei = @v_max_ei + 1
--(select ISNULL(MAX (id),0) from PLMEntityInstance)

print '@v_max_ei '+ cast(@v_max_ei as varchar(5000)) 

set @v_total_ei = ISNULL(@v_total_records,0) + @v_max_ei
print '@v_total_ei '+ cast(@v_total_ei as varchar(5000)) 
--END;


IF EXISTS (SELECT * FROM sys.tables WHERE name = 'ITEM_STG')
Begin 
set @v_sql = ' DROP TABLE ITEM_STG'
exec (@v_sql)
END
set @v_sql = 'CREATE TABLE [dbo].[ITEM_STG](
    [INSTANCE_ID] [bigint] NOT NULL,
    [ITEM_NUMBER] [varchar](max) NOT NULL,
    [REV] [varchar](max) NOT NULL)'

exec (@v_sql)

set @v_sql = ''
--END
--/*
WHILE (@vcount <= @v_total_records)
BEGIN

INSERT INTO PLMEntityInstance (EntityId)
VALUES (@v_entityid)


SET @vcount = @vcount + 1
--print 
END 


set @v_sql = ''

set @v_sql = N'with c AS
(select DISTINCT ROW from ' + QUOTENAME(@temp_table2) +')

select count(*) from c

'

INSERT @v_rownumtab
EXEC(@v_sql)


set @v_rownum = (select * from @v_rownumtab)

print @v_rownum

--End: Following code takes maximum 4 minutes to complete

--end
--/*


--Start: Following code takes days to complete
while (@v_max_ei <= @v_total_ei AND @v_countrownum <=@v_rownum)
BEGIN

set @v_sql = ''
--top('+CAST(@v_total_no_col AS varchar(100))+') 
set @v_sql = '

with q2
as(

select 

* from '
+ @temp_table2
 +
 '
 where row = '+ CAST(@v_countrownum AS varchar(5000)) + '

 )
INSERT INTO PLMColumnValue(EntityInstanceId, ColumnID, Value)

select '+ CAST(@v_max_ei AS varchar(5000)) + ' as Entityinstanceid, t.id as column_id, t.columnvalue from q2 t
order by DisplayOrder
'

print @v_countrownum
print @v_sql
exec (@v_sql)


set @itemnumber = (select Value from PLMColumnValue where columnid=(select id from PLMColumn where EntityId = @v_entityid and Name = 'number') and EntityInstanceId =  @v_max_ei)
set @rev = (select Value from PLMColumnValue where columnid=(select id from PLMColumn where EntityId = @v_entityid and Name = 'rev') and EntityInstanceId =  @v_max_ei)

print @itemnumber
print @rev

SET @v_stg_sql = ''
set @v_stg_sql = '
INSERT INTO ITEM_STG (INSTANCE_ID,ITEM_NUMBER,REV)
select ' +  CAST(@v_max_ei AS varchar(5000)) + ' , ' + ''''+@itemnumber+'''' + ' , '  +''''+ @rev+'''' + ''

print @v_stg_sql
exec (@v_stg_sql)


set @itemnumber = '';
set @rev = '';

SET @v_stg_sql = ''
set @v_sql = ''

SET @v_max_ei = @v_max_ei + 1;
set @v_countrownum = @v_countrownum + 1;
END

--End: Following code takes days to complete

END;

The main tables under use are PlmEntityInstance, PlmColumnValue and ITEM_STG. PlmColumnValue table is where all the migrated data is loaded, this table has an EAV model and here the code take ages to load the data. How can I optimize this process? Creating an index on the table where all the data is stored after all the required modifications and before loading (its name is stored in variable @temp_table2) did help a lot but still its not good enough. I initiated loading process yesterday at 2100 local time and its been 15 hours and it has loaded only 149088 records out of 18 million records. Please Help!

If there is not enough information provided in question please let me know.

Please see below wait stats

WaitType    Wait_S  Resource_S  Signal_S    WaitCount   Percentage  AvgWait_S   AvgRes_S    AvgSig_S
CXPACKET    657100.57   636945.69   20154.88    338749067   59.49   0.0019  0.0019  0.0001
PREEMPTIVE_OLEDBOPS 268405.70   268405.70   0.00    24041   24.30   11.1645 11.1645 0.0000
OLEDB   91213.76    91213.76    0.00    34867   8.26    2.6160  2.6160  0.0000
ASYNC_NETWORK_IO    33316.35    32352.43    963.93  4060699 3.02    0.0082  0.0080  0.0002

Sample data is placed on the following link: https://www.dropbox.com/home/Public/Query/

The data in the csv provided in the above link has 8531 records.

You can initialize parameter @temp_table2 with 'TITLE_BLOCK_PARTS'


Solution

  • Replace all WHILE loops with set based INSERT using tally table. For example replace

    declare @vcount INT = 1
    WHILE (@vcount <= @v_total_records)
    BEGIN
    
        INSERT INTO PLMEntityInstance (EntityId)
        VALUES (@v_entityid)
    
        SET @vcount = @vcount + 1
    
    END 
    

    with

    INSERT INTO PLMEntityInstance (EntityId)
    SELECT @v_entityid 
    FROM Tally 
    WHERE N <= @v_total_records
    

    Tally can be created in multiple ways, for example, 20 million rows tally:

    WITH E1(N) AS (
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                    SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                ),                          
    E2(N) AS (SELECT 1 FROM E1 a, E1 b), 
    E4(N) AS (SELECT 1 FROM E2 a, E2 b) --10E+4 or 10,000 rows max
    SELECT TOP 20000000
       IDENTITY(INT,1,1) AS N
    INTO dbo.Tally
    FROM E4 a, E4 b; --100,000,000 rows max
    
    ALTER TABLE dbo.Tally
        ADD CONSTRAINT PK_Tally_N 
            PRIMARY KEY CLUSTERED (N) WITH FILLFACTOR = 100;
    

    EDIT
    Concerning refactoring your last WHILE to set based code. I'm not sure i fully understand what the whole script is doing, but it's too long to comment. So first look at original code of the last loop:

    @v_countrownum = 1
    ...
    @v_max_ei = ...
    while (@v_max_ei <= @v_total_ei AND @v_countrownum <=@v_rownum)
    BEGIN
    ...
    SET @v_max_ei = @v_max_ei + 1;
    set @v_countrownum = @v_countrownum + 1;
    END
    

    Let’s refactor it to iterator query:

    ...
    @v_max_ei = ...
    WITH iterator AS (
        SELECT v_max_ei = @v_max_ei + N - 1
            ,v_countrownum = N
        FROM tally
        WHERE @v_max_ei + N - 1 <= @v_total_ei AND N <= @v_rownum
    )
    SELECT * 
    FROM iterator
    

    Next, your loop unconditionally adds rows to PLMColumnValue and ITEM_STG at every iteration, so we must use LEFT JOIN of iterator and source data tables and take care of NULLs. I see your code defaults missing data to ''. Also I need some test data, which are generated this way using subset of PLMColumn fields from your dataset.

    -- test data generator
    DROP TABLE title_block3;
    GO
    SELECT row = N, id, DisplayOrder
        ,columnvalue = c.NAME + '_val_0' + cast(N AS VARCHAR(3))
    INTO title_block3
    FROM PLMColumn c
    JOIN tally ON tally.N <= 3
    WHERE c.NAME IN ('number','rev' ) OR c.NAME LIKE 'text%'
    
    SELECT * FROM  title_block3;
    

    So finally the script to replace your last WHILE. I use arbitrary variables values instead of computed by the first part of your script. @v_rownum = 3 according to test data.

    declare @temp_table2 varchar(100);
    declare @v_sql nvarchar(max);
    declare @v_max_ei int;
    declare @v_entityid int;
    declare @v_total_ei int;
    declare @v_rownum int;
    
    set @temp_table2 = 'title_block3';  -- see generator
    
    set @v_max_ei = 120000;
    set @v_total_ei = 200000;
    set @v_rownum = 3;
    set @v_entityid = 14;
    
    --Start: Following code takes days to complete
    
    set @v_sql = 
    'WITH iterator AS (
        SELECT v_max_ei = '+ CAST(@v_max_ei AS varchar(5000)) + ' + N - 1
            ,v_countrownum = N
        FROM tally
        WHERE '+ CAST(@v_max_ei AS varchar(5000)) + ' + N - 1 <= '+ CAST(@v_total_ei AS varchar(5000)) + 
        ' AND N <= '+ CAST(@v_rownum AS varchar(5000)) + '
    )
    INSERT INTO PLMColumnValue(EntityInstanceId, ColumnID, Value)
    SELECT i.v_max_ei, isnull(t.id,''''), isnull(t.columnvalue,'''') 
    FROM iterator i
    LEFT JOIN ' + @temp_table2 + ' t ON t.row = i.v_countrownum
    ORDER BY i.v_countrownum, t.DisplayOrder ';
    
    print @v_sql;
    exec (@v_sql);
    
    WITH iterator AS (
        SELECT v_max_ei = @v_max_ei + N - 1
            ,v_countrownum = N
        FROM tally
        WHERE @v_max_ei + N - 1 <= @v_total_ei AND N <= @v_rownum
    )
    INSERT INTO ITEM_STG (INSTANCE_ID, ITEM_NUMBER, REV)
    SELECT i.v_max_ei
        -- unpivot 
        , isnull(max(CASE c.Name WHEN 'number' THEN Value END),'')
        , isnull(max(CASE c.Name WHEN 'rev' THEN Value END),'')
    FROM iterator i
    LEFT JOIN PLMColumnValue cv ON cv.EntityInstanceId = i.v_max_ei
    LEFT JOIN PLMColumn c ON cv.columnid = c.id 
        AND c.EntityId = @v_entityid 
        AND c.Name IN ( 'number' , 'rev')
    GROUP BY i.v_max_ei;
    
    -- check it
    
    SELECT * FROM ITEM_STG;
    SELECT * FROM PLMColumnValue;