Search code examples
mysqlsqlmergeteradatateradata-sql-assistant

Migration of Merge statement from teradata to MYSQL


Tables:- schema.INFA_TASK_RUN_STG schema.INFA_TASK_RUN

Primary Index of schema.INFA_TASK_RUN_STG :- SUBJECT_AREA Primary Index of schema.INFA_TASK_RUN :- SUBJECT_ID ,WORKFLOW_ID ,WORKFLOW_RUN_ID ,WORKLET_RUN_ID , INSTANCE_ID ,TASK_ID ,START_TIME

Merge statement in Teradata:-

MERGE INTO schema.INFA_TASK_RUN USING schema.INFA_TASK_RUN_STG src
ON
        INFA_TASK_RUN_RAW.SUBJECT_ID = src.SUBJECT_ID
AND     INFA_TASK_RUN_RAW.WORKFLOW_ID = src.WORKFLOW_ID
AND     INFA_TASK_RUN_RAW.WORKFLOW_RUN_ID = src.WORKFLOW_RUN_ID
AND     INFA_TASK_RUN_RAW.WORKLET_RUN_ID = src.WORKLET_RUN_ID
AND     INFA_TASK_RUN_RAW.INSTANCE_ID = src.INSTANCE_ID
AND     INFA_TASK_RUN_RAW.TASK_ID = src.TASK_ID
AND     INFA_TASK_RUN_RAW.START_TIME = src.START_TIME
WHEN MATCHED THEN UPDATE SET
        END_TIME = src.END_TIME
,       RUN_ERR_CODE = src.RUN_ERR_CODE
,       RUN_ERR_MSG = src.RUN_ERR_MSG
,       RUN_STATUS_CODE = src.RUN_STATUS_CODE
WHEN NOT MATCHED THEN INSERT(
                SUBJECT_AREA
        ,       WORKFLOW_NAME
        ,       VERSION_NUMBER
        ,       SUBJECT_ID
        ,       WORKFLOW_ID
        ,       WORKFLOW_RUN_ID
        ,       WORKLET_RUN_ID
        ,       CHILD_RUN_ID
        ,       INSTANCE_ID
        ,       INSTANCE_NAME
        ,       TASK_ID
        ,       TASK_TYPE_NAME
        ,       TASK_TYPE
        ,       START_TIME
        ,       END_TIME
        ,       RUN_ERR_CODE
        ,       RUN_ERR_MSG
        ,       RUN_STATUS_CODE
        ,       TASK_NAME
        ,       TASK_VERSION_NUMBER
        ,       SERVER_ID
        ,       SERVER_NAME
        )VALUES(
                src.SUBJECT_AREA
        ,       src.WORKFLOW_NAME
        ,       src.VERSION_NUMBER
        ,       src.SUBJECT_ID
        ,       src.WORKFLOW_ID
        ,       src.WORKFLOW_RUN_ID
        ,       src.WORKLET_RUN_ID
        ,       src.CHILD_RUN_ID
        ,       src.INSTANCE_ID
        ,       src.INSTANCE_NAME
        ,       src.TASK_ID
        ,       src.TASK_TYPE_NAME
        ,       src.TASK_TYPE
        ,       src.START_TIME
        ,       src.END_TIME
        ,       src.RUN_ERR_CODE
        ,       src.RUN_ERR_MSG
        ,       src.RUN_STATUS_CODE
        ,       src.TASK_NAME
        ,       src.TASK_VERSION_NUMBER
        ,       src.SERVER_ID
        ,       src.SERVER_NAME
        );

As I Know MYSQL DB doesn't support Merge statement. I am trying update and insert statement. but it doesn't seems to be right.

UPDATE schema.INFA_TASK_RUN tgt INNER JOIN schema.INFA_TASK_RUN_STG src
ON
       tgt.SUBJECT_ID = src.SUBJECT_ID
AND     tgt.WORKFLOW_ID = src.WORKFLOW_ID
AND     tgt.WORKFLOW_RUN_ID = src.WORKFLOW_RUN_ID
AND     tgt.WORKLET_RUN_ID = src.WORKLET_RUN_ID
AND     tgt.INSTANCE_ID = src.INSTANCE_ID
AND     tgt.TASK_ID = src.TASK_ID
AND     tgt.START_TIME = src.START_TIME
 SET
        tgt.END_TIME = src.END_TIME
,       tgt.RUN_ERR_CODE = src.RUN_ERR_CODE
,       tgt.RUN_ERR_MSG = src.RUN_ERR_MSG
,       tgt.RUN_STATUS_CODE = src.RUN_STATUS_CODE;

insert into schema.INFA_TASK_RUN (SUBJECT_AREA         ,       WORKFLOW_NAME         ,       VERSION_NUMBER         ,       SUBJECT_ID         ,       WORKFLOW_ID         ,       WORKFLOW_RUN_ID         ,       WORKLET_RUN_ID         ,       CHILD_RUN_ID         ,       INSTANCE_ID         ,       INSTANCE_NAME         ,       TASK_ID         ,       TASK_TYPE_NAME         ,       TASK_TYPE         ,       START_TIME         ,       END_TIME         ,       RUN_ERR_CODE         ,       RUN_ERR_MSG         ,       RUN_STATUS_CODE         ,       TASK_NAME         ,       TASK_VERSION_NUMBER         ,       SERVER_ID         ,       SERVER_NAME)
    select src.SUBJECT_AREA         ,       src.WORKFLOW_NAME         ,       src.VERSION_NUMBER         ,       src.SUBJECT_ID         ,       src.WORKFLOW_ID         ,       src.WORKFLOW_RUN_ID         ,       src.WORKLET_RUN_ID         ,       src.CHILD_RUN_ID         ,       src.INSTANCE_ID         ,       src.INSTANCE_NAME         ,       src.TASK_ID         ,       src.TASK_TYPE_NAME         ,       src.TASK_TYPE         ,       src.START_TIME         ,       src.END_TIME         ,       src.RUN_ERR_CODE         ,       src.RUN_ERR_MSG         ,       src.RUN_STATUS_CODE         ,       src.TASK_NAME         ,       src.TASK_VERSION_NUMBER         ,       src.SERVER_ID         ,       src.SERVER_NAME
    from schema.INFA_TASK_RUN_STG as src
        left outer join schema.INFA_TASK_RUN as tgt  ON
       tgt.SUBJECT_ID != src.SUBJECT_ID
AND     tgt.WORKFLOW_ID != src.WORKFLOW_ID
AND     tgt.WORKFLOW_RUN_ID != src.WORKFLOW_RUN_ID
AND     tgt.WORKLET_RUN_ID != src.WORKLET_RUN_ID
AND     tgt.INSTANCE_ID != src.INSTANCE_ID
AND     tgt.TASK_ID != src.TASK_ID
AND     tgt.START_TIME != src.START_TIME

Solution

  • Believe what you are looking for is something like this (Not tested and will only work when the primary key is set up correctly):

    INSERT INTO schema.INFA_TASK_RUN (
         SUBJECT_AREA
        ,WORKFLOW_NAME
        ,VERSION_NUMBER
        ,SUBJECT_ID
        ,WORKFLOW_ID
        ,WORKFLOW_RUN_ID
        ,WORKLET_RUN_ID
        ,CHILD_RUN_ID
        ,INSTANCE_ID
        ,INSTANCE_NAME
        ,TASK_ID
        ,TASK_TYPE_NAME
        ,TASK_TYPE
        ,START_TIME
        ,END_TIME
        ,RUN_ERR_CODE
        ,RUN_ERR_MSG
        ,RUN_STATUS_CODE
        ,TASK_NAME
        ,TASK_VERSION_NUMBER
        ,SERVER_ID
        ,SERVER_NAME
        )
    SELECT
         SUBJECT_AREA
        ,WORKFLOW_NAME
        ,VERSION_NUMBER
        ,SUBJECT_ID
        ,WORKFLOW_ID
        ,WORKFLOW_RUN_ID
        ,WORKLET_RUN_ID
        ,CHILD_RUN_ID
        ,INSTANCE_ID
        ,INSTANCE_NAME
        ,TASK_ID
        ,TASK_TYPE_NAME
        ,TASK_TYPE
        ,START_TIME
        ,END_TIME
        ,RUN_ERR_CODE
        ,RUN_ERR_MSG
        ,RUN_STATUS_CODE
        ,TASK_NAME
        ,TASK_VERSION_NUMBER
        ,SERVER_ID
        ,SERVER_NAME
    FROM schema.INFA_TASK_RUN_STG src
    ON DUPLICATE KEY UPDATE
         END_TIME = src.END_TIME
        ,RUN_ERR_CODE = src.RUN_ERR_CODE
        ,RUN_ERR_MSG = src.RUN_ERR_MSG
        ,RUN_STATUS_CODE = src.RUN_STATUS_CODE;
    

    Edit on 2020-05-21 to show separate update and insert statements based on the comments:

    The INSERT ... ON DUPLICATE KEY statement will probably be faster.

    From the comments I tested the original statements to do the insert and update from the question.

    Note your update statement works correctly. Only issue is that every row is updated even if there were no changes.

    Could be possible to add conditions to the join like tgt.END_TIME != src.END_TIME to ensure only changed records are updated.

    Original update query in your question:

    UPDATE schema.INFA_TASK_RUN tgt INNER JOIN schema.INFA_TASK_RUN_STG src
    ON
           tgt.SUBJECT_ID = src.SUBJECT_ID
    AND     tgt.WORKFLOW_ID = src.WORKFLOW_ID
    AND     tgt.WORKFLOW_RUN_ID = src.WORKFLOW_RUN_ID
    AND     tgt.WORKLET_RUN_ID = src.WORKLET_RUN_ID
    AND     tgt.INSTANCE_ID = src.INSTANCE_ID
    AND     tgt.TASK_ID = src.TASK_ID
    AND     tgt.START_TIME = src.START_TIME
     SET
            tgt.END_TIME = src.END_TIME
    ,       tgt.RUN_ERR_CODE = src.RUN_ERR_CODE
    ,       tgt.RUN_ERR_MSG = src.RUN_ERR_MSG
    ,       tgt.RUN_STATUS_CODE = src.RUN_STATUS_CODE;
    

    Updated insert:

    Had to change the insert statement, please see the JOIN is where the columns are equal and we only select where there are no matching value in the target table with the check that the target table column is NULL:

    INSERT INTO schema.INFA_TASK_RUN (
         SUBJECT_AREA
        ,WORKFLOW_NAME
        ,VERSION_NUMBER
        ,SUBJECT_ID
        ,WORKFLOW_ID
        ,WORKFLOW_RUN_ID
        ,WORKLET_RUN_ID
        ,CHILD_RUN_ID
        ,INSTANCE_ID
        ,INSTANCE_NAME
        ,TASK_ID
        ,TASK_TYPE_NAME
        ,TASK_TYPE
        ,START_TIME
        ,END_TIME
        ,RUN_ERR_CODE
        ,RUN_ERR_MSG
        ,RUN_STATUS_CODE
        ,TASK_NAME
        ,TASK_VERSION_NUMBER
        ,SERVER_ID
        ,SERVER_NAME
        )
        select src.SUBJECT_AREA
        ,src.WORKFLOW_NAME
        ,src.VERSION_NUMBER
        ,src.SUBJECT_ID
        ,src.WORKFLOW_ID
        ,src.WORKFLOW_RUN_ID
        ,src.WORKLET_RUN_ID
        ,src.CHILD_RUN_ID
        ,src.INSTANCE_ID
        ,src.INSTANCE_NAME
        ,src.TASK_ID
        ,src.TASK_TYPE_NAME
        ,src.TASK_TYPE
        ,src.START_TIME
        ,src.END_TIME
        ,src.RUN_ERR_CODE
        ,src.RUN_ERR_MSG
        ,src.RUN_STATUS_CODE
        ,src.TASK_NAME
        ,src.TASK_VERSION_NUMBER
        ,src.SERVER_ID
        ,src.SERVER_NAME
        FROM schema.INFA_TASK_RUN as tgt
            RIGHT JOIN schema.INFA_TASK_RUN_STG as src  ON
           tgt.SUBJECT_ID = src.SUBJECT_ID
    AND     tgt.WORKFLOW_ID = src.WORKFLOW_ID
    AND     tgt.WORKFLOW_RUN_ID = src.WORKFLOW_RUN_ID
    AND     tgt.WORKLET_RUN_ID = src.WORKLET_RUN_ID
    AND     tgt.INSTANCE_ID = src.INSTANCE_ID
    AND     tgt.TASK_ID = src.TASK_ID
    AND     tgt.START_TIME = src.START_TIME
    WHERE tgt.SUBJECT_ID IS NULL;