Search code examples
powerbipowerquerypowerbi-desktopm

Tips to optimize a Full Outer Join on matching Name and Date/Time fields


The following steps perform a Full Outer Join, then Append operation on two tables of about 1.5 million records each.

The preview has been refreshing for 5 hours on my workstation with no end in sight. How could I make these steps more efficient?

/* 
pfFPTmcumerged -- I performed a Full Outer Join on the common fields 
[Hull Number], [Date], [Time] then Expanded all fields of the second 
table in the join. 
*/

[
    Source = Table.NestedJoin(pfFPTenginemcu, {"Hull Number", "Date", "Time"}, pfFPTswbdmcu, {"Hull Number", "Date", "Time"}, "pfFPTswbdmcu", JoinKind.FullOuter),
    #"Expanded Ref Step" = Table.ExpandTableColumn(Source, "pfFPTswbdmcu", {"Date", "Time", "Hull Number", "40F_101T01", "40F_201T01", "40F_301T01", "40I_101T01", "40I_102T01", "40I_201T01", "40I_202T01", "40U_101T01", "40U_102T01", "40U_103T01", "40U_201T01", "40U_202T01", "40U_203T01", "40U_301T01", "41AZ016T03", "41BP003T03", "41BP007T03", "41BP008T03", "41BP012T03", "41BS001T01", "41BS010T03", "41BT005T03", "41BT009T03", "41BT011T03", "41BT013T03", "41F_001T01", "41I_001T01", "41P_001T01", "41PF001T01", "41Q_001T01", "41S_001T01", "41U_001T01", "41U_014T03", "42AZ016T03", "42BP003T03", "42BP007T03", "42BP008T03", "42BP012T03", "42BS001T01", "42BS010T03", "42BT005T03", "42BT009T03", "42BT011T03", "42BT013T03", "42F_001T01", "42I_001T01", "42P_001T01", "42PF001T01", "42Q_001T01", "42S_001T01", "42U_001T01", "42U_014T03", "43BS001T01", "43F_001T01", "43I_001T01", "43P_001T01", "43PF001T01", "43Q_001T01", "43S_001T01", "43U_001T01"}, {"Date.1", "Time.1", "Hull Number.1", "40F_101T01", "40F_201T01", "40F_301T01", "40I_101T01", "40I_102T01", "40I_201T01", "40I_202T01", "40U_101T01", "40U_102T01", "40U_103T01", "40U_201T01", "40U_202T01", "40U_203T01", "40U_301T01", "41AZ016T03", "41BP003T03", "41BP007T03", "41BP008T03", "41BP012T03", "41BS001T01", "41BS010T03", "41BT005T03", "41BT009T03", "41BT011T03", "41BT013T03", "41F_001T01", "41I_001T01", "41P_001T01", "41PF001T01", "41Q_001T01", "41S_001T01", "41U_001T01", "41U_014T03", "42AZ016T03", "42BP003T03", "42BP007T03", "42BP008T03", "42BP012T03", "42BS001T01", "42BS010T03", "42BT005T03", "42BT009T03", "42BT011T03", "42BT013T03", "42F_001T01", "42I_001T01", "42P_001T01", "42PF001T01", "42Q_001T01", "42S_001T01", "42U_001T01", "42U_014T03", "43BS001T01", "43F_001T01", "43I_001T01", "43P_001T01", "43PF001T01", "43Q_001T01", "43S_001T01", "43U_001T01"})
]

/*
pfFPTmcumergedLeft -- Referencing the full expanded table, I pulled 
the "left" part of the table/records where the common fields had a 
match between the two original tables by filtering one of the common 
fields for <> null. Then just removed the redundant common fields from 
the second table.
*/

let
    Source = pfFPTmcumerged,
    #"Expanded Ref Step" = Source[Expanded Ref Step],
    #"Left of Merge" = Table.SelectRows(#"Expanded Ref Step", each [Date] <> null),
    #"Remove Redundant SWBD Date Time Hull" = Table.RemoveColumns(#"Left of Merge",{"Date.1", "Time.1", "Hull Number.1"})
in
    #"Remove Redundant SWBD Date Time Hull"

/*
pfFPTmcumergedRight -- Referencing the full expanded table, I pulled 
the "right" part of the table/records where the common fields 
did not match between the two original tables by filtering one of 
the common fields for = null. Then kept only the fields of the 
original second table (and renamed them to match the names of the 
common fields of the "Left" table. Now, I only have one set of the 
common fields in the "Left" and "Right" tables.
*/

let
    Source = pfFPTmcumerged,
    #"Expanded Ref Step" = Source[Expanded Ref Step],
    #"Right of Merge" = Table.SelectRows(#"Expanded Ref Step", each [Date] = null),
    #"Keep Only SWBD" = Table.SelectColumns(#"Right of Merge",{"Date.1", "Time.1", "Hull Number.1", "40F_101T01", "40F_201T01", "40F_301T01", "40I_101T01", "40I_102T01", "40I_201T01", "40I_202T01", "40U_101T01", "40U_102T01", "40U_103T01", "40U_201T01", "40U_202T01", "40U_203T01", "40U_301T01", "41AZ016T03", "41BP003T03", "41BP007T03", "41BP008T03", "41BP012T03", "41BS001T01", "41BS010T03", "41BT005T03", "41BT009T03", "41BT011T03", "41BT013T03", "41F_001T01", "41I_001T01", "41P_001T01", "41PF001T01", "41Q_001T01", "41S_001T01", "41U_001T01", "41U_014T03", "42AZ016T03", "42BP003T03", "42BP007T03", "42BP008T03", "42BP012T03", "42BS001T01", "42BS010T03", "42BT005T03", "42BT009T03", "42BT011T03", "42BT013T03", "42F_001T01", "42I_001T01", "42P_001T01", "42PF001T01", "42Q_001T01", "42S_001T01", "42U_001T01", "42U_014T03", "43BS001T01", "43F_001T01", "43I_001T01", "43P_001T01", "43PF001T01", "43Q_001T01", "43S_001T01", "43U_001T01"}),
    #"Renamed Date Time Hull" = Table.RenameColumns(#"Keep Only SWBD",{{"Date.1", "Date"}, {"Time.1", "Time"}, {"Hull Number.1", "Hull Number"}})
in
    #"Renamed Date Time Hull"

/*
fMCUTrend -- Finally, just appended the "Left" and "Right" tables, 
one on top of the other.
*/

let
    Source = Table.Combine({pfFPTmcumergedLeft, pfFPTmcumergedRight})
in
    Source

Solution

  • I didn't try on 1.5m rows. but manipulating lists might be better than manipulating tables:

    enter image description here

    let
      Source = Excel.CurrentWorkbook(),
      pfFPTenginemcu = Table.Buffer(Source{[Name = "pfFPTenginemcu"]}[Content]),
      pfFPTswbdmcu = Table.Buffer(Source{[Name = "pfFPTswbdmcu"]}[Content]),
      // keys in left table
      keys1 = List.Zip({List.Buffer(pfFPTenginemcu[Hull Number]), List.Buffer(pfFPTenginemcu[Date]), List.Buffer(pfFPTenginemcu[Time])}),
      // keys in right table
      keys2 = List.Zip({List.Buffer(pfFPTswbdmcu[Hull Number]), List.Buffer(pfFPTswbdmcu[Date]), List.Buffer(pfFPTswbdmcu[Time])}),
      // find the rows in table1 that are not included in table2
      indices1 = List.Positions(keys1),
      excludedIndices1 = List.Select(indices1, each List.Contains(keys2, keys1{_}) = false),
      // combine the "left part" of left table with right table
      rows1 = List.Buffer(Table.ToRows(pfFPTenginemcu)),
      rows2 = List.Buffer(Table.ToRows(pfFPTswbdmcu)),
      excludedRows1 = List.Transform(excludedIndices1, each rows1{_}),
      combinedRows = Table.FromRows(excludedRows1 & rows2)
    in
      combinedRows
    

    Edited, I checked, I think Table.NestedJoin is actually faster than List.XXXX.

    So probably the only thing that can be improved is using Tabel.Buffer().

      pfFPTenginemcu = Table.Buffer(Source{[Name = "pfFPTenginemcu"]}[Content]),
      pfFPTswbdmcu = Table.Buffer(Source{[Name = "pfFPTswbdmcu"]}[Content]),