The following steps perform a Full Outer Join, then Append operation on two tables of about 1.5 million records each.
The preview has been refreshing for 5 hours on my workstation with no end in sight. How could I make these steps more efficient?
/*
pfFPTmcumerged -- I performed a Full Outer Join on the common fields
[Hull Number], [Date], [Time] then Expanded all fields of the second
table in the join.
*/
[
Source = Table.NestedJoin(pfFPTenginemcu, {"Hull Number", "Date", "Time"}, pfFPTswbdmcu, {"Hull Number", "Date", "Time"}, "pfFPTswbdmcu", JoinKind.FullOuter),
#"Expanded Ref Step" = Table.ExpandTableColumn(Source, "pfFPTswbdmcu", {"Date", "Time", "Hull Number", "40F_101T01", "40F_201T01", "40F_301T01", "40I_101T01", "40I_102T01", "40I_201T01", "40I_202T01", "40U_101T01", "40U_102T01", "40U_103T01", "40U_201T01", "40U_202T01", "40U_203T01", "40U_301T01", "41AZ016T03", "41BP003T03", "41BP007T03", "41BP008T03", "41BP012T03", "41BS001T01", "41BS010T03", "41BT005T03", "41BT009T03", "41BT011T03", "41BT013T03", "41F_001T01", "41I_001T01", "41P_001T01", "41PF001T01", "41Q_001T01", "41S_001T01", "41U_001T01", "41U_014T03", "42AZ016T03", "42BP003T03", "42BP007T03", "42BP008T03", "42BP012T03", "42BS001T01", "42BS010T03", "42BT005T03", "42BT009T03", "42BT011T03", "42BT013T03", "42F_001T01", "42I_001T01", "42P_001T01", "42PF001T01", "42Q_001T01", "42S_001T01", "42U_001T01", "42U_014T03", "43BS001T01", "43F_001T01", "43I_001T01", "43P_001T01", "43PF001T01", "43Q_001T01", "43S_001T01", "43U_001T01"}, {"Date.1", "Time.1", "Hull Number.1", "40F_101T01", "40F_201T01", "40F_301T01", "40I_101T01", "40I_102T01", "40I_201T01", "40I_202T01", "40U_101T01", "40U_102T01", "40U_103T01", "40U_201T01", "40U_202T01", "40U_203T01", "40U_301T01", "41AZ016T03", "41BP003T03", "41BP007T03", "41BP008T03", "41BP012T03", "41BS001T01", "41BS010T03", "41BT005T03", "41BT009T03", "41BT011T03", "41BT013T03", "41F_001T01", "41I_001T01", "41P_001T01", "41PF001T01", "41Q_001T01", "41S_001T01", "41U_001T01", "41U_014T03", "42AZ016T03", "42BP003T03", "42BP007T03", "42BP008T03", "42BP012T03", "42BS001T01", "42BS010T03", "42BT005T03", "42BT009T03", "42BT011T03", "42BT013T03", "42F_001T01", "42I_001T01", "42P_001T01", "42PF001T01", "42Q_001T01", "42S_001T01", "42U_001T01", "42U_014T03", "43BS001T01", "43F_001T01", "43I_001T01", "43P_001T01", "43PF001T01", "43Q_001T01", "43S_001T01", "43U_001T01"})
]
/*
pfFPTmcumergedLeft -- Referencing the full expanded table, I pulled
the "left" part of the table/records where the common fields had a
match between the two original tables by filtering one of the common
fields for <> null. Then just removed the redundant common fields from
the second table.
*/
let
Source = pfFPTmcumerged,
#"Expanded Ref Step" = Source[Expanded Ref Step],
#"Left of Merge" = Table.SelectRows(#"Expanded Ref Step", each [Date] <> null),
#"Remove Redundant SWBD Date Time Hull" = Table.RemoveColumns(#"Left of Merge",{"Date.1", "Time.1", "Hull Number.1"})
in
#"Remove Redundant SWBD Date Time Hull"
/*
pfFPTmcumergedRight -- Referencing the full expanded table, I pulled
the "right" part of the table/records where the common fields
did not match between the two original tables by filtering one of
the common fields for = null. Then kept only the fields of the
original second table (and renamed them to match the names of the
common fields of the "Left" table. Now, I only have one set of the
common fields in the "Left" and "Right" tables.
*/
let
Source = pfFPTmcumerged,
#"Expanded Ref Step" = Source[Expanded Ref Step],
#"Right of Merge" = Table.SelectRows(#"Expanded Ref Step", each [Date] = null),
#"Keep Only SWBD" = Table.SelectColumns(#"Right of Merge",{"Date.1", "Time.1", "Hull Number.1", "40F_101T01", "40F_201T01", "40F_301T01", "40I_101T01", "40I_102T01", "40I_201T01", "40I_202T01", "40U_101T01", "40U_102T01", "40U_103T01", "40U_201T01", "40U_202T01", "40U_203T01", "40U_301T01", "41AZ016T03", "41BP003T03", "41BP007T03", "41BP008T03", "41BP012T03", "41BS001T01", "41BS010T03", "41BT005T03", "41BT009T03", "41BT011T03", "41BT013T03", "41F_001T01", "41I_001T01", "41P_001T01", "41PF001T01", "41Q_001T01", "41S_001T01", "41U_001T01", "41U_014T03", "42AZ016T03", "42BP003T03", "42BP007T03", "42BP008T03", "42BP012T03", "42BS001T01", "42BS010T03", "42BT005T03", "42BT009T03", "42BT011T03", "42BT013T03", "42F_001T01", "42I_001T01", "42P_001T01", "42PF001T01", "42Q_001T01", "42S_001T01", "42U_001T01", "42U_014T03", "43BS001T01", "43F_001T01", "43I_001T01", "43P_001T01", "43PF001T01", "43Q_001T01", "43S_001T01", "43U_001T01"}),
#"Renamed Date Time Hull" = Table.RenameColumns(#"Keep Only SWBD",{{"Date.1", "Date"}, {"Time.1", "Time"}, {"Hull Number.1", "Hull Number"}})
in
#"Renamed Date Time Hull"
/*
fMCUTrend -- Finally, just appended the "Left" and "Right" tables,
one on top of the other.
*/
let
Source = Table.Combine({pfFPTmcumergedLeft, pfFPTmcumergedRight})
in
Source
I didn't try on 1.5m rows. but manipulating lists might be better than manipulating tables:
let
Source = Excel.CurrentWorkbook(),
pfFPTenginemcu = Table.Buffer(Source{[Name = "pfFPTenginemcu"]}[Content]),
pfFPTswbdmcu = Table.Buffer(Source{[Name = "pfFPTswbdmcu"]}[Content]),
// keys in left table
keys1 = List.Zip({List.Buffer(pfFPTenginemcu[Hull Number]), List.Buffer(pfFPTenginemcu[Date]), List.Buffer(pfFPTenginemcu[Time])}),
// keys in right table
keys2 = List.Zip({List.Buffer(pfFPTswbdmcu[Hull Number]), List.Buffer(pfFPTswbdmcu[Date]), List.Buffer(pfFPTswbdmcu[Time])}),
// find the rows in table1 that are not included in table2
indices1 = List.Positions(keys1),
excludedIndices1 = List.Select(indices1, each List.Contains(keys2, keys1{_}) = false),
// combine the "left part" of left table with right table
rows1 = List.Buffer(Table.ToRows(pfFPTenginemcu)),
rows2 = List.Buffer(Table.ToRows(pfFPTswbdmcu)),
excludedRows1 = List.Transform(excludedIndices1, each rows1{_}),
combinedRows = Table.FromRows(excludedRows1 & rows2)
in
combinedRows
Edited, I checked, I think Table.NestedJoin is actually faster than List.XXXX.
So probably the only thing that can be improved is using Tabel.Buffer().
pfFPTenginemcu = Table.Buffer(Source{[Name = "pfFPTenginemcu"]}[Content]),
pfFPTswbdmcu = Table.Buffer(Source{[Name = "pfFPTswbdmcu"]}[Content]),