I need a function in PowerQuery to get additional columns for duplicated data (not just keep/remove duplicates).
Example:
For the given table, I want to get the following info for duplicated columns set {"Date", "Product", "Color"}
:
DuplInfo.MinRowId
- basically, Id of the first occurrence of dataDuplInfo.nDupl
- duplicates counter within MinRowId groupNB! For non duplicates it should return null
values
Please try following function (download):
Function call Example:
tfnAddDuplicatesInfo2(Source,{"Product","Color","Date"},"DuplInfo" ,"RowId")
Function Arguments:
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
outDuplInfo as text, // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
optional inRowId as nullable text // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns
Function body:
let
func = (
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
outDuplInfo as text, // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
optional inRowId as nullable text // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns
) =>
let
Source = srcTable,
// // To test as script
// inGroupBy = {"Product", "Color","Date"},
// outDuplInfo = "DuplInfo",
// inRowId = "RowId", // null, "RowId",
//> == Variables ===================================================
Columns2Expand = List.Combine({List.Difference(Table.ColumnNames(Source),inGroupBy),{"__outDuplCounter__"}}),
srcType = Value.Type(Source),
srcTypeRow=
Type.ForRecord(
Record.Combine(
{
Type.RecordFields(Type.TableRow(srcType)),
Type.RecordFields(type [__outDuplCounter__= Int64.Type])
}
),
false
),
RowIdType = if inRowId<>null then Type.TableColumn(srcType,inRowId) else Any.Type, // Stores Column Typename
//< == Variables ===================================================
#"Grouped Rows" = Table.Group(
Source,
inGroupBy,
{
{"__tmpCount__" , each Table.RowCount(_), Int64.Type},
{"__MinGroupRowId__", each if inRowId<> null then List.Min( Record.Field(_,inRowId) ) else null, RowIdType},
{"__AllRows__" , each Table.AddIndexColumn(_, "__outDuplCounter__", 0, 1, Int64.Type), type table srcTypeRow}
}
),
#"Expanded __AllRows__" = Table.ExpandTableColumn(#"Grouped Rows", "__AllRows__", Columns2Expand),
nulls4MinRowId = Table.ReplaceValue(#"Expanded __AllRows__",each [__tmpCount__]<=1, null,
(currentValue, isConditionTrue, replacementValue) => if isConditionTrue then null else currentValue, // Replace.Value function
if inRowId<>null then {"__MinGroupRowId__","__outDuplCounter__"} else {"__outDuplCounter__"}
),
Add_outDuplInfo =
if inRowId<> null then
Table.AddColumn(nulls4MinRowId, outDuplInfo,
each
if [__outDuplCounter__]=null
then null
else [MinRowId=[__MinGroupRowId__], nDupl = [__outDuplCounter__]] ,
type nullable [MinRowId = RowIdType, nDupl = Int64.Type]
)
else
Table.AddColumn(nulls4MinRowId, outDuplInfo, each [__outDuplCounter__], Int64.Type),
Result_tfnAddDuplMinRowId = Table.SelectColumns(Add_outDuplInfo, List.Combine({Table.ColumnNames(Source),{outDuplInfo}}))
in
Result_tfnAddDuplMinRowId,
documentation = [
Documentation.Name = " tfnAddDuplicatesInfo2 ",
Documentation.Description = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
Documentation.LongDescription = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
Documentation.Category = " Running Total ",
Documentation.Source = " ",
Documentation.Version = " 1.0 ",
Documentation.Author = " Denis Sipchenko ",
Documentation.Examples = {
[
Description = "tfnAddDuplicatesInfo2 arguments: ",
Code = "
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
outDuplInfo as text, // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
optional inRowId as nullable text // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns",
Result =""
],
[
Description = "tfnAddDuplicatesInfo2 function call example ",
Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93+w8n2/uZWwBml07NfwVTIS+nN+PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw+x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox+UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
tfnAddDuplicatesInfo2(Source,{""Product"",""Color"",""Date""},""DuplInfo"" ,""RowId"")
",
Result = "Adds to Source table ""DuplInfo"" column with records:
""MinRowId"" - Minimal RowId within within given group,
""nDupl"" - given group Occurence Number
"
],
[
Description = "tfnAddDuplicatesInfo2 function short call example ",
Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93+w8n2/uZWwBml07NfwVTIS+nN+PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw+x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox+UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
tfnAddDuplicatesInfo2(Source,{""Product"",""Color"",""Date""},""nDupl"")
",
Result = "Adds to Source table one column:
""nDupl"" - given group Occurence Number
"
]
}
]
in
Value.ReplaceType(func, Value.ReplaceMetadata(Value.Type(func), documentation))
P.S. Idea about group & expand index column borrowed from horseyride post.
P.S.S. Initially, I took as a source Running Total by Category by Rick de Groot. And than reworked it.