Search code examples
functionpowerbiduplicatespowerquery

Function to get duplicates info for given column names


I need a function in PowerQuery to get additional columns for duplicated data (not just keep/remove duplicates).

Example:

enter image description here

For the given table, I want to get the following info for duplicated columns set {"Date", "Product", "Color"}:

  1. DuplInfo.MinRowId - basically, Id of the first occurrence of data
  2. DuplInfo.nDupl - duplicates counter within MinRowId group

NB! For non duplicates it should return null values


Solution

  • Please try following function (download):

    Function call Example:

    tfnAddDuplicatesInfo2(Source,{"Product","Color","Date"},"DuplInfo" ,"RowId")
    

    Function Arguments:

             srcTable    as           table, // input Table
             inGroupBy   as           list,  // List of ColumnNames to search duplicates 
             outDuplInfo as           text,  // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
    optional inRowId     as nullable  text   // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns
    

    Function body:

    let 
        func = (
                     srcTable    as           table, // input Table
                     inGroupBy   as           list,  // List of ColumnNames to search duplicates 
                     outDuplInfo as           text,  // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
            optional inRowId     as nullable  text   // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns
        ) =>
        let 
            Source = srcTable,
                
            // // To test as script
            // inGroupBy        = {"Product", "Color","Date"},
            // outDuplInfo      = "DuplInfo",        
            // inRowId          = "RowId", // null, "RowId",      
            
            //> == Variables ===================================================
            Columns2Expand = List.Combine({List.Difference(Table.ColumnNames(Source),inGroupBy),{"__outDuplCounter__"}}),
            srcType    = Value.Type(Source),       
            srcTypeRow= 
                Type.ForRecord(
                    Record.Combine( 
                        {
                            Type.RecordFields(Type.TableRow(srcType)),
                            Type.RecordFields(type [__outDuplCounter__= Int64.Type])
                        }
                    ),
                    false
                ),
            RowIdType  = if inRowId<>null then Type.TableColumn(srcType,inRowId) else Any.Type, // Stores Column Typename        
            //< == Variables ===================================================
        
            #"Grouped Rows" = Table.Group(
                Source, 
                inGroupBy, 
                    {
                        {"__tmpCount__"     , each Table.RowCount(_), Int64.Type},
                        {"__MinGroupRowId__", each if  inRowId<> null then List.Min(    Record.Field(_,inRowId) ) else null, RowIdType},
                        {"__AllRows__"      , each Table.AddIndexColumn(_, "__outDuplCounter__", 0, 1, Int64.Type), type table srcTypeRow}
                    }                                           
            ),
            #"Expanded __AllRows__" = Table.ExpandTableColumn(#"Grouped Rows", "__AllRows__", Columns2Expand),
            nulls4MinRowId = Table.ReplaceValue(#"Expanded __AllRows__",each [__tmpCount__]<=1, null,     
            (currentValue, isConditionTrue, replacementValue) => if isConditionTrue then null else currentValue, // Replace.Value function
            if inRowId<>null then {"__MinGroupRowId__","__outDuplCounter__"} else {"__outDuplCounter__"}
            ),
            
            Add_outDuplInfo = 
                if inRowId<> null then
                    Table.AddColumn(nulls4MinRowId, outDuplInfo, 
                        each 
                            if [__outDuplCounter__]=null 
                            then null 
                            else [MinRowId=[__MinGroupRowId__], nDupl = [__outDuplCounter__]] , 
                        type nullable [MinRowId = RowIdType, nDupl = Int64.Type]
                    )
                else
                     Table.AddColumn(nulls4MinRowId, outDuplInfo, each [__outDuplCounter__], Int64.Type),
                       
            Result_tfnAddDuplMinRowId = Table.SelectColumns(Add_outDuplInfo, List.Combine({Table.ColumnNames(Source),{outDuplInfo}}))
        in
            Result_tfnAddDuplMinRowId,
    
        documentation = [
            Documentation.Name =  " tfnAddDuplicatesInfo2 ",
            Documentation.Description     = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
            Documentation.LongDescription = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
            Documentation.Category = " Running Total ",
            Documentation.Source = "  ",
            Documentation.Version = " 1.0 ",
            Documentation.Author = " Denis Sipchenko ",
            Documentation.Examples = {
                [
                    Description =  "tfnAddDuplicatesInfo2 arguments: ",
                    Code = "
             srcTable    as           table, // input Table
             inGroupBy   as           list,  // List of ColumnNames to search duplicates 
             outDuplInfo as           text,  // Output ColumnName for Information about Duplicates - Duplicate number and Minimal RowId (if inRowId provided) within a group
    optional inRowId     as nullable  text   // RowId ColumnName - required for outMinRowId calculation for inGroupBy columns",
                    Result =""
                ],
                [
                    Description =  "tfnAddDuplicatesInfo2 function call example  ",
                    Code = "
    let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93+w8n2/uZWwBml07NfwVTIS+nN+PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw+x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox+UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
    in
        tfnAddDuplicatesInfo2(Source,{""Product"",""Color"",""Date""},""DuplInfo"" ,""RowId"")
    ",
                    Result = "Adds to Source table ""DuplInfo"" column with records:
      ""MinRowId"" - Minimal RowId within within given group,
      ""nDupl""    - given group Occurence Number
    "
                ],
                [
                    Description =  "tfnAddDuplicatesInfo2 function short call example  ",
                    Code = "
    let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93+w8n2/uZWwBml07NfwVTIS+nN+PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw+x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox+UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
    in
        tfnAddDuplicatesInfo2(Source,{""Product"",""Color"",""Date""},""nDupl"")
    ",
                    Result = "Adds to Source table one column:
      ""nDupl""    - given group Occurence Number
    "
                ]
            }
        ]
    in  
        Value.ReplaceType(func, Value.ReplaceMetadata(Value.Type(func), documentation))
    

    P.S. Idea about group & expand index column borrowed from horseyride post.

    P.S.S. Initially, I took as a source Running Total by Category by Rick de Groot. And than reworked it.