Search code examples
python · pandas · dataframe · duplicates · concatenation

Concatenate dataframes and remove duplicates based on multiple columns value


I want to concatenate two dataframes that have the same columns and remove duplicates.

I then want to remove duplicates of the new dataframe based on multiple key columns.

For example I have these two dataframes

import pandas as pd


    # Build the two sample frames column-by-column from plain dicts.
    # All values are strings, exactly as in the question.
    data1 = {
        "first_column": ["1", "2", "2", "2"],
        "second_column": ["1", "2", "2", "2"],
        "key_column1": ["1", "2", "2", "6"],
        "key_column2": ["1", "2", "2", "1"],
        "fourth_column": ["1", "2", "2", "2"],
    }
    df1 = pd.DataFrame(data1)

    data2 = {
        "first_column": ["1", "2", "2", "2"],
        "second_column": ["1", "2", "2", "2"],
        "key_column1": ["1", "3", "2", "6"],
        "key_column2": ["1", "5", "2", "2"],
        "fourth_column": ["1", "2", "2", "2"],
    }
    df2 = pd.DataFrame(data2)

Expected output:

    ```
      first_column second_column key_column1 key_column2 fourth_column
    0            1             1           1           1             1
    1            2             2           2           2             2
    2            2             2           6           1             2
    3            2             2           3           5             2
    4            2             2           6           2             2
    ```


Solution

  • Use pandas.concat to concatenate a list of dataframes. Then, use pandas.DataFrame.drop_duplicates() to drop the duplicate records. The pandas.DataFrame.drop_duplicates() function has a parameter called subset that you can use to determine which columns to include in the duplicates search. Here's how to do it, using the example you gave:

    
    import pandas as pd

    # The five shared columns; every value is a string, as in the question.
    columns = ["first_column", "second_column", "key_column1", "key_column2", "fourth_column"]

    df1 = pd.DataFrame(
        [
            ["1", "1", "1", "1", "1"],
            ["2", "2", "2", "2", "2"],
            ["2", "2", "2", "2", "2"],
            ["2", "2", "6", "1", "2"],
        ],
        columns=columns,
    )

    df2 = pd.DataFrame(
        [
            ["1", "1", "1", "1", "1"],
            ["2", "2", "3", "5", "2"],
            ["2", "2", "2", "2", "2"],
            ["2", "2", "6", "2", "2"],
        ],
        columns=columns,
    )

    # Stack the two frames, keep only the first occurrence of each
    # (key_column1, key_column2) pair, then renumber the index from zero.
    df_combined = pd.concat([df1, df2])
    df_combined = df_combined.drop_duplicates(subset=["key_column1", "key_column2"], keep="first")
    df_combined = df_combined.reset_index(drop=True)
    df_combined
    
    

    The above code outputs the following:

        ```
          first_column second_column key_column1 key_column2 fourth_column
        0            1             1           1           1             1
        1            2             2           2           2             2
        2            2             2           6           1             2
        3            2             2           3           5             2
        4            2             2           6           2             2
        ```

    If you want to wrap it all inside a function, here's how you might consider doing it:

    from __future__ import annotations
    from typing import List
    import pandas as pd
    
    
    def concat_no_dups(*dfs: pd.DataFrame, key_columns: List[str] | str | None = None) -> pd.DataFrame:
        """
        Concatenate dataframes together and drop duplicates based on equality of
        selected columns.

        Parameters
        ----------
        dfs : pd.DataFrame
            A sequence of dataframes.
        key_columns : List[str] | str | None, optional
            The columns to use as keys to drop duplicates.
            If no list, or column provided, it uses all the columns
            from the concatenated dataframe, to search for duplicates.

        Returns
        -------
        df_combined : DataFrame
            The concatenated dataframe with duplicate rows removed and the
            index renumbered from zero.

        Raises
        ------
        KeyError
            If one or more keys from `key_columns` do not exist in the dataframe,
            obtained after combining all `dfs` together.
        ValueError
            If `dfs` is empty (propagated from ``pandas.concat``).

        Examples
        --------
        >>> import pandas as pd
        >>> data1 = pd.DataFrame(
        ...     [['a', 12], ['a', 15], ['b', 19], ['c', 18]],
        ...     columns=['key', 'value']
        ... )
        >>> data2 = pd.DataFrame(
        ...     [['a', 16], ['a', 10], ['b', 13], ['e', 19]],
        ...     columns=['key', 'value']
        ... )
        >>> concat_no_dups(data1, data2, key_columns='key')
          key  value
        0   a     12
        1   b     19
        2   c     18
        3   e     19
        >>> concat_no_dups(data1, data2, key_columns=['key', 'value'])
          key  value
        0   a     12
        1   a     15
        2   b     19
        3   c     18
        4   a     16
        5   a     10
        6   b     13
        7   e     19
        >>> concat_no_dups(data1, data2)
          key  value
        0   a     12
        1   a     15
        2   b     19
        3   c     18
        4   a     16
        5   a     10
        6   b     13
        7   e     19
        >>> concat_no_dups(data1, data2, data1)
          key  value
        0   a     12
        1   a     15
        2   b     19
        3   c     18
        4   a     16
        5   a     10
        6   b     13
        7   e     19
        """
        # pd.concat accepts any iterable of frames directly; wrapping `dfs`
        # in a list comprehension was a needless copy.
        df_combined = pd.concat(dfs)
        if key_columns is None:
            # No keys given: compare entire rows.
            key_columns = list(df_combined.columns)
        elif not hasattr(key_columns, '__iter__') or isinstance(key_columns, str):
            # A single column name (or any scalar) becomes a one-element list.
            key_columns = [key_columns]
        # Validate the keys up front so the caller gets a clear error message
        # instead of pandas' generic one.
        missing_keys = pd.Index(key_columns).difference(df_combined.columns)
        if not missing_keys.empty:
            raise KeyError(
                f"The following key columns weren't found: {list(missing_keys)}. "
                f"Available columns: {list(df_combined.columns.astype(str))}."
            )
        # keep='first' is drop_duplicates' default: the first occurrence wins.
        return df_combined.drop_duplicates(subset=key_columns).reset_index(drop=True)