Search code examples
pythonpandaspandas-groupby

Group by from wide form in Pandas


I have a DataFrame like this one:

customer_type   age    satisfaction    design  food    wifi    service    distance
        Loyal    28   Not Satisfied         0    1        2          2        13.5
        Loyal    55       Satisfied         5    3        5          4        34.2
     Disloyal    36   Not Satisfied         2    0        2          4        55.8
     Disloyal    28   Not Satisfied         3    1        2          2        13.5
     Disloyal    33   Not Satisfied         2    1        2          2        13.5
     Disloyal    35   Not Satisfied         2    1        2          2        13.5
     Disloyal    39   Not Satisfied         1    1        2          2        13.5
     Disloyal    31   Not Satisfied         2    1        2          2        13.5
        Loyal    28   Not Satisfied         0    1        2          2        13.5
     Disloyal    31   Not Satisfied         2    1        2          2        13.5
     Disloyal    40   Not Satisfied         2    1        2          2        13.5
     Disloyal    35   Not Satisfied         2    1        2          2        13.5
     Disloyal    35   Not Satisfied         2    2        2          2        13.5

I want to find out the characteristics of the Disloyal and Not Satisfied customers that are between 30 and 40 years old, grouping them by the service they have rated:

 service   ratings_count    age   age_count   population_pct
  design               8    40    1           7.69
                            36    1           7.69
                            35    3           23.07
                            33    1           7.69
                            31    2           15.38
    food               1    35    1           7.69

I suspect I have to use melt but I can't figure out how to groupby from there.


Solution

  • With the following toy dataframe, inspired by yours but a bit more heterogeneous:

    import pandas as pd
    
    df = pd.DataFrame(
        {
            "customer_type": [
                "Loyal",
                "Loyal",
                "Disloyal",
                "Disloyal",
                "Loyal",
                "Disloyal",
                "Disloyal",
                "Disloyal",
                "Loyal",
                "Disloyal",
                "Disloyal",
                "Loyal",
                "Disloyal",
            ],
            "age": [28, 55, 27, 31, 42, 35, 39, 31, 28, 51, 40, 35, 35],
            "satisfaction": [
                "Not Satisfied",
                "Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Satisfied",
                "Not Satisfied",
                "Not Satisfied",
                "Satisfied",
            ],
            "design": [0, 5, 2, 4, 2, 2, 1, 0, 1, 2, 0, 2, 1],
            "food": [1, 3, 0, 0, 1, 0, 1, 0, 1, 0, 1, 3, 2],
            "wifi": [2, 5, 2, 1, 2, 2, 0, 0, 1, 2, 2, 0, 2],
            "service": [2, 4, 1, 2, 2, 2, 0, 2, 3, 0, 1, 2, 2],
            "distance": [
                13.5,
                34.2,
                55.8,
                21.2,
                23.5,
                13.5,
                9.0,
                13.5,
                13.5,
                22.7,
                13.5,
                13.5,
                18.4,
            ],
        }
    )
    

    Here is one less subtel, although easier, way to do it:

    # Setup
    amenities = ["design", "food", "wifi", "service"]
    dfs = []
    new_df = df[
        df["age"].between(30, 40)
        & (df["customer_type"] == "Disloyal")
        & (df["satisfaction"] == "Not Satisfied")
    ]
    
    # Iterate on each amenity, groupby age and add other columns/values
    for amenity in amenities:
        temp_df = (
            new_df[new_df[amenity] > 0]
            .groupby("age")
            .agg({amenity: "count", "distance": "mean"})
            .reset_index(drop=False)
            .rename(columns={amenity: "age_count", "distance": "distance_mean"})
            .pipe(
                lambda df_: df_.assign(
                    population_pct=(100 * df_["age_count"] / df_["age_count"].sum()).round(
                        2
                    )
                )
            )
        )
        temp_df.loc[0, "amenities"] = amenity
        temp_df.loc[0, "ratings_count"] = temp_df["age_count"].sum()
        temp_df = pd.concat(
            [
                temp_df,
                new_df[(new_df[amenity] != 0)]
                .sort_values(by=["age"])
                .groupby("age")
                .agg({amenity: list})
                .reset_index(drop=True),
            ],
            axis=1,
        )
        temp_df = temp_df.rename(columns={amenity: "ratings"})
        temp_df["ratings"] = temp_df["ratings"].apply(lambda x: x[0] if len(x) == 1 else x)
        dfs.append(temp_df)
    
    # Get final dataframe and cleanup
    new_df = (
        pd.concat(dfs)
        .fillna(method="ffill")
        .reindex(
            columns=[
                "amenities",
                "ratings_count",
                "age",
                "age_count",
                "ratings",
                "distance_mean",
                "population_pct",
            ]
        )
        .astype({"ratings_count": "int"})
        .set_index(["amenities", "ratings_count"])
    )
    

    So that:

    print(new_df)
    # Output
                             age  age_count ratings  distance_mean  population_pct
    amenities ratings_count                                                       
    design    3               31          1       4          21.20           33.33
              3               35          1       2          13.50           33.33
              3               39          1       1           9.00           33.33
    food      2               39          1       1           9.00           50.00
              2               40          1       1          13.50           50.00
    wifi      3               31          1       1          21.20           33.33
              3               35          1       2          13.50           33.33
              3               40          1       2          13.50           33.33
    service   4               31          2  [2, 2]          17.35           50.00
              4               35          1       2          13.50           25.00
              4               40          1       1          13.50           25.00
    

    From here, you can filter ratings_count <=2 like this:

    print(new_df[new_df.index.get_level_values(1) <= 2])
    # Output
                             age  age_count ratings  distance_mean  population_pct
    amenities ratings_count                                                       
    food      2               39          1       1            9.0            50.0
              2               40          1       1           13.5            50.0