Search code examples
python pandas missing-data imputation

Unable to impute missing numerical values


I want to impute missing values for both numerical and nominal columns. My code for finding missing numerical values did not return anything, even though one of the columns, `HDI for year`, actually has null values. What is wrong with my code?

import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from pandas.api.types import is_numeric_dtype

df.head()
country year    sex age suicides_no population  suicides/100k pop   country-year    HDI for year    gdp_for_year ($)    gdp_per_capita ($)  generation  year_label  sex_label   age_label   generation_label    is_duplicate
0   Albania 1987    male    15-24 years 21  312900  6.71    Albania1987 NaN 2156624900  796 Generation X    2   1   0   2   False
1   Albania 1987    male    35-54 years 16  308000  5.19    Albania1987 NaN 2156624900  796 Silent  2   1   2   5   False
2   Albania 1987    female  15-24 years 14  289700  4.83    Albania1987 NaN 2156624900  796 Generation X    2   0   0   2   False
3   Albania 1987    male    75+ years   1   21800   4.59    Albania1987 NaN 2156624900  796 G.I. Generation 2   1   5   1   False
4   Albania 1987    male    25-34 years 9   274300  3.28    Albania1987 NaN 2156624900  796 Boomers 2   1   1   0   False

The problematic code

# Check if we have NaN numerical values
# NOTE(review): is_numeric_dtype is given the whole DataFrame here rather than
# a single column or dtype; it presumably evaluates to False for a frame, in
# which case nothing in this branch runs -- confirm against the pandas docs.
if is_numeric_dtype(df) is True:
    # df.isnull().any() yields one boolean per column (a Series), not a single
    # bool, so the identity test `is True` can never succeed.
    if df.isnull().any() is True:
        # Mean values of columns
        # NOTE(review): `df.column` looks up an attribute/column literally
        # named "column"; the frame shown by df.head() has no such column.
        print(f"mean-df[i].column = {np.mean(df.column)}")
        # Impute
        df = df.fillna(df.mean())

The rest of the code that seems fine

# Check for missing nominal data
# This `else` pairs with the `if is_numeric_dtype(df) is True:` test in the
# snippet above, so it runs whenever that test is not satisfied.
else:
    for col in df.columns:
        # Only object-dtype columns are treated as nominal.
        if df[col].dtype == object:
            print(col, df[col].unique())
            # NOTE(review): this f-string has no {} placeholders, so it prints
            # the text literally instead of the column's mode.
            print(f"mode-df[i], df[i].value_counts().index[0]")
            # Replace '?' with mode - value/level with highest frequency in the feature
            # NOTE(review): the replacement value is a quoted string literal,
            # so '?' is replaced by that text rather than by the computed
            # mode; `i` is also never defined in this loop (the loop variable
            # is `col`).
            df[col] = df[col].replace({'?': 'df[i].value_counts().index[0]'})

Desired output for imputation of numerical values.

# Do we have NaN in our dataset?
df.isnull().any()

country               False
year                  False
sex                   False
age                   False
suicides_no           False
population            False
suicides/100k pop     False
country-year          False
HDI for year           True
gdp_for_year ($)      False
gdp_per_capita ($)    False
generation            False
year_label            False
sex_label             False
age_label             False
generation_label      False
is_duplicate          False
dtype: bool

print(f"mean-HDI-for-year= {np.mean(df['HDI for year'])}")

> mean-HDI-for-year= 0.7766011477761785

# Impute
df['HDI for year'] = df['HDI for year'].fillna(df['HDI for year'].mean())

Solution

  • As stated in the comments, there's no point in using a for loop to iterate through your columns. You can just impute your numeric columns and your categorical columns separately, using select_dtypes:

    Assumed DF:

    >>> df
       year  sex   age    income
    0  2020    M  27.0   50000.0
    1  2020    F   NaN       NaN
    2  2020    M  29.0   20000.0
    3  2020    F   NaN       NaN
    4  2020  NaN  23.0   30000.0
    5  2020  NaN  24.0       NaN
    6  2020    M   NaN  100000.0
    
    >>> df.isnull().sum()
    year      0
    sex       2
    age       3
    income    3
    

    Impute your numeric columns:

    >>> df.fillna(df.select_dtypes(include='number').mean(), inplace=True)
       year  sex    age    income
    0  2020    M  27.00   50000.0
    1  2020    F  25.75   50000.0
    2  2020    M  29.00   20000.0
    3  2020    F  25.75   50000.0
    4  2020  NaN  23.00   30000.0
    5  2020  NaN  24.00   50000.0
    6  2020    M  25.75  100000.0
    

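    And then your object columns (filling each with its most frequent value):

    >>> df.fillna(df.select_dtypes(include='object').mode().iloc[0], inplace=True)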
       year sex    age    income
    0  2020   M  27.00   50000.0
    1  2020   F  25.75   50000.0
    2  2020   M  29.00   20000.0
    3  2020   F  25.75   50000.0
    4  2020   M  23.00   30000.0
    5  2020   M  24.00   50000.0
    6  2020   M  25.75  100000.0