Search code examples
pandas dataframe python-3.6 preprocessor sklearn-pandas

How to decode LabelEncoder encoded column in pandas DataFrame?


I have a dataset on which I was practicing feature engineering by converting categorical (object) columns to numbers, using the following lines of code:

import pandas as pd 
import numpy as np
from sklearn import preprocessing
# Load the training data, using the 'Id' column as the index.
df = pd.read_csv(r'train.csv',index_col='Id')
print(df.shape)
df.head()
# Split the columns into numeric ones and all remaining (non-numeric) ones.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill numeric gaps with the column mean, floored by the `//1`.
df[colsNum] = df[colsNum].fillna(df[colsNum].mean()//1)
# Fill the remaining columns with their mode (first row returned by mode()).
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

# A single encoder instance is shared by every column below.
label_encoder = preprocessing.LabelEncoder() 
for col in colsObj:
    # fit_transform refits the shared encoder on each column in turn.
    df[col] = label_encoder.fit_transform(df[col])
df.head()
for col in colsObj:
    # Decoding reuses that same shared encoder for every column.
    df[col] = label_encoder.inverse_transform(df[col])
df.head()

But here inverse_transform() wasn't returning the original dataset. Please help me!


Solution

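  • For this to work correctly, it is necessary to keep a separate LabelEncoder for each column, for example in a dictionary, because refitting a single encoder on every column overwrites the classes it learned from the previous one: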

    import numpy as np
    import pandas as pd
    from sklearn import preprocessing

    # Load the training data, using the 'Id' column as the index.
    df = pd.read_csv(r'train.csv',index_col='Id')
    

    print(df.shape)
    print (df.head())
        MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
    Id                                                                    
    1           60       RL         65.0     8450   Pave   NaN      Reg   
    2           20       RL         80.0     9600   Pave   NaN      Reg   
    3           60       RL         68.0    11250   Pave   NaN      IR1   
    4           70       RL         60.0     9550   Pave   NaN      IR1   
    5           60       RL         84.0    14260   Pave   NaN      IR1   
    
       LandContour Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature  \
    Id                                  ...                                     
    1          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
    2          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   
    3          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
    4          Lvl    AllPub    Corner  ...        0    NaN   NaN         NaN   
    5          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   
    
       MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
    Id                                                             
    1        0      2    2008        WD         Normal     208500  
    2        0      5    2007        WD         Normal     181500  
    3        0      9    2008        WD         Normal     223500  
    4        0      2    2006        WD        Abnorml     140000  
    5        0     12    2008        WD         Normal     250000  
    
    [5 rows x 80 columns]
    

    # Split the columns into numeric ones and all remaining (non-numeric) ones.
    colsNum = df.select_dtypes(np.number).columns
    colsObj = df.columns.difference(colsNum)
    
    # Missing values are filled before encoding: floored mean for numeric
    # columns, mode (first row returned by mode()) for the others.
    df[colsNum] = df[colsNum].fillna(df[colsNum].mean()//1)
    df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])
    
    from collections import defaultdict
    # defaultdict builds a fresh LabelEncoder the first time a column name is
    # looked up, so every column ends up with its own fitted encoder.
    di = defaultdict(preprocessing.LabelEncoder)
    
    for col in colsObj:
        df[col] = di[col].fit_transform(df[col])
    

    # The non-numeric columns now hold integer codes.
    print (df.head())
        MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
    Id                                                                        
    1           60         3         65.0     8450       1      0         3   
    2           20         3         80.0     9600       1      0         3   
    3           60         3         68.0    11250       1      0         0   
    4           70         3         60.0     9550       1      0         0   
    5           60         3         84.0    14260       1      0         0   
    
        LandContour  Utilities  LotConfig  ...  PoolArea  PoolQC  Fence  \
    Id                                     ...                            
    1             3          0          4  ...         0       2      2   
    2             3          0          2  ...         0       2      2   
    3             3          0          4  ...         0       2      2   
    4             3          0          0  ...         0       2      2   
    5             3          0          2  ...         0       2      2   
    
        MiscFeature  MiscVal  MoSold  YrSold  SaleType  SaleCondition  SalePrice  
    Id                                                                            
    1             2        0       2    2008         8              4     208500  
    2             2        0       5    2007         8              4     181500  
    3             2        0       9    2008         8              4     223500  
    4             2        0       2    2006         8              0     140000  
    5             2        0      12    2008         8              4     250000  
    
    [5 rows x 80 columns]
    

    # The dictionary maps each encoded column name to its LabelEncoder.
    print (di)
    defaultdict(<class 'sklearn.preprocessing._label.LabelEncoder'>, {'Alley': LabelEncoder(), 'BldgType': LabelEncoder(), 'BsmtCond': LabelEncoder(), 'BsmtExposure': LabelEncoder(), 'BsmtFinType1': LabelEncoder(), 'BsmtFinType2': LabelEncoder(), 'BsmtQual': LabelEncoder(), 'CentralAir': LabelEncoder(), 'Condition1': LabelEncoder(), 'Condition2': LabelEncoder(), 'Electrical': LabelEncoder(), 'ExterCond': LabelEncoder(), 'ExterQual': LabelEncoder(), 'Exterior1st': LabelEncoder(), 'Exterior2nd': LabelEncoder(), 'Fence': LabelEncoder(), 'FireplaceQu': LabelEncoder(), 'Foundation': LabelEncoder(), 'Functional': LabelEncoder(), 'GarageCond': LabelEncoder(), 'GarageFinish': LabelEncoder(), 'GarageQual': LabelEncoder(), 'GarageType': LabelEncoder(), 'Heating': LabelEncoder(), 'HeatingQC': LabelEncoder(), 'HouseStyle': LabelEncoder(), 'KitchenQual': LabelEncoder(), 'LandContour': LabelEncoder(), 'LandSlope': LabelEncoder(), 'LotConfig': LabelEncoder(), 'LotShape': LabelEncoder(), 'MSZoning': LabelEncoder(), 'MasVnrType': LabelEncoder(), 'MiscFeature': LabelEncoder(), 'Neighborhood': LabelEncoder(), 'PavedDrive': LabelEncoder(), 'PoolQC': LabelEncoder(), 'RoofMatl': LabelEncoder(), 'RoofStyle': LabelEncoder(), 'SaleCondition': LabelEncoder(), 'SaleType': LabelEncoder(), 'Street': LabelEncoder(), 'Utilities': LabelEncoder()})
    

    # Decode each column with the encoder stored under its own name.
    for col in colsObj:
        df[col] = di[col].inverse_transform(df[col])
    

    # The labels are strings again. Values that were NaN stay filled with the
    # mode used before encoding (e.g. Alley shows 'Grvl'), so the frame matches
    # the filled data rather than the raw CSV.
    print (df.head())
        MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
    Id                                                                    
    1           60       RL         65.0     8450   Pave  Grvl      Reg   
    2           20       RL         80.0     9600   Pave  Grvl      Reg   
    3           60       RL         68.0    11250   Pave  Grvl      IR1   
    4           70       RL         60.0     9550   Pave  Grvl      IR1   
    5           60       RL         84.0    14260   Pave  Grvl      IR1   
    
       LandContour Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeature  \
    Id                                  ...                                      
    1          Lvl    AllPub    Inside  ...        0     Gd  MnPrv        Shed   
    2          Lvl    AllPub       FR2  ...        0     Gd  MnPrv        Shed   
    3          Lvl    AllPub    Inside  ...        0     Gd  MnPrv        Shed   
    4          Lvl    AllPub    Corner  ...        0     Gd  MnPrv        Shed   
    5          Lvl    AllPub       FR2  ...        0     Gd  MnPrv        Shed   
    
       MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
    Id                                                             
    1        0      2    2008        WD         Normal     208500  
    2        0      5    2007        WD         Normal     181500  
    3        0      9    2008        WD         Normal     223500  
    4        0      2    2006        WD        Abnorml     140000  
    5        0     12    2008        WD         Normal     250000