Tags: machine-learning, scikit-learn, edx

Why couldn't I predict directly using my Features matrix?


[SOLVED] Below is the process where I preprocess my new data and try to predict with my trained model, but the predictions come out wrong.

First I import,

import pandas as pd
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy as np
import numpy.random as nr
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import math

%matplotlib inline

Then I import the data and process it:

##test
##prepare test_data
x_test_data = pd.read_csv('AW_test.csv')
##count missing values in the string columns
x_test_data.loc[:,x_test_data.dtypes==object].isnull().sum()

##dropnan
cols_of_interest = ['Title','MiddleName','Suffix','AddressLine2']
x_test_data.drop(cols_of_interest,axis=1,inplace=True)

##dropduplicate
x_test_data.drop_duplicates(subset='CustomerID', keep='first', inplace=True)
print(x_test_data.shape)

Then I transform my categorical features into one-hot encoded matrices:

##change categorical variables to numeric variables
def encode_string(cat_features):
    # Integer-encode the string categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    # One-hot encode the integer codes
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['CountryRegionName','Education','Occupation','Gender','MaritalStatus']
Features = encode_string(x_test_data['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(x_test_data[col])
    Features = np.concatenate([Features, temp],axis=1)
print(Features)

Then I append the remaining numeric features to the matrix:

##add numeric variables
Features = np.concatenate([Features,
    np.array(x_test_data[['HomeOwnerFlag','NumberCarsOwned',
                          'TotalChildren','YearlyIncome']])], axis=1)

Next, I scale the numeric columns of the feature matrix with the scaler saved during training:

##scale numeric variables
import pickle
with open('./lin_reg_scaler.pickle', 'rb') as file:
    scaler = pickle.load(file)
Features[:,-5:] = scaler.transform(Features[:,-5:])

I load the linear regression model I trained in another file (I can post that code if needed):

# Loading the saved linear regression model pickle
import pickle
loaded_model = pickle.load(open('./lin_reg_mod.pickle', 'rb'))

Then I feed the feature matrix directly into the model:

#predict
loaded_model.predict(Features)

However, this is what I got:

array([-5.71697209e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
   -4.64634881e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
   -5.71697209e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
   -4.64634881e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
   -4.64634881e+12, -5.71697209e+12, -5.71697209e+12, -5.71697209e+12,
   -4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
   -4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
   -5.71697209e+12, -4.64634881e+12, -5.71697209e+12, -5.71697209e+12,
   -4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
   -4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
   -5.71697209e+12, -5.71697209e+12, -4.64634881e+12, -4.64634881e+12,
   -4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -5.71697209e+12,
   -4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
   -4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
   -4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
   -4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -5.71697209e+12,
   -5.71697209e+12, -5.71697209e+12, -5.71697209e+12, -4.64634881e+12,............

In my other file, I successfully trained my model and tested it with my test data.

This is what I got when feeding x_test into the model in that file (the kind of result I want):

[83.75482221 66.31820493 47.22211384 ... 69.65032224 88.45908874
  58.45193545]

I have no idea what is going on. Can someone help, please?

[UPDATE] Below is my code for training the model:

custs = pd.read_csv('combined_custs.csv')
custs.dtypes

##avemonthspend data
ams = pd.read_csv('AW_AveMonthSpend.csv')
ams.drop_duplicates(subset='CustomerID', keep='first', inplace=True)
##merge
combined_custs=custs.merge(ams)
combined_custs.to_csv('./ams_combined_custs.csv')
combined_custs.head(20)
##change categorical variables to numeric variables
def encode_string(cat_features):
    # Integer-encode the string categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    # One-hot encode the integer codes
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['CountryRegionName','Education','Occupation','Gender','MaritalStatus']
Features = encode_string(combined_custs['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(combined_custs[col])
    Features = np.concatenate([Features, temp],axis=1)
print(Features.shape)
print(Features[:2,:])

##add numeric variables
Features = np.concatenate([Features,
    np.array(combined_custs[['HomeOwnerFlag','NumberCarsOwned',
                             'TotalChildren','YearlyIncome']])], axis=1)

print(Features.shape)
print(Features)

##train_test_split
nr.seed(9988)
labels = np.array(combined_custs['AveMonthSpend'])
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])
print(x_test.shape)

##scale numeric variables
scaler = preprocessing.StandardScaler().fit(x_train[:,-5:])

x_train[:,-5:] = scaler.transform(x_train[:,-5:])
x_test[:,-5:] = scaler.transform(x_test[:,-5:])
x_train[:2,]

import pickle
file = open('./lin_reg_scaler.pickle', 'wb')
pickle.dump(scaler, file)
file.close()

##define and fit the linear regression model
lin_mod = linear_model.LinearRegression(fit_intercept=False)
lin_mod.fit(x_train,y_train)
print(lin_mod.intercept_)
print(lin_mod.coef_)

import pickle
file = open('./lin_reg_mod.pickle', 'wb')
pickle.dump(lin_mod, file)
file.close()

lin_mod.predict(x_test)

And the predictions from my trained model are:

array([ 78.20673535,  91.11860042,  75.27284767,  63.69507673,
   102.10758616,  74.64252358,  92.84218321,  77.9675721 ,
   102.18989779,  96.98098962,  87.61415378,  39.37006326,
    85.81839618,  78.41392293,  45.49439829,  48.0944897 ,
    36.06024114,  70.03880373, 128.90267485,  54.63235443,
    52.20289729,  82.61123334,  41.58779815,  57.6456416 ,
    46.64014991,  78.38639454,  77.61072157,  94.5899366 ,.....

Solution

  • You are using this method in both training and testing:

    def encode_string(cat_features):
        enc = preprocessing.LabelEncoder()
        enc.fit(cat_features)
        enc_cat_features = enc.transform(cat_features)
        ohe = preprocessing.OneHotEncoder()
        encoded = ohe.fit(enc_cat_features.reshape(-1,1))
        return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()
    

    by calling:

    Features = encode_string(combined_custs['CountryRegionName'])
    for col in categorical_columns:
        temp = encode_string(combined_custs[col])
        Features = np.concatenate([Features, temp],axis=1)
    

    But as I said in my comment above, you need to apply the same preprocessing to the test data that you applied during training.

    What happens here is that, during testing, the encoding changes depending on which category values actually appear in x_test_data. LabelEncoder assigns integer codes from the sorted unique values it is fitted on, so a string value that got code 0 during training may now get code 1, and the order and meaning of the columns in your final Features matrix changes.
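
    To see this concretely, here is a minimal illustrative sketch (the country names are made up for the demo, not taken from your data):

    from sklearn import preprocessing

    train_vals = ['Canada', 'France', 'Germany']
    test_vals = ['France', 'Germany']   # 'Canada' never appears in the test file

    enc_train = preprocessing.LabelEncoder().fit(train_vals)
    enc_test = preprocessing.LabelEncoder().fit(test_vals)

    print(enc_train.transform(['France']))  # [1]
    print(enc_test.transform(['France']))   # [0] -- same value, different code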

    To solve this, you need to save the LabelEncoder and OneHotEncoder for each column separately.

    So during training, do this:

    import pickle
    def encode_string(cat_features):
        enc = preprocessing.LabelEncoder()
        enc.fit(cat_features)
        enc_cat_features = enc.transform(cat_features)

        # Save the fitted LabelEncoder for this column
        # (cat_features is a pandas Series, so .name is the column name)
        encoder_file = open('./'+cat_features.name+'_encoder.pickle', 'wb')
        pickle.dump(enc, encoder_file)
        encoder_file.close()

        ohe = preprocessing.OneHotEncoder()
        encoded = ohe.fit(enc_cat_features.reshape(-1,1))

        # Same for the fitted OneHotEncoder
        ohe_file = open('./'+cat_features.name+'_ohe.pickle', 'wb')
        pickle.dump(ohe, ohe_file)
        ohe_file.close()

        return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()
    

    And then, during testing:

    def encode_string(cat_features):
        # Load the LabelEncoder fitted during training
        with open('./'+cat_features.name+'_encoder.pickle', 'rb') as file:
            enc = pickle.load(file)

        # No fitting, only transform
        enc_cat_features = enc.transform(cat_features)

        # Same for the OneHotEncoder
        with open('./'+cat_features.name+'_ohe.pickle', 'rb') as file:
            ohe = pickle.load(file)

        return ohe.transform(enc_cat_features.reshape(-1,1)).toarray()
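
    This way the test set is encoded with exactly the mappings learned during training. As an aside, a cleaner pattern is to bundle the preprocessing and the model into a single scikit-learn Pipeline and pickle that one object; here is a minimal sketch of that alternative, assuming a scikit-learn version with ColumnTransformer (0.20+). The column names come from your data; everything else is illustrative:

    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.linear_model import LinearRegression

    categorical_columns = ['CountryRegionName','Education','Occupation',
                           'Gender','MaritalStatus']
    numeric_columns = ['HomeOwnerFlag','NumberCarsOwned',
                       'TotalChildren','YearlyIncome']

    pipe = Pipeline([
        ('prep', ColumnTransformer([
            # handle_unknown='ignore' keeps prediction from failing on
            # categories that never appeared during training
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', StandardScaler(), numeric_columns),
        ])),
        ('model', LinearRegression()),
    ])

    # Fit on the raw training dataframe and pickle the whole pipeline;
    # at prediction time, pipe.predict(x_test_data) then applies the
    # identical encoding and scaling automatically.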