[SOLVED] Below is the process where I prepare my new data and try to predict on it with my trained model, but the prediction fails.
First I import,
import pandas as pd
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy as np
import numpy.random as nr
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import math
%matplotlib inline
Import data and data processing
## test: prepare the test data
x_test_data = pd.read_csv('AW_test.csv')
# Count missing values per object-dtype column. The result is not assigned,
# so it is only visible when a notebook echoes the expression.
object_columns = x_test_data.dtypes == object
x_test_data.loc[:, object_columns].isnull().sum()
## drop the listed columns (labelled "dropnan" in the original notes)
cols_of_interest = ['Title', 'MiddleName', 'Suffix', 'AddressLine2']
x_test_data = x_test_data.drop(columns=cols_of_interest)
## keep only the first row seen for each CustomerID
x_test_data = x_test_data.drop_duplicates(subset='CustomerID', keep='first')
print(x_test_data.shape)
Then I transform my categorical features into one-hot encoded matrices.
##change categorical variables to numeric variables
def encode_string(cat_features):
    """One-hot encode one column of categorical values.

    A new LabelEncoder and a new OneHotEncoder are fitted on
    ``cat_features`` on every call, so the integer codes and the output
    columns are derived solely from the values in this particular input.

    Returns the dense one-hot array, one row per input value.
    """
    # Step 1: map every distinct category to an integer code.
    label_codes = preprocessing.LabelEncoder().fit_transform(cat_features)
    # Step 2: OneHotEncoder wants a 2-D column, hence the reshape.
    code_column = label_codes.reshape(-1, 1)
    one_hot = preprocessing.OneHotEncoder().fit(code_column)
    return one_hot.transform(code_column).toarray()
categorical_columns = [
    'CountryRegionName', 'Education', 'Occupation', 'Gender', 'MaritalStatus',
]
# NOTE: 'CountryRegionName' seeds the matrix and is also the first entry of
# categorical_columns, so its one-hot columns appear twice. The training
# script builds its matrix the same way, so the layout is kept as is.
encoded_blocks = [encode_string(x_test_data['CountryRegionName'])]
for col in categorical_columns:
    encoded_blocks.append(encode_string(x_test_data[col]))
# Join all one-hot blocks side by side in the order they were produced.
Features = np.concatenate(encoded_blocks, axis=1)
print(Features)
Then, I add the rest of the numeric features on to the matrices
##add numeric variables
# The four numeric columns are appended to the right of the one-hot blocks.
numeric_columns = ['HomeOwnerFlag', 'NumberCarsOwned', 'TotalChildren', 'YearlyIncome']
numeric_block = np.array(x_test_data[numeric_columns])
Features = np.concatenate([Features, numeric_block], axis=1)
Next, I scale the Feature Matrices
##scale numeric variables
# pickle must be imported before this step; the only other import in this
# script sits further down, next to the model loading.
import pickle

# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open('./lin_reg_scaler.pickle', 'rb') as file:
    scaler = pickle.load(file)
# The slice covers the last FIVE columns although only four numeric columns
# were appended, so the last one-hot column is scaled as well. The training
# script fits the scaler on x_train[:, -5:], so the width must stay at five
# for the saved scaler to accept this input.
Features[:, -5:] = scaler.transform(Features[:, -5:])
I load the linear regression model I trained in another file (I can post it if needed).
# Loading the saved linear regression model pickle
import pickle

# A context manager closes the file handle once the model has been read;
# passing a bare open(...) to pickle.load leaves it open.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open('./lin_reg_mod.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
I put my Feature Matrices directly in
#predict
# Name the result, then leave it as the final bare expression so a notebook
# cell still echoes the predictions.
predictions = loaded_model.predict(Features)
predictions
However, this is what I got:
array([-5.71697209e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
-5.71697209e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
-4.64634881e+12, -5.71697209e+12, -5.71697209e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
-5.71697209e+12, -4.64634881e+12, -5.71697209e+12, -5.71697209e+12,
-4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-5.71697209e+12, -5.71697209e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -5.71697209e+12,
-5.71697209e+12, -5.71697209e+12, -5.71697209e+12, -4.64634881e+12,............
In my other file, I successfully trained my model and tested it with my test data.
This is what I got when I passed x_test to my model in that file (the result I want to get):
[83.75482221 66.31820493 47.22211384 ... 69.65032224 88.45908874
58.45193545]
I have no idea what is going on. Can someone please help?
[UPDATE] Below is my code for training the model:
## customer data
custs = pd.read_csv('combined_custs.csv')
custs.dtypes
## avemonthspend data, reduced to the first row per CustomerID
ams = pd.read_csv('AW_AveMonthSpend.csv').drop_duplicates(
    subset='CustomerID', keep='first')
## merge (no key given, so pandas joins on the columns both frames share)
combined_custs = pd.merge(custs, ams)
combined_custs.to_csv('./ams_combined_custs.csv')
combined_custs.head(20)
##change categorical variables to numeric variables
def encode_string(cat_features):
    """One-hot encode one column of categorical values.

    A new LabelEncoder and a new OneHotEncoder are fitted on
    ``cat_features`` on every call, so the integer codes and the output
    columns are derived solely from the values in this particular input.

    Returns the dense one-hot array, one row per input value.
    """
    # Step 1: map every distinct category to an integer code.
    label_codes = preprocessing.LabelEncoder().fit_transform(cat_features)
    # Step 2: OneHotEncoder wants a 2-D column, hence the reshape.
    code_column = label_codes.reshape(-1, 1)
    one_hot = preprocessing.OneHotEncoder().fit(code_column)
    return one_hot.transform(code_column).toarray()
categorical_columns = [
    'CountryRegionName', 'Education', 'Occupation', 'Gender', 'MaritalStatus',
]
# NOTE: 'CountryRegionName' seeds the matrix and is also the first entry of
# categorical_columns, so its one-hot columns appear twice in Features.
encoded_blocks = [encode_string(combined_custs['CountryRegionName'])]
for col in categorical_columns:
    encoded_blocks.append(encode_string(combined_custs[col]))
# Join all one-hot blocks side by side in the order they were produced.
Features = np.concatenate(encoded_blocks, axis=1)
print(Features.shape)
print(Features[:2, :])
##add numeric variables
# The four numeric columns are appended to the right of the one-hot blocks.
numeric_columns = ['HomeOwnerFlag', 'NumberCarsOwned', 'TotalChildren', 'YearlyIncome']
numeric_block = np.array(combined_custs[numeric_columns])
Features = np.concatenate([Features, numeric_block], axis=1)
print(Features.shape)
print(Features)
##train_test_split
nr.seed(9988)
labels = np.array(combined_custs['AveMonthSpend'])
# Split row numbers rather than the data itself, so the same row selection
# can be applied to both Features and labels. 300 rows go to the test set.
row_numbers = range(Features.shape[0])
indx = ms.train_test_split(row_numbers, test_size=300)
train_rows, test_rows = indx
x_train, x_test = Features[train_rows, :], Features[test_rows, :]
y_train = np.ravel(labels[train_rows])
y_test = np.ravel(labels[test_rows])
print(x_test.shape)
##scale numeric variables
# NOTE: the slice takes the last five columns, but only four numeric columns
# were appended to Features, so the last one-hot column is standardised too.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train[:, -5:])
# The scaler is fitted on the training rows only and then applied to both splits.
for split in (x_train, x_test):
    split[:, -5:] = scaler.transform(split[:, -5:])
x_train[:2,]
import pickle

# Persist the fitted scaler so the prediction script can reuse it.
# The context manager closes the file even if pickle.dump raises.
with open('./lin_reg_scaler.pickle', 'wb') as file:
    pickle.dump(scaler, file)
##define and fit the linear regression model
# fit() returns the estimator itself, so construction and fitting are chained.
# fit_intercept=False: no separate intercept term is estimated.
lin_mod = linear_model.LinearRegression(fit_intercept=False).fit(x_train, y_train)
print(lin_mod.intercept_)
print(lin_mod.coef_)
import pickle

# Persist the fitted model so the prediction script can load it.
# The context manager closes the file even if pickle.dump raises.
with open('./lin_reg_mod.pickle', 'wb') as file:
    pickle.dump(lin_mod, file)
# Predictions on the held-out rows (echoed when run as a notebook cell).
lin_mod.predict(x_test)
And the prediction for my training model is:
array([ 78.20673535, 91.11860042, 75.27284767, 63.69507673,
102.10758616, 74.64252358, 92.84218321, 77.9675721 ,
102.18989779, 96.98098962, 87.61415378, 39.37006326,
85.81839618, 78.41392293, 45.49439829, 48.0944897 ,
36.06024114, 70.03880373, 128.90267485, 54.63235443,
52.20289729, 82.61123334, 41.58779815, 57.6456416 ,
46.64014991, 78.38639454, 77.61072157, 94.5899366 ,.....
You are using this method in both training and testing:
def encode_string(cat_features):
    """One-hot encode one column of categorical values.

    A new LabelEncoder and a new OneHotEncoder are fitted on
    ``cat_features`` on every call, so the integer codes and the output
    columns are derived solely from the values in this particular input.

    Returns the dense one-hot array, one row per input value.
    """
    # Step 1: map every distinct category to an integer code.
    label_codes = preprocessing.LabelEncoder().fit_transform(cat_features)
    # Step 2: OneHotEncoder wants a 2-D column, hence the reshape.
    code_column = label_codes.reshape(-1, 1)
    one_hot = preprocessing.OneHotEncoder().fit(code_column)
    return one_hot.transform(code_column).toarray()
by calling:
# Collect one one-hot block per column, then join them side by side.
encoded_blocks = [encode_string(combined_custs['CountryRegionName'])]
for col in categorical_columns:
    encoded_blocks.append(encode_string(combined_custs[col]))
Features = np.concatenate(encoded_blocks, axis=1)
But as I said in my comment above, you need to apply the same preprocessing to the test data as you did to the training data.
Here what happens is that, during testing, the encoders are fitted again on x_test_data, so the encoding depends on which values happen to be present there (LabelEncoder numbers the sorted distinct values it sees). So a string value which got the number 0 during training may now get the number 1, and the number and order of the columns in your final Features change.
To solve this, you need to save the LabelEncoder and OneHotEncoder for each column separately.
So during training, do this:
import pickle


def encode_string(cat_features, name=None):
    """One-hot encode a categorical column and save the fitted encoders.

    Training-time variant: fits a LabelEncoder and a OneHotEncoder on
    ``cat_features`` and pickles both, so that the prediction script can
    reuse exactly the same category-to-column mapping instead of
    re-fitting on the new data.

    Parameters
    ----------
    cat_features
        The column of categorical values, e.g. ``df['Gender']``.
    name : str, optional
        Prefix of the pickle file names. Defaults to ``cat_features.name``,
        which is the column label when a pandas Series is passed.

    Returns
    -------
    The dense one-hot array, one row per input value.

    Raises
    ------
    ValueError
        If ``name`` is not given and ``cat_features`` carries no name.
    """
    # The file name must come from the column LABEL; concatenating a string
    # with the column's values does not produce a usable path.
    if name is None:
        name = getattr(cat_features, 'name', None)
    if name is None:
        raise ValueError('encode_string needs a column name to build the pickle file names')

    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)

    # Save the LabelEncoder for this column (the encoder itself, not the model).
    with open('./' + str(name) + '_encoder.pickle', 'wb') as encoder_file:
        pickle.dump(enc, encoder_file)

    ohe = preprocessing.OneHotEncoder()
    ohe.fit(enc_cat_features.reshape(-1, 1))

    # Same for OHE
    with open('./' + str(name) + '_ohe.pickle', 'wb') as ohe_file:
        pickle.dump(ohe, ohe_file)

    return ohe.transform(enc_cat_features.reshape(-1, 1)).toarray()
And then, during testing:
import pickle


def encode_string(cat_features, name=None):
    """One-hot encode a categorical column with previously saved encoders.

    Test-time variant: nothing is fitted here. The LabelEncoder and
    OneHotEncoder are read from ``./<name>_encoder.pickle`` and
    ``./<name>_ohe.pickle`` and only their ``transform`` is applied, so the
    output columns follow the mapping stored in those files.

    Parameters
    ----------
    cat_features
        The column of categorical values, e.g. ``df['Gender']``.
    name : str, optional
        Prefix of the pickle file names. Defaults to ``cat_features.name``,
        which is the column label when a pandas Series is passed.

    Returns
    -------
    The dense one-hot array, one row per input value.

    Raises
    ------
    ValueError
        If ``name`` is not given and ``cat_features`` carries no name.
        Per the scikit-learn documentation, ``LabelEncoder.transform`` also
        raises ValueError for a value it did not see while being fitted.
    FileNotFoundError
        If the pickle files for this column do not exist.
    """
    # The file name must come from the column LABEL, not from its values.
    if name is None:
        name = getattr(cat_features, 'name', None)
    if name is None:
        raise ValueError('encode_string needs a column name to find the pickle files')

    # Load the previously saved encoder.
    # NOTE: unpickling can execute arbitrary code -- only load files you created.
    with open('./' + str(name) + '_encoder.pickle', 'rb') as file:
        enc = pickle.load(file)

    # No fitting, only transform
    enc_cat_features = enc.transform(cat_features)

    # Same for OHE; it gets its own name so the return statement uses it.
    with open('./' + str(name) + '_ohe.pickle', 'rb') as file:
        ohe = pickle.load(file)

    return ohe.transform(enc_cat_features.reshape(-1, 1)).toarray()