I am trying to create a machine learning model to predict who would survive on the Titanic. Every time I try to fit my model, I get this error:
coordinates = np.where(mask.transpose())[::-1]
AttributeError: 'bool' object has no attribute 'transpose'
The code I am running is the following:
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from itertools import combinations
import pandas as pd
import numpy as np
# Load the Kaggle Titanic training and test sets from the working directory.
training_data = pd.read_csv('train.csv')
testing_data = pd.read_csv('test.csv')

# Separate the features (X) from the target (y): work on a copy of the
# training frame and pull the 'Survived' column out of it as the label.
X_train_full = training_data.copy()
y = X_train_full.pop('Survived')

# NOTE: despite its name, y_test holds the *features* of the test set
# (it is the same object as testing_data, not a copy and not a label vector).
y_test = testing_data
# Names of all string (object dtype) columns in the training features.
cat_columns1 = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object"
]


def _build_interactions(frame, columns, fill_value="missing_value"):
    """Return a DataFrame of pairwise string interaction features.

    For every pair of names in ``columns`` a new column called
    ``"<first>_<second>"`` is created whose values are the two source
    values joined by an underscore. Missing entries are replaced by
    ``fill_value`` first; the default is the placeholder that
    ``SimpleImputer(strategy='constant')`` uses for string data.

    The result shares ``frame``'s index, so it can be joined straight back
    onto ``frame`` without any index resetting.

    The columns are deliberately left as plain strings: assigning the sparse
    output of ``OneHotEncoder.fit_transform`` to a single DataFrame column
    does not produce a usable feature, and the categorical branch of the
    modelling pipeline already one-hot encodes every object column.
    """
    interactions = pd.DataFrame(index=frame.index)
    for first, second in combinations(columns, 2):
        left = frame[first].fillna(fill_value).astype(str)
        right = frame[second].fillna(fill_value).astype(str)
        interactions['_'.join((first, second))] = left + '_' + right
    return interactions


# Create the new features and a new training frame that includes them.
interactions = _build_interactions(X_train_full, cat_columns1)
new_df = X_train_full.join(interactions)

# Do the same for the test file, using the same column pairs so that the
# training and test frames end up with identical feature columns.
interactions2 = _build_interactions(y_test, cat_columns1)
y_test = y_test.join(interactions2)
# Categorical columns of the augmented training frame: every column whose
# dtype is object (this picks up any object-typed features added to new_df).
cat_columns = [name for name, dtype in new_df.dtypes.items()
               if dtype == "object"]

# Numerical columns: 64-bit integer and float columns only.
num_columns = [name for name, dtype in new_df.dtypes.items()
               if dtype in ['int64', 'float64']]

# Preprocessing: numerical columns are imputed with a constant; categorical
# columns are imputed with a constant and then one-hot encoded, ignoring
# categories at transform time that were not seen during fitting.
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns),
    ]
)

# Full pipeline: preprocessing followed by a default XGBoost classifier.
model = XGBClassifier()
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit the whole pipeline on the augmented training features and the labels.
my_pipeline.fit(new_df, y)
The CSV files I am reading are available from Kaggle at this link:
https://www.kaggle.com/c/titanic/data
I cannot figure out what is causing this problem. Any help would be much appreciated.
This probably happens because your data contains `pd.NA` values. `pd.NA` was introduced in pandas 1.0.0, but is still marked as experimental.
`SimpleImputer` will ultimately run `data == np.nan`, which would usually return a NumPy array. Instead, it returns a single boolean scalar when `data` contains `pd.NA` values.
An example:
import pandas as pd
import numpy as np
# Two one-column frames that differ only in how the missing value is spelled.
test_pd_na = pd.DataFrame({"A": [1, 2, 3, pd.NA]})
test_np_nan = pd.DataFrame({"A": [1, 2, 3, np.nan]})

# With np.nan the column is float, so the comparison is carried out
# elementwise and yields a boolean array (NaN never compares equal):
print(test_np_nan.to_numpy() == np.nan)
# [[False]
#  [False]
#  [False]
#  [False]]

# With pd.NA the column is object dtype and `pd.NA == np.nan` evaluates to
# pd.NA, which has no truth value, so the elementwise comparison fails.
# In the setup described here the whole expression then collapses to the
# single scalar False rather than an array.
try:
    print(test_pd_na.to_numpy() == np.nan)
    # False
except TypeError as err:
    # NOTE(review): newer NumPy releases appear to propagate the error
    # instead of returning False - confirm against the installed version.
    print("elementwise comparison failed:", err)
The solution would be to convert all `pd.NA` values to `np.nan` before running `SimpleImputer`. You can use `.replace({pd.NA: np.nan})` on your data frames for this purpose. The downside is obviously that you lose the benefits `pd.NA` brings, such as integer columns with missing data, instead of those columns being converted to float columns.