Can anyone help me resolve this error?
### using transformers
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
# Per-column preprocessing: each entry is (step name, transformer, column selector).
# A plain-string selector hands the transformer a 1-D column (what the text
# vectorizers expect); a list selector hands it a 2-D frame (what OneHotEncoder expects).
column_trans = ColumnTransformer(
[
# TF-IDF features from the free-text 'CompanyName' column
('CompanyName_bow', TfidfVectorizer(), 'CompanyName'),
# one-hot encoding of the categorical 'state' column
('state_category', OneHotEncoder(), ['state']),
# TF-IDF features from the free-text 'Termination_Reason_Desc' column
('Termination_Reason_Desc_bow', TfidfVectorizer(), 'Termination_Reason_Desc'),
# one-hot encoding of the categorical 'TermType' column
('TermType_category', OneHotEncoder(), ['TermType'])
],
# every column not listed above is passed through MinMaxScaler instead of being dropped
remainder=MinMaxScaler()
)
# Fit the transformer on the first 100 rows only and rebind X to the transformed
# features (presumably X is a pandas DataFrame at this point -- confirm).
X = column_trans.fit_transform(X.head(100))
from sklearn.preprocessing import LabelEncoder
# Encode the matching first 100 targets as integer class labels.
y = LabelEncoder().fit_transform(y.head(100))
from sklearn.model_selection import train_test_split
# 80/20 train/test split; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)
X_train.shape #(80, 92)
X_test.shape #(20, 92)
y_train.shape #(80,)
# NOTE: the return value of todense() is not assigned to anything, so X_train
# itself is left unchanged by this line.
X_train.todense()
matrix([[0. , 0. , 0. , ..., 0.26921709, 1. ,
0. ],
[0. , 0. , 0. , ..., 0. , 0. ,
1. ],
[0. , 0. , 0. , ..., 0.46148896, 1. ,
0. ],
...,
[0. , 0. , 0. , ..., 0.46148896, 1. ,
0. ],
[0. , 0. , 0. , ..., 0. , 0. ,
1. ],
[0. , 0. , 0. , ..., 0.46148896, 1. ,
0. ]])
type(X_train)
--> scipy.sparse.csr.csr_matrix
print(y_train)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
type(y_train)
numpy.ndarray
# use autokeras to find a model for the dataset
from numpy import asarray
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from autokeras import StructuredDataClassifier
# sanity-check the shapes of the train/test splits before searching
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# define the search: try at most 15 candidate models
search = StructuredDataClassifier(max_trials=15)
# perform the search
# NOTE: X_train is passed exactly as produced by train_test_split; the type()
# output earlier shows it is a scipy.sparse CSR matrix at this point.
search.fit(x=(X_train), y=y_train, verbose=0)
# evaluate the model on the held-out split; evaluate() is unpacked as (loss, accuracy)
loss, acc = search.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %.3f' % acc)
Error
(80, 92) (20, 92) (80,) (20,)
INFO:tensorflow:Reloading Oracle from existing project .\structured_data_classifier\oracle.json
INFO:tensorflow:Reloading Tuner from .\structured_data_classifier\tuner0.json
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-106-94708e5d279d> in <module>
10 search = StructuredDataClassifier(max_trials=15)
11 # perform the search
---> 12 search.fit(x=(X_train), y=y_train, verbose=0)
13 # evaluate the model
14 loss, acc = search.evaluate(X_test, y_test, verbose=0)
~\anaconda3\lib\site-packages\autokeras\tasks\structured_data.py in fit(self, x, y, epochs, callbacks, validation_split, validation_data, **kwargs)
313 [keras.Model.fit](https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit).
314 """
--> 315 super().fit(
316 x=x,
317 y=y,
~\anaconda3\lib\site-packages\autokeras\tasks\structured_data.py in fit(self, x, y, epochs, callbacks, validation_split, validation_data, **kwargs)
132 self.check_in_fit(x)
133
--> 134 super().fit(
135 x=x,
136 y=y,
~\anaconda3\lib\site-packages\autokeras\auto_model.py in fit(self, x, y, batch_size, epochs, callbacks, validation_split, validation_data, **kwargs)
259 validation_split = 0
260
--> 261 dataset, validation_data = self._convert_to_dataset(
262 x=x, y=y, validation_data=validation_data, batch_size=batch_size
263 )
~\anaconda3\lib\site-packages\autokeras\auto_model.py in _convert_to_dataset(self, x, y, validation_data, batch_size)
373 x = dataset.map(lambda x, y: x)
374 y = dataset.map(lambda x, y: y)
--> 375 x = self._adapt(x, self.inputs, batch_size)
376 y = self._adapt(y, self._heads, batch_size)
377 dataset = tf.data.Dataset.zip((x, y))
~\anaconda3\lib\site-packages\autokeras\auto_model.py in _adapt(self, dataset, hms, batch_size)
287 adapted = []
288 for source, hm in zip(sources, hms):
--> 289 source = hm.get_adapter().adapt(source, batch_size)
290 adapted.append(source)
291 if len(adapted) == 1:
~\anaconda3\lib\site-packages\autokeras\engine\adapter.py in adapt(self, dataset, batch_size)
65 tf.data.Dataset. The converted dataset.
66 """
---> 67 self.check(dataset)
68 dataset = self.convert_to_dataset(dataset, batch_size)
69 return dataset
~\anaconda3\lib\site-packages\autokeras\adapters\input_adapters.py in check(self, x)
63 def check(self, x):
64 if not isinstance(x, (pd.DataFrame, np.ndarray, tf.data.Dataset)):
---> 65 raise TypeError(
66 "Unsupported type {type} for "
67 "{name}.".format(type=type(x), name=self.__class__.__name__)
TypeError: Unsupported type <class 'scipy.sparse.csr.csr_matrix'> for StructuredDataAdapter.
As noted in the GitHub issue you opened in parallel with this thread, sparse matrices are not (currently) supported in AutoKeras, and the advice is to convert them to dense NumPy arrays. Indeed, according to the documentation of the AutoKeras `StructuredDataClassifier`, the training data `x` passed to its `.fit` method is expected to be:
String, numpy.ndarray, pandas.DataFrame or tensorflow.Dataset
and not a SciPy sparse matrix.
Given that your `X_train` here is really small:
X_train.shape
# (80, 92)
you have absolutely no reason whatsoever to use a sparse matrix. And although you seem to try to convert `X_train` to a dense one here, you do not re-assign the result, so it remains sparse; from your own code above:
X_train.todense()
# ...
type(X_train)
# scipy.sparse.csr.csr_matrix
What you need to do is simply to reassign it to a dense array:
from scipy.sparse import csr_matrix
# toarray() returns a new dense numpy.ndarray; assigning it back to the same
# name is what actually replaces the sparse matrix.
X_train = X_train.toarray()
Here is a short demo that this works with dummy data:
import numpy as np
from scipy.sparse import csr_matrix

# Dummy 3x4 all-zero sparse matrix. np.float64 is spelled out because the bare
# np.float alias was deprecated in NumPy 1.20 and removed in NumPy 1.24, so
# dtype=np.float raises AttributeError on current installations.
X_train = csr_matrix((3, 4), dtype=np.float64)
type(X_train)
# scipy.sparse.csr.csr_matrix

# this will not work: todense() returns a new object and the result is
# discarded here, so X_train itself is untouched
X_train.todense()
type(X_train)
# scipy.sparse.csr.csr_matrix # still sparse

# this will work: rebind the name to the dense array returned by toarray()
X_train = X_train.toarray()
type(X_train)
# numpy.ndarray
You should follow a similar procedure for your `X_test` data (your `y_train` and `y_test` already seem to be dense NumPy arrays).