Search code examples
python scikit-learn python-class

Unable to instantiate a python class - AttributeError: class object has no attribute 'language'


I replicated a TextNormalizer class from this book like this

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Text cleaner that replaces '-', '_' and '+' with spaces, tokenizes, lowercases and drops stopwords.

    ``__init__`` accepts ``language`` but assigns only ``self.stopwords`` and
    ``self.lemmatizer``; no ``self.language`` attribute is set on the instance.
    """

    def __init__(self, language='english'):
        """Load the NLTK stopword list for ``language`` and create a WordNet lemmatizer.

        Args:
            language: Name passed to ``nltk.corpus.stopwords.words``.
        """
        # `language` is consumed here only; it is not stored on the instance.
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        # Created here but not used by any method shown in this class.
        self.lemmatizer = WordNetLemmatizer()

    def remove_concat(self, narrative):
        """Return ``narrative`` with every '-', '_' and '+' replaced by a single space."""
        chars_to_remove = ['-', '_', '+']
        # Build a regex character class from the escaped characters.
        # NOTE(review): `re` is not imported in the visible snippet - presumably imported elsewhere; confirm.
        reg_ex = '[' + re.escape (''. join (chars_to_remove)) + ']'
        return re.sub(reg_ex, ' ', narrative) 
    
    def process_narrative(self, narrative):
        """Return the lowercased tokens of ``narrative`` that are not in ``self.stopwords``."""
        cleaned_narrative = self.remove_concat(narrative)
        tokens = nltk.word_tokenize(cleaned_narrative)
        # The stopword test is done on the lowercased token, and the lowercased form is what is returned.
        return [token.lower() for token in tokens if token.lower() not in self.stopwords]

I wanted to learn the code step by step by testing it like this

tn = TextNormalizer()
# In IPython/Jupyter, evaluating a bare name displays the object, which calls repr(tn).
tn

The following error occurred

AttributeError                            Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj, include, exclude)
    968 
    969             if method is not None:
--> 970                 return method(include=include, exclude=exclude)
    971             return None
    972         else:

~\Anaconda3\lib\site-packages\sklearn\base.py in _repr_mimebundle_(self, **kwargs)
    462     def _repr_mimebundle_(self, **kwargs):
    463         """Mime bundle used by jupyter kernels to display estimator"""
--> 464         output = {"text/plain": repr(self)}
    465         if get_config()["display"] == 'diagram':
    466             output["text/html"] = estimator_html_repr(self)

~\Anaconda3\lib\site-packages\sklearn\base.py in __repr__(self, N_CHAR_MAX)
    258             n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
    259 
--> 260         repr_ = pp.pformat(self)
    261 
    262         # Use bruteforce ellipsis when there are a lot of non-blank characters

~\Anaconda3\lib\pprint.py in pformat(self, object)
    151     def pformat(self, object):
    152         sio = _StringIO()
--> 153         self._format(object, sio, 0, 0, {}, 0)
    154         return sio.getvalue()
    155 

~\Anaconda3\lib\pprint.py in _format(self, object, stream, indent, allowance, context, level)
    168             self._readable = False
    169             return
--> 170         rep = self._repr(object, context, level)
    171         max_width = self._width - indent - allowance
    172         if len(rep) > max_width:

~\Anaconda3\lib\pprint.py in _repr(self, object, context, level)
    402 
    403     def _repr(self, object, context, level):
--> 404         repr, readable, recursive = self.format(object, context.copy(),
    405                                                 self._depth, level)
    406         if not readable:

~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in format(self, object, context, maxlevels, level)
    178 
    179     def format(self, object, context, maxlevels, level):
--> 180         return _safe_repr(object, context, maxlevels, level,
    181                           changed_only=self._changed_only)
    182 

~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _safe_repr(object, context, maxlevels, level, changed_only)
    423         recursive = False
    424         if changed_only:
--> 425             params = _changed_params(object)
    426         else:
    427             params = object.get_params(deep=False)

~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _changed_params(estimator)
     89     estimator with non-default values."""
     90 
---> 91     params = estimator.get_params(deep=False)
     92     init_func = getattr(estimator.__init__, 'deprecated_original',
     93                         estimator.__init__)

~\Anaconda3\lib\site-packages\sklearn\base.py in get_params(self, deep)
    193         out = dict()
    194         for key in self._get_param_names():
--> 195             value = getattr(self, key)
    196             if deep and hasattr(value, 'get_params'):
    197                 deep_items = value.get_params().items()

AttributeError: 'TextNormalizer' object has no attribute 'language'
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in _repr_pprint(obj, p, cycle)
    698     """A pprint that just redirects to the normal repr function."""
    699     # Find newlines and replace them with p.break_()
--> 700     output = repr(obj)
    701     lines = output.splitlines()
    702     with p.group():

~\Anaconda3\lib\site-packages\sklearn\base.py in __repr__(self, N_CHAR_MAX)
    258             n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
    259 
--> 260         repr_ = pp.pformat(self)
    261 
    262         # Use bruteforce ellipsis when there are a lot of non-blank characters

~\Anaconda3\lib\pprint.py in pformat(self, object)
    151     def pformat(self, object):
    152         sio = _StringIO()
--> 153         self._format(object, sio, 0, 0, {}, 0)
    154         return sio.getvalue()
    155 

~\Anaconda3\lib\pprint.py in _format(self, object, stream, indent, allowance, context, level)
    168             self._readable = False
    169             return
--> 170         rep = self._repr(object, context, level)
    171         max_width = self._width - indent - allowance
    172         if len(rep) > max_width:

~\Anaconda3\lib\pprint.py in _repr(self, object, context, level)
    402 
    403     def _repr(self, object, context, level):
--> 404         repr, readable, recursive = self.format(object, context.copy(),
    405                                                 self._depth, level)
    406         if not readable:

~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in format(self, object, context, maxlevels, level)
    178 
    179     def format(self, object, context, maxlevels, level):
--> 180         return _safe_repr(object, context, maxlevels, level,
    181                           changed_only=self._changed_only)
    182 

~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _safe_repr(object, context, maxlevels, level, changed_only)
    423         recursive = False
    424         if changed_only:
--> 425             params = _changed_params(object)
    426         else:
    427             params = object.get_params(deep=False)

~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _changed_params(estimator)
     89     estimator with non-default values."""
     90 
---> 91     params = estimator.get_params(deep=False)
     92     init_func = getattr(estimator.__init__, 'deprecated_original',
     93                         estimator.__init__)

~\Anaconda3\lib\site-packages\sklearn\base.py in get_params(self, deep)
    193         out = dict()
    194         for key in self._get_param_names():
--> 195             value = getattr(self, key)
    196             if deep and hasattr(value, 'get_params'):
    197                 deep_items = value.get_params().items()

AttributeError: 'TextNormalizer' object has no attribute 'language'

Although the TextNormalizer class threw the above error when I tried to instantiate it, it worked when applied to text like this

# NOTE(review): `pd` is not imported in the question's snippets - presumably `import pandas as pd` ran earlier.
df = pd.DataFrame({'description': ['My order_number is A-08', 'It cost me +$80.00']})
tn = TextNormalizer()
# Calls process_narrative on each description; nothing here asks for repr(tn).
df['description'].apply(tn.process_narrative)

Which produced this output

0    [order, number, 08]
1       [cost, $, 80.00]
Name: description, dtype: object

Can someone please explain what is happening? I meant it worked although it seemed wrong. What's the reason for this "phenomenon"?


Solution

  • I had to also download the stopwords, but setting self.language = language in the code and then using either language or self.language to retrieve the correct list resolves the error.

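    Removing the language parameter and all lines using it also runs without issue: scikit-learn's get_params() looks up an attribute named after each __init__ parameter, so it fails when a parameter is declared but never stored on the instance under the same name.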

    import nltk
    import pandas as pd
    nltk.download('stopwords')
    nltk.download('punkt')
    from nltk.stem.wordnet import WordNetLemmatizer
    from sklearn.base import BaseEstimator, TransformerMixin
    class TextNormalizer(BaseEstimator, TransformerMixin):
        """Minimal TextNormalizer whose ``language`` parameter is stored on the instance."""

        def __init__(self, language='english'):
            # Store the argument under the same name as the parameter so that
            # BaseEstimator.get_params() (called when building the repr) can read it back.
            self.language = language
            self.stopwords = set(nltk.corpus.stopwords.words(self.language))
            self.lemmatizer = WordNetLemmatizer()
    
    test = TextNormalizer()
    # Evaluating the bare name displays the instance via repr(test).
    test
    

    Addendum

    The reason

    tn
    

    from the OP raises an error is that self.language isn't set when the TextNormalizer object is instantiated, and that attribute is read when the object builds its representation of itself.

    When the single line:

    tn
    

    is run, this does not instantiate the object tn, which must already exist. It evaluates the name and displays the result by calling its __repr__ (which TextNormalizer inherits from scikit-learn's BaseEstimator, as the sklearn\base.py frames in the traceback show, not from nltk). No error arises from creating the instance itself because the __repr__ method is not evaluated until you run tn on its own, or repr(tn).

    df['description'].apply(tn.process_narrative)
    

    does not throw an error because tn already exists, and its __repr__ method is not being used.

    You can fix this by assigning self.language as discussed above, or doing this after the item is created:

    tn = TextNormalizer()
    # Workaround: set the attribute that get_params() looks up, after construction.
    tn.language = 'english'
    tn