python pandas list-comprehension attributeerror wordnet

AttributeError when building list comprehension for Wordnet.Synsets().Definition()

First off, I'm a Python noob and I only half-understand how some of this stuff works. I've been trying to build word matrices for a tagging project and I hoped I could figure this out on my own, but I'm not seeing a lot of documentation around my particular error. So I apologize up front if this is something super-obvious.

I've tried to get a set of functions to work in a few different variations, but I keep getting "AttributeError: 'list' has no attribute definition."

import pandas as pd
from pandas import DataFrame, Series
import nltk.data
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer

# Gets synsets for a given term.

def get_synset(word):
    """Return the name of the first WordNet synset found for ``word``.

    Falls off the end (implicitly returning ``None``) when
    ``wn.synsets(word)`` yields nothing.
    """
    # The loop variable reuses the name ``word``; the body returns on the
    # first iteration, so only the first synset is ever looked at.
    for word in wn.synsets(word):
        return word.name()

#Gets definitions for a synset.

def get_def(syn):
    """Look up the definition for ``syn``, a value produced by get_synset."""
    # NOTE(review): presumably the line behind the AttributeError described
    # above -- the attribute is looked up on the result of wn.synsets(...),
    # which the error message reports as a 'list', and it is spelled
    # ``defnition`` here.
    return wn.synsets(syn).defnition()

# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.

def sector_tagger(frame):
    """Build a word/synset/definition table from a column of category text.

    ``frame`` is iterated item by item and each item is tokenized; the call
    shown in this post passes ``agri['Category']``.

    Returns a DataFrame with 'Categories', 'Synsets' and 'Definition' columns.
    """
    # NOTE(review): ``sentences`` is never used after this line.
    sentences = frame.tolist()
    # One token list per item; ``tok`` is the module-level tokenizer instance.
    tok_list = [tok.tokenize(w) for w in frame]
    # Flatten the per-item token lists into a single lower-cased word list.
    split_words = [w.lower() for sub in tok_list for w in sub]
    # Drop English stop words.
    clean_words = [w for w in split_words if w not in english_stops]
    # First synset name per word, or None when get_synset finds nothing.
    synset = [get_synset(w) for w in clean_words]
    sector_matrix = DataFrame({'Categories': clean_words,
                               'Synsets': synset})
    # NOTE(review): ``sec_syn`` is never used; the next line reads the column
    # directly instead.
    sec_syn = sector_matrix['Synsets'].tolist()
    sector_matrix['Definition'] = [get_def(w) for w in sector_matrix['Synsets']]
    return sector_matrix

The functions get called on a dataframe that I read in from excel:

# NOTE(review): the sheet is bound to ``test``, but the call shown next reads
# ``agri`` -- presumably derived from this frame in code not shown; confirm.
test = pd.read_excel('data.xlsx')

And the sector_tagger function is called as such:

# The whole 'Category' column is passed in one call; sector_tagger iterates it.
agri_matrix = sector_tagger(agri['Category'])

A previous version called wn.synsets(w).definition() in a list comprehension that populated the DataFrame. Another tried to call the definition after the fact in a Jupyter Notebook. I almost always get the Attribute Error. That said, when I call the datatype on sector_matrix['Synsets'] I get an "object" type, and when I print that column I don't see [] around the items.

I've tried:

Wrapping "w" in str()
Calling the list comprehension in and out of the function (ie - deleting the line and calling it in my notebook)
Passing the 'Synsets' column to a new list and building a list comprehension around that

Curiously enough, I was playing around with this yesterday and was able to make something work in my notebook directly, but (a) it's messy (b) there's no scalability, and (c) it doesn't work on other categories that I apply it to.

# Keep only the 'Category' text of rows flagged 1 in both 'Agri-Food' and 'Total'.
# NOTE(review): ``df`` is not defined in the code shown here.
agrimask = (df['Agri-Food']==1) & (df['Total']==1)
df_agri = df.loc[agrimask,['Category']]
# Tokenize each category, then flatten to one lower-cased word list.
agri_words = [tok.tokenize(a) for a in df_agri['Category']]
agri_cip_words = [a.lower() for sub in agri_words for a in sub]
# Drop English stop words.
agri_clean = [w for w in agri_cip_words if w not in english_stops]
df_agri_clean = DataFrame({'Category': agri_clean})
# Mask out bare ',' tokens (they become NaN and are removed by dropna),
# normalise one hand-picked spelling, and keep each word once.
df_agri_clean = df_agri_clean[df_agri_clean != ','].replace('horticulture/horticultural','horticulture').dropna().drop_duplicates()
# NOTE(review): ``syn`` is not defined in the code shown -- presumably a helper
# returning wn.synsets(word); x[0] then takes the first synset of each word.
df_agri_clean['Synsets'] = [x[0].name() for x in df_agri_clean['Category'].apply(syn)]
# wn.synset (singular) resolves each synset name back to a synset object.
df_agri_clean['Definition'] = [wn.synset(x).definition() for x in df_agri_clean['Synsets']]
df_agri_clean['Lemma'] = [wn.synset(x).lemmas()[0].name() for x in df_agri_clean['Synsets']]
df_agri_clean

Edit1: Here's a link to a sample of the data.

Edit2: Also, the static variables I'm using are here (all based around the standard NLTK library):

# Shared tokenizer instance, used as ``tok.tokenize(...)`` in the code above.
tok = TreebankWordTokenizer()
# Stop-word lists are wrapped in sets; the code above tests ``w not in english_stops``.
english_stops = set(stopwords.words('english'))
french_stops = set(stopwords.words('french'))

Edit3: You can see a working version of this code here: Working Code

Solution

2018-09-18_CIP.ipynb

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer as tok

# English stop words as a set; sector_tagger below filters tokens against it.
english_stops = set(stopwords.words('english'))

# Gets synsets for a given term.
def get_synset(word):
    """Return the name of the first WordNet synset for ``word``.

    Returns ``None`` when WordNet has no synset for the word.
    """
    first_match = next(iter(wn.synsets(word)), None)
    return first_match.name() if first_match is not None else None

#Gets definitions for a synset.
def get_def(syn):
    """Return the WordNet definition text for the synset named ``syn``.

    ``syn`` is a synset name as produced by get_synset, e.g. 'animal.n.01'.
    """
    # wn.synset (singular) resolves one synset by name. Note the spelling of
    # ``definition`` -- it was misspelled in the question's version.
    synset = wn.synset(syn)
    return synset.definition()

# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.
def sector_tagger(frame):
    """Build a word/synset/definition table for one category phrase.

    Despite its name, ``frame`` is a single text phrase (a string): the
    function is meant to be applied cell by cell, e.g.
    ``df['Category'].apply(sector_tagger)``.

    Returns a DataFrame with one row per non-stop-word token and the columns
    'Categories' (lower-cased token), 'Synsets' (first synset name, or None)
    and 'Definition' (definition text, or '' when no synset was found).
    """
    # ``tok`` is an alias for the tokenizer class, so it must be instantiated.
    tokens = tok().tokenize(frame)
    clean_words = [w for w in (t.lower() for t in tokens)
                   if w not in english_stops]
    synsets = [get_synset(w) for w in clean_words]
    # Definitions are computed from the plain Python list rather than read
    # back from the DataFrame column: depending on the pandas version and
    # dtype inference, a None stored in a column can come back as NaN, which
    # would slip past the None check and be handed to get_def.
    definitions = [get_def(s) if s is not None else '' for s in synsets]
    return pd.DataFrame({'Categories': clean_words,
                         'Synsets': synsets,
                         'Definition': definitions})

# Series.apply calls sector_tagger once per cell, so each element of
# agri_matrix is the DataFrame built for that one phrase.
agri_matrix = df['Category'].apply(sector_tagger)

if this answers your question, please check it as the answer

The output of get_def is a list of phrases

Alternate Approach

def sector_tagger(frame):
    """Tokenize one category phrase and look up a synset and definition per word.

    ``frame`` is a single text phrase (a string); the function is meant to be
    applied cell by cell, e.g. ``df['Category'].apply(sector_tagger)``.

    Returns a 3-tuple ``(clean_words, synset, def_matrix)`` of equal-length
    lists: the lower-cased non-stop-word tokens, the first synset name for
    each (or None), and the definition text for each ('' when there is no
    synset).
    """
    # Turn '/' into a space so "a/b" splits into two tokens, and strip
    # parentheses and commas before tokenizing.
    replacements = (('/', ' '), ('(', ''), (')', ''), (',', ''))
    for old, new in replacements:
        frame = frame.replace(old, new)
    # ``tok`` is an alias for the tokenizer class -- note the () to instantiate it.
    tokens = tok().tokenize(frame)
    clean_words = [w for w in (t.lower() for t in tokens)
                   if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    def_matrix = [get_def(s) if s is not None else '' for s in synset]
    return clean_words, synset, def_matrix


# One (clean_words, synset, def_matrix) tuple per row of df['Category'].
poo = df['Category'].apply(sector_tagger)

poo[0] = 
(['agricultural', 'domestic', 'animal', 'services'],
 ['agricultural.a.01', 'domestic.n.01', 'animal.n.01', 'services.n.01'],
 ['relating to or used in or promoting agriculture or farming',
  'a servant who is paid to perform menial tasks around the household',
  'a living organism characterized by voluntary movement',
  'performance of duties or provision of space and equipment helpful to others'])

# Split the per-row (words, synsets, definitions) tuples into three parallel
# lists, one entry per row of the original column.
list_clean_words = [row[0] for row in poo]
list_synset = [row[1] for row in poo]
list_def_matrix = [row[2] for row in poo]

# Each cell of agri_matrix holds the whole list for that row.
agri_matrix = pd.DataFrame()
agri_matrix['Categories'] = list_clean_words
agri_matrix['Synsets'] = list_synset
agri_matrix['Definition'] = list_def_matrix
agri_matrix

                                    Categories      Synsets       Definition
0   [agricultural, domestic, animal, services]  [agricultural.a.01, domestic.n.01, animal.n.01...   [relating to or used in or promoting agricultu...
1   [agricultural, food, products, processing]  [agricultural.a.01, food.n.01, merchandise.n.0...   [relating to or used in or promoting agricultu...
2   [agricultural, business, management]    [agricultural.a.01, business.n.01, management....   [relating to or used in or promoting agricultu...
3   [agricultural, mechanization]   [agricultural.a.01, mechanization.n.01] [relating to or used in or promoting agricultu...
4   [agricultural, production, operations]  [agricultural.a.01, production.n.01, operation...   [relating to or used in or promoting agricultu...

Split each list of lists into a long list (they're ordered)

def create_long_list_from_list_of_lists(list_of_lists):
    """Flatten one level of nesting, preserving order.

    Every item of every inner iterable is appended, in order, to a single
    new list, which is returned.
    """
    return [item for inner in list_of_lists for item in inner]

# Flatten each per-row list of lists into one flat list. The three results
# stay aligned index by index because sector_tagger builds one synset and one
# definition for every clean word.
long_list_clean_words = create_long_list_from_list_of_lists(list_clean_words)
long_list_synset = create_long_list_from_list_of_lists(list_synset)
long_list_def_matrix = create_long_list_from_list_of_lists(list_def_matrix)

Turn it into a DataFrame of Unique Categories

# One row per word; duplicate (word, synset, definition) rows are dropped and
# the index is renumbered from 0.
agri_columns = {
    'Categories': long_list_clean_words,
    'Synsets': long_list_synset,
    'Definitions': long_list_def_matrix,
}
agri_df = (
    pd.DataFrame.from_dict(agri_columns)
    .drop_duplicates()
    .reset_index(drop=True)
)

agri_df.head(4)

       Categories              Synsets                         Definitions
0   ceramic               ceramic.n.01  an artifact made of hard brittle material prod...
1   horticultural   horticultural.a.01  of or relating to the cultivation of plants
2   construction     construction.n.01  the act of constructing something
3   building             building.n.01  a structure that has a roof and walls and stan...

Final Note

from nltk.tokenize import TreebankWordTokenizer as tok

or:

from nltk.tokenize import word_tokenize

to use:

tok().tokenize(string_text_phrase)  # text is a string phrase, not a list of words

or:

word_tokenize(string_text_phrase)

Both methods appear to produce the same output, which is a list of words.

input = "Agricultural and domestic animal services"

output_of_both_methods = ['Agricultural', 'and', 'domestic', 'animal', 'services']