I adapted the following code from Susan Li's post, but I get an error when the code tries to tokenize text using NLTK's resources
(or there could be something wrong with the "keyed vectors" loaded from the web). The error occurs in the 5th code block (see below; the vectors may take a while to load from the web):
## 1. load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize
# Download the stop-word corpus BEFORE reading it: stopwords.words() needs the
# corpus on disk and fails with a LookupError when it has not been fetched yet.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
%matplotlib inline
# Load the Stack Overflow posts and keep only the rows that carry a tag.
df = pd.read_csv('https://www.dropbox.com/s/b2w7iqi7c92uztt/stack-overflow-data.csv?dl=1')
df = df[df['tags'].notnull()]

# Tag names passed later as display names to the classification report.
my_tags = [
    'java', 'html', 'asp.net', 'c#', 'ruby-on-rails',
    'jquery', 'mysql', 'php', 'ios', 'javascript',
    'python', 'c', 'css', 'android', 'iphone',
    'sql', 'objective-c', 'c++', 'angularjs', '.net',
]
## 2. cleaning
# Raw strings keep the backslashes for the regex engine; in a normal string
# '\[', '\]' and '\|' are invalid escape sequences and trigger a warning.
# Punctuation that separates words: replaced by a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_': deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Return *text* normalised for modelling.

    Steps, in order: parse the input with BeautifulSoup and keep only its
    text, lowercase it, turn the REPLACE_BY_SPACE_RE symbols into spaces,
    drop every character matched by BAD_SYMBOLS_RE, and finally remove the
    words listed in STOPWORDS. Remaining words are joined by single spaces.
    """
    decoded = BeautifulSoup(text, "lxml").text
    spaced = REPLACE_BY_SPACE_RE.sub(' ', decoded.lower())
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)
# Clean every post in place before the data is split.
df['post'] = df['post'].apply(clean_text)
## 3. train test split
# Features are the cleaned posts, targets are their tags; 25% is held out.
X = df['post']
y = df['tags']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
## 4. load keyed vectors from the web: will take a while to load
import gensim
# Location of the gzipped binary word2vec file; it is passed straight to the
# loader below, so the whole archive is fetched when this cell runs.
word2vec_path = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# binary=True because the file is in word2vec's binary (.bin) format.
wv = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# NOTE(review): init_sims(replace=True) is understood to normalise the vectors
# in place and to be deprecated in gensim >= 4.0 - confirm against the
# installed gensim version.
wv.init_sims(replace=True)
## 5. this is where it goes wrong
def w2v_tokenize_text(text):
    """Return the word tokens of *text*, skipping one-character tokens.

    The text is first split into sentences and each sentence into words with
    NLTK's English tokenizers; tokens shorter than two characters are dropped.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]
# Hold out 30% of the cleaned posts for evaluation.
train, test = train_test_split(df, test_size=0.3, random_state=42)
# Tokenize the 'post' column directly: Series.apply passes each post to the
# tokenizer, so no row-wise DataFrame.apply with a lambda is needed.
test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values
# NOTE(review): word_averaging_list is not defined in the code shown here -
# confirm it is defined before this cell runs.
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)
## 6. perform logistic regression test
from sklearn.linear_model import LogisticRegression
# classification_report is called below, so import it here.
from sklearn.metrics import classification_report

# Fit on the averaged word vectors of the training posts, predict the test posts.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(test.tags, y_pred))
# Pass labels explicitly: target_names are matched to labels by position, and
# without labels= the report uses the sorted label order, which differs from
# the order of my_tags and would attach the wrong name to each row.
print(classification_report(test.tags, y_pred, labels=my_tags, target_names=my_tags))
Update on part 5 (per @luigigi's comments):
## 5. download nltk and use apply() function without using lambda
import nltk
# Fetch only the 'punkt' resource that the tokenizers below rely on; a bare
# nltk.download() starts the interactive downloader instead of fetching it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead -
# confirm for the installed version.
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize
def w2v_tokenize_text(text):
    """Return the word tokens of *text*, skipping one-character tokens.

    The text is split into sentences, then each sentence into words, using
    NLTK's English tokenizers. Tokens shorter than two characters are dropped.
    """
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) >= 2:
                tokens.append(word)
    return tokens


# The function and the split were pasted twice verbatim; one copy of each is
# enough, and the fixed random_state yields the same split either way.
train, test = train_test_split(df, test_size=0.3, random_state=42)
# Tokenize the 'post' column of each split; .values yields arrays of token lists.
test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values
# NOTE(review): word_averaging_list is not defined in the code shown here -
# confirm it is defined before this cell runs.
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)
## now run the test
from sklearn.linear_model import LogisticRegression
# classification_report is called below, so import it here.
from sklearn.metrics import classification_report

# Fit on the averaged word vectors of the training posts, predict the test posts.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(test.tags, y_pred))
# Pass labels explicitly: target_names are matched to labels by position, and
# without labels= the report uses the sorted label order, which differs from
# the order of my_tags and would attach the wrong name to each row.
print(classification_report(test.tags, y_pred, labels=my_tags, target_names=my_tags))
This should work.
The nltk tokenizer expects the punkt resource, so you have to download it first:
nltk.download('punkt')
Also, you don't need a lambda expression to apply your tokenizer function. You can simply use:
test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values