import nltk
import string
from nltk.corpus import stopwords
from collections import Counter
def get_tokens(path='comet_interest.xml'):
    """Read a text file and return its lowercased, punctuation-free tokens.

    Args:
        path: File to read. Defaults to 'comet_interest.xml', the file name
            this function previously hard-coded, so existing calls with no
            arguments behave as before.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for the cleaned text.
    """
    with open(path, 'r') as bookmark:
        text = bookmark.read()
    lowers = text.lower()
    # str.translate(None, chars) exists only for Python 2 byte strings and
    # raises TypeError on unicode / Python 3 strings. Filtering character by
    # character drops the same punctuation and works for every string type.
    punctuation = frozenset(string.punctuation)
    no_punctuation = ''.join(ch for ch in lowers if ch not in punctuation)
    return nltk.word_tokenize(no_punctuation)
# Remove stopwords.
tokens = get_tokens()
# Build the stopword set once: the original called stopwords.words('english')
# for every token and then scanned the resulting list linearly.
english_stopwords = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in english_stopwords]
count = Counter(filtered)
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(count.most_common(10))
#stemming
from nltk.stem.porter import *
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving order.

    Args:
        tokens: Iterable of words to stem.
        stemmer: Object exposing a ``stem(word)`` method; the script below
            passes a ``PorterStemmer`` instance.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each input token.
    """
    return [stemmer.stem(item) for item in tokens]
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
# Recount using the stemmed forms of the filtered tokens.
count = Counter(stemmed)
# The bare `print x` statement is a SyntaxError on Python 3; the parenthesised
# form produces identical output on Python 2 and also runs on Python 3.
print(count.most_common(10))
The results look like this:
[('analysis', 13), ('spatial', 11), ('feb', 8), ('cdata', 8), ('description', 7), ('item', 6), ('many', 6), ('pm', 6), ('link', 6), ('research', 5)]
[(u'analysi', 13), (u'spatial', 11), (u'use', 11), (u'feb', 8), (u'cdata', 8), (u'scienc', 7), (u'descript', 7), (u'item', 6), (u'includ', 6), (u'mani', 6)]
What is the problem with the second result (after stemming)? Why does every word have a "u" prefix?
As @kindall noted, it's because of the unicode string.
But more specifically, it's because NLTK's stemmer module uses from __future__ import unicode_literals,
which makes every string literal in that module a unicode string by default; see https://github.com/nltk/nltk/blob/develop/nltk/stem/porter.py#L87
So let's try an experiment in python 2.x :
$ python
>>> from nltk.stem import PorterStemmer
>>> porter = PorterStemmer()
>>> word = "analysis"
>>> word
'analysis'
>>> porter.stem(word)
u'analysi'
We see that the stemmed word has suddenly become a unicode string.
Then, let's try importing unicode_literals ourselves:
>>> from nltk.stem import PorterStemmer
>>> porter = PorterStemmer()
>>> word = "analysis"
>>> word
'analysis'
>>> porter.stem(word)
u'analysi'
>>> from __future__ import print_function, unicode_literals
>>> word
'analysis'
>>> word2 = "analysis"
>>> word2
u'analysis'
Note that all existing strings remain byte strings, but any new string literal created after importing unicode_literals will be unicode by default.