I'm trying to go through a whole folder of text files, collect the cumulative frequency of trigrams across all of those files, and write the results to a CSV file. The corpus import step finishes, but as soon as the script starts working on the trigrams I get the following error:
Traceback (most recent call last):
File "entireCRngrams.py", line 23, in <module>
raw = speeches.raw().encode('ascii', 'ignore').lower()
File "/Library/Python/2.7/site-packages/nltk/corpus/reader/plaintext.py", line 74, in raw
return concat([self.open(f).read() for f in fileids])
File "/Library/Python/2.7/site-packages/nltk/corpus/reader/api.py", line 198, in open
stream = self._root.join(file).open(encoding)
File "/Library/Python/2.7/site-packages/nltk/data.py", line 309, in join
return FileSystemPathPointer(_path)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 85: ordinal not in range(128)
Thank you so much in advance. Here's the script:
import nltk
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import os
import math
from decimal import *
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer, word_tokenize, sent_tokenize
import csv
import string
from nltk.tokenize.punkt import PunktWordTokenizer
# Build cumulative trigram frequencies over every .txt file in the corpus
# folder and append the trigrams seen more than once to ngrams.csv.
# Targets Python 2.7; the single-argument print(...) calls below print the
# same text as the equivalent print statements there.

# Import the text files in the folder into a corpus called speeches.
corpus_root = '/Users/jolijttamanaha/Documents/Senior/Thesis/Python/DailyNews'
# NOTE(review): the reported UnicodeDecodeError is raised while NLTK joins
# corpus_root with a file id, presumably because one file name contains a
# non-ASCII character -- confirm by checking the names printed below.
speeches = PlaintextCorpusReader(corpus_root, r'.*\.txt')
print("Finished importing corpus")
for infile in sorted(speeches.fileids()):
    print(infile)  # The fileid of each file.

# raw() without arguments returns the text of all files in one string, so
# the counts below are cumulative over the whole corpus.
raw = speeches.raw().lower()

# Python 2 str.translate(None, chars) deletes every punctuation character
# before the text is tokenized.
tokens = nltk.word_tokenize(raw.encode('utf-8').translate(None, string.punctuation))
tgs = nltk.trigrams(tokens)
fdist = nltk.FreqDist(tgs)
for k, v in fdist.items():
    print("%s %s" % (k, v))

minscore = 1
# Count tokens, not characters: len(raw) is the length of the corpus string,
# which made both the printed figure and the relative frequencies wrong.
numwords = len(tokens)
print("Words in corpus:")
print(numwords)

# The original built this column from `year + month + set`; year and month
# are never defined in this script and `set` is the builtin type, so the
# first row raised NameError. The counts cover the whole corpus, so a fixed
# label keeps the four-column row layout.
# TODO(review): replace with the real period label if one is available.
time = "all"

# The with-block closes ngrams.csv so every row is flushed to disk.
with open("ngrams.csv", "a") as outfile:
    c = csv.writer(outfile)
    for k, v in fdist.items():
        if v > minscore:
            # Relative frequency of the trigram with respect to corpus size.
            rf = Decimal(v) / Decimal(numwords)
            # k is the (first, second, third) word tuple; join it into one string.
            trigram = " ".join(k)
            results = time, trigram, v, rf
            c.writerow(results)
print("All done.")
Try this:
# Python 2 workaround: switch the interpreter-wide default string encoding
# to UTF-8 (reload is a builtin only in Python 2).
import sys
# NOTE(review): reload(sys) is presumably required because setdefaultencoding
# is not available on the sys module after interpreter start-up -- confirm.
reload(sys)
sys.setdefaultencoding("utf-8")
See also the question "Why should we NOT use sys.setdefaultencoding("utf-8") in a py script?" before relying on this.
It's a pesky issue with Python 2.7 and NLTK.