I have a directory containing corpus text files. I want to create a table of word counts per document: each row corresponds to a document (identified by its document number) and each column to a unique word, with each cell holding the number of times that word occurs in that document. All of this should be done in Python. Please help. Thank you.
The table should look like this:
word1 word2 word3 ...
doc1 14 5 45
doc2 6 1 0
.
.
.
import nltk
import collections
import os.path
def cleanDoc(doc):
    """Tokenize *doc* and return its stemmed, lower-cased content words.

    Tokens that are English stop words (compared case-insensitively) or
    that are two characters or shorter are dropped; every remaining token
    is lower-cased and passed through the Porter stemmer.  The result keeps
    the tokens' original order and may contain duplicates.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.PorterStemmer()
    stems = []
    for raw in nltk.WordPunctTokenizer().tokenize(doc):
        lowered = raw.lower()
        # Skip stop words and very short tokens.
        if len(raw) <= 2 or lowered in stop_words:
            continue
        stems.append(porter.stem(lowered))
    return stems
# Directory holding the corpus documents named file<N>.txt.
path = "c://Users/Desktop/corpus files"
# Directory that receives one words<N>.txt dump per document.
words_dir = "c://Users/Desktop/words"

# Collect the documents that are actually present instead of assuming the
# directory contains exactly file0.txt .. file<N-1>.txt; any other entry or a
# gap in the numbering would otherwise make open() fail.
documents = []
for name in os.listdir(path):
    stem, ext = os.path.splitext(name)
    number = stem[len("file"):]
    if ext == ".txt" and stem.startswith("file") and number.isdigit():
        documents.append((int(number), name))
documents.sort()  # process in ascending document-number order

for i, name in documents:
    # Context managers close the files even if processing raises.
    with open(os.path.join(path, name), 'r') as f:
        data = f.read()
    words = cleanDoc(data)
    fd = collections.Counter(words)

    # One right-aligned, 15-character column for the label plus one per word.
    # NOTE: this prints every kept word of the document, not its counts.
    row_format = "{:>15}" * (len(words) + 1)
    print(row_format.format("document %d" % i, *words))

    # Dump the Counter followed by the raw word list for this document.
    with open(os.path.join(words_dir, "words%d.txt" % i), 'w') as fw:
        fw.write(str(fd))
        fw.write(str(words))
I think this is fairly close to, if not exactly, what you want. In case it isn't, I tried to make things easy to change.
To produce the desired table, processing is done in two phases. In the first, the unique words in each document file of the form file<document-number>.txt
are found and saved, together with their counts, in a corresponding words<document-number>.txt
file, and they are also added to a set comprising all the unique words seen among all document files. This set is needed to produce table columns that consist of all the unique words in all the files, which is why two phases of processing are required.
In the second phase, the word files are read back in and turned back into dictionaries, which are used to fill in the corresponding columns of the table being printed.
import ast
import collections
import nltk
import re
import os
# Name substituted into the Users/<name>/Desktop location below.
user_name = "UserName"
# Directory that is scanned for corpus files and that also receives the
# generated words<N>.txt files.
path = "c://Users/%s/Desktop/corpus files" % (user_name,)
def cleanDoc(doc):
    """Return the stemmed words kept from the text *doc*, in order.

    The text is split with NLTK's WordPunctTokenizer.  Tokens of one or two
    characters and tokens whose lower-cased form is an English stop word are
    discarded; the rest are lower-cased and Porter-stemmed.  Duplicates are
    preserved so the caller can count occurrences.
    """
    stopset = set(nltk.corpus.stopwords.words('english'))
    stem = nltk.PorterStemmer().stem
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    # Length is tested on the original token; stop-word membership on the
    # lower-cased one.
    lowered = (token.lower() for token in tokens if len(token) > 2)
    return [stem(word) for word in lowered if word not in stopset]
# phase 1 -- find unique words, create word files, update overall unique word set

# Matches exactly "file<digits>.txt".  The dot is escaped and the pattern is
# anchored at the end because re.match() only anchors at the start; without
# this, names such as "file1.txt.bak" or "file1xtxt" would be treated as
# corpus files.
corpus_file_pattern = re.compile(r"file(\d+)\.txt$")
unique_words = set()      # every distinct stemmed word seen in any document
longest_filename = 0      # length of the longest corpus filename (for layout)
document_nums = []        # document numbers found, in directory-listing order
for filename in os.listdir(path):
    corpus_file_match = corpus_file_pattern.match(filename)
    if not corpus_file_match:  # not a corpus text file
        continue
    longest_filename = max(longest_filename, len(filename))
    document_num = int(corpus_file_match.group(1))
    document_nums.append(document_num)

    with open(os.path.join(path, filename)) as corpus_file:
        data = corpus_file.read()
    words = cleanDoc(data)
    unique_words.update(words)
    fd = collections.Counter(words)

    # Save this document's word counts next to the corpus files; the name
    # words<N>.txt does not match corpus_file_pattern, so it is not mistaken
    # for a corpus document.
    words_filename = "words%d.txt" % document_num
    with open(os.path.join(path, words_filename), mode='wt') as fw:
        fw.write(repr(dict(fd)) + '\n')  # write representation as dict
# phase 2 -- create table using unique_words and data in word files
unique_words_list = sorted(unique_words)
# Template Counter with a zero entry for every word in the corpus.
unique_words_empty_counter = collections.Counter(
    {word: 0 for word in unique_words})
document_nums = sorted(document_nums)

padding = 2  # spaces between columns
min_col_width = 5
col_headings = ["Document"] + unique_words_list
col_widths = [max(min_col_width, len(word)) + padding
              for word in col_headings]
col_widths[0] = longest_filename + padding  # first col is special case

# print table headings
# Each row is assembled as a list of cells and emitted with a single
# print(" ".join(...)).  This gives the same text as a sequence of Python 2
# "print cell," statements (cells separated by one space, then a newline)
# while also being valid Python 3.
heading_cells = [
    "{:{align}{width}}".format(word, align='>' if i else '<', width=width)
    for i, (word, width) in enumerate(zip(col_headings, col_widths))]
print(" ".join(heading_cells))

for document_num in document_nums:
    # read word in document dictionary back in
    filename = "words%d.txt" % document_num
    file_words = unique_words_empty_counter.copy()
    with open(os.path.join(path, filename)) as words_file:
        data = words_file.read()
    # convert data read into dict and update with file word counts;
    # ast.literal_eval only evaluates Python literals, never arbitrary code
    file_words.update(ast.literal_eval(data))

    # print row of data: left-aligned file name, then right-aligned counts
    row_cells = ["{:<{width}}".format(filename, width=col_widths[0])]
    row_cells.extend(
        "{:>{width}n}".format(file_words[word], width=width)
        for word, width in zip(unique_words_list, col_widths[1:]))
    print(" ".join(row_cells))