My code was running out of memory because of the issue I asked about on this page, so I wrote a second version that uses an iterable alldocs instead of an all-in-memory alldocs. I changed my code based on the explanation given there, but I am not familiar with the stream concept and I could not solve the error I got.
This code reads all files in all folders under a given path. Each file consists of two-line records: a document name on one line, followed by its text on the next. For instance:
clueweb09-en0010-07-00000
dove gif clipart pigeon clip art picture image hiox free birds india web icons clipart add stumble upon
clueweb09-en0010-07-00001
google bookmarks yahoo bookmarks php script java script jsp script licensed scripts html tutorials css tutorials
First code:
# coding: utf-8
import string
import nltk
import nltk.tokenize
from nltk.corpus import stopwords
import re
import os, sys
import MySQLRepository
from gensim import utils
from gensim.models.doc2vec import Doc2Vec
import gensim.models.doc2vec
from gensim.models.doc2vec import LabeledSentence
from boto.emr.emrobject import KeyValue
def readAllFiles(path):
    dirs = os.listdir(path)
    for file in dirs:
        if os.path.isfile(path + "/" + file):
            prepareDoc2VecSetting(path + '/' + file)
        else:
            pf = path + "/" + file
            readAllFiles(pf)

def prepareDoc2VecSetting(fname):
    mapDocName_Id = []
    keyValues = set()
    with open(fname) as alldata:
        a = alldata.readlines()
        end = len(a)
        label = 0
        tokens = []
        for i in range(0, end):
            if a[i].startswith('clueweb09-en00'):
                mapDocName_Id.insert(label, a[i])
                label = label + 1
                alldocs.append(LabeledSentence(tokens[:], [label]))
                keyValues |= set(tokens)
                tokens = []
            else:
                tokens = tokens + a[i].split()
    mydb.insertkeyValueData(keyValues)
    mydb.insertDocId(mapDocName_Id)
mydb=MySQLRepository.MySQLRepository()
alldocs = []
pth='/home/flr/Desktop/newInput/tokens'
readAllFiles(pth)
model = Doc2Vec(alldocs, size = 300, window = 5, min_count = 2, workers = 4)
model.save(pth+'/my_model.doc2vec')
Second code (I left out the parts related to the DB):
import gensim
import os
from gensim.models.doc2vec import Doc2Vec
import gensim.models.doc2vec
from gensim.models.doc2vec import LabeledSentence
class prepareAllDocs(object):

    def __init__(self, top_dir):
        self.top_dir = top_dir

    def __iter__(self):
        mapDocName_Id = []
        label = 1
        for root, dirs, files in os.walk(self.top_dir):
            for fname in files:
                print fname
                inputs = []
                tokens = []
                with open(os.path.join(root, fname)) as f:
                    for i, line in enumerate(f):
                        if line.startswith('clueweb09-en00'):
                            mapDocName_Id.append(line)
                            if tokens:
                                yield LabeledSentence(tokens[:], [label])
                                label += 1
                            tokens = []
                        else:
                            tokens = tokens + line.split()
                    yield LabeledSentence(tokens[:], [label])
pth='/home/flashkar/Desktop/newInput/tokens/'
allDocs = prepareAllDocs('/home/flashkar/Desktop/newInput/tokens/')
for doc in allDocs:
    model = Doc2Vec(allDocs, size = 300, window = 5, min_count = 2, workers = 4)
model.save(pth + '/my_model.doc2vec')
This is the error:
Traceback (most recent call last):
  File "/home/flashkar/git/doc2vec_annoy/Doc2Vec_Annoy/KNN/testiterator.py", line 44, in <module>
    model = Doc2Vec(allDocs, size = 300, window = 5, min_count = 2, workers = 4)
  File "/home/flashkar/anaconda/lib/python2.7/site-packages/gensim/models/doc2vec.py", line 618, in __init__
    self.build_vocab(documents, trim_rule=trim_rule)
  File "/home/flashkar/anaconda/lib/python2.7/site-packages/gensim/models/word2vec.py", line 523, in build_vocab
    self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
  File "/home/flashkar/anaconda/lib/python2.7/site-packages/gensim/models/doc2vec.py", line 655, in scan_vocab
    for document_no, document in enumerate(documents):
  File "/home/flashkar/git/doc2vec_annoy/Doc2Vec_Annoy/KNN/testiterator.py", line 40, in __iter__
    yield LabeledSentence(tokens[:], tpl[1])
IndexError: list index out of range
You are using a generator function because you don't want to store all of your documents, but you are still storing all of them in alldocs. You can just yield LabeledSentence(tokens[:], tpl[1]) instead.
What is currently happening is that you are appending to a list and returning the list; that is why you are getting the error. Additionally, you append to the list on every iteration, which means that on iteration i you are returning document i together with all the documents that came before it!
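Something along these lines should work as a minimal sketch, assuming the two-line file format shown in the question and the old gensim API you are already importing (the class name StreamAllDocs and the label bookkeeping are illustrative; newer gensim versions replace LabeledSentence with TaggedDocument and size with vector_size):

import os
from gensim.models.doc2vec import Doc2Vec, LabeledSentence

class StreamAllDocs(object):
    """Iterable corpus: yields one LabeledSentence at a time instead of
    accumulating every document in a list."""

    def __init__(self, top_dir):
        self.top_dir = top_dir

    def __iter__(self):
        label = 0
        for root, dirs, files in os.walk(self.top_dir):
            for fname in files:
                tokens = []
                with open(os.path.join(root, fname)) as f:
                    for line in f:
                        if line.startswith('clueweb09-en00'):
                            # New document name: flush the previous document, if any.
                            if tokens:
                                yield LabeledSentence(tokens, [label])
                                label += 1
                            tokens = []
                        else:
                            tokens.extend(line.split())
                # Flush the last document of the file.
                if tokens:
                    yield LabeledSentence(tokens, [label])
                    label += 1

# Pass the iterable straight to Doc2Vec (no surrounding for-loop); gensim
# iterates over it once to build the vocabulary and again for each training
# pass, which is why it must be a re-iterable class, not a one-shot generator.
corpus = StreamAllDocs('/home/flashkar/Desktop/newInput/tokens/')
model = Doc2Vec(corpus, size=300, window=5, min_count=2, workers=4)
model.save('my_model.doc2vec')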