Please help me understand the difference between how TaggedDocument and LabeledSentence in gensim work. My ultimate goal is text classification using a Doc2Vec model and any classifier. I am following this blog!
import os
import random
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from nltk.corpus import stopwords

class MyLabeledSentences(object):
    def __init__(self, dirname, dataDct={}, sentList=[]):
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as fin:
                for item_no, sentence in enumerate(fin):
                    # keep non-stopword tokens; tag each line as '<filename>_<line number>'
                    self.sentList.append(LabeledSentence(
                        [w for w in sentence.lower().split() if w not in stopwords.words('english')],
                        [fname.split('.')[0].strip() + '_%s' % item_no]))
        return self.sentList
class MyTaggedDocument(object):
    def __init__(self, dirname, dataDct={}, sentList=[]):
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as fin:
                for item_no, sentence in enumerate(fin):
                    # identical to the class above, except it builds TaggedDocument objects
                    self.sentList.append(TaggedDocument(
                        [w for w in sentence.lower().split() if w not in stopwords.words('english')],
                        [fname.split('.')[0].strip() + '_%s' % item_no]))
        return self.sentList
sentences = MyLabeledSentences(some_dir_name)
model_l = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_l = sentences.ToArray()
model_l.build_vocab(sentences_l)
for epoch in range(15):
    random.shuffle(sentences_l)
    model_l.train(sentences_l)
    model_l.alpha -= 0.002  # decrease the learning rate
    model_l.min_alpha = model_l.alpha
sentences = MyTaggedDocument(some_dir_name)
model_t = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_t = sentences.ToArray()
model_t.build_vocab(sentences_t)
for epoch in range(15):
    random.shuffle(sentences_t)
    model_t.train(sentences_t)
    model_t.alpha -= 0.002  # decrease the learning rate
    model_t.min_alpha = model_t.alpha
My question is: is model_l.docvecs['some_word'] the same as model_t.docvecs['some_word']?
Can you provide me a web link to good sources to get a grasp on how TaggedDocument or LabeledSentence works?
LabeledSentence is an older, deprecated name for the same simple object-type, used to encapsulate a text example, that is now called TaggedDocument. Any objects that have words and tags properties, each a list, will do. (words is always a list of strings; tags can be a mix of integers and strings, but in the common and most-efficient case is just a list with a single id integer, starting at 0.)
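
For example, here is a minimal sketch (the tokens and tag ids are made up for illustration) of two ways to build such objects: with gensim's TaggedDocument, and with a plain namedtuple that also exposes words and tags:

from collections import namedtuple
from gensim.models.doc2vec import TaggedDocument

# A TaggedDocument is just words (a list of strings) plus tags (usually one id)
doc0 = TaggedDocument(words=['some', 'sample', 'text'], tags=[0])
doc1 = TaggedDocument(words=['another', 'short', 'document'], tags=[1])

# Any object exposing list-valued 'words' and 'tags' attributes works the same way
MyDoc = namedtuple('MyDoc', ['words', 'tags'])
doc2 = MyDoc(words=['yet', 'more', 'text'], tags=[2])

corpus = [doc0, doc1, doc2]  # can then be fed to build_vocab()/train()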
model_l and model_t will serve the same purposes, having trained on the same data with the same parameters, using just different names for the objects. But the vectors they'll return for individual word-tokens (model['some_word']) or document-tags (model.docvecs['somefilename_NN']) will likely be different – there's randomness in Word2Vec/Doc2Vec initialization and training-sampling, plus ordering-jitter introduced by multithreaded training.
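
As a quick illustration (the tag name below just follows your filename_lineno scheme, so treat it as a placeholder), you can check that the raw vectors from the two separately-trained models won't match element-for-element, even though each model's internal neighbor relationships are usually comparable. For more repeatable single runs you can also fix seed and use a single worker:

import numpy as np

tag = 'somefilename_0'  # placeholder tag following the question's naming scheme

v_l = model_l.docvecs[tag]
v_t = model_t.docvecs[tag]

# Two independently initialized/trained models end up in different coordinate
# spaces, so the raw per-tag vectors will not be identical
print(np.allclose(v_l, v_t))  # almost certainly False

# Relative relationships within each model are usually similar, though
print(model_l.docvecs.most_similar(tag, topn=3))
print(model_t.docvecs.most_similar(tag, topn=3))

# For more repeatable runs, fix the seed and train single-threaded
# (full determinism also requires a fixed PYTHONHASHSEED):
# model = Doc2Vec(seed=42, workers=1, min_count=1, window=10, size=300)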