
Topic modelling error: too many values to unpack


I'm trying to perform LDA topic modelling with t-SNE and pyLDAvis as visualisations. However, after running LDA, I get the error "too many values to unpack" when extracting the dominant topics. The code and error are given below. Any help is highly appreciated.

Code for LdaMulticore topic modelling:

import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
def Make_String(text):
    return str(text)

#Reviews.columns=['Reviews']
#print(Reviews.head(10))

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Import Dataset
#df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
#df = df.loc[df.target_names.isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles']) , :]
df = pd.read_csv("/content/drive/My Drive/Negative_data.csv", encoding="ISO-8859-1")

# ensure every entry is a string before further processing
df['text'] = df['text'].apply(Make_String)

print(df.shape)  #> (2361, 3)
df.head()

    # Create Dictionary
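    # NOTE: data_ready (the tokenized, preprocessed documents) is assumed to come from an earlier preprocessing step not shown in the post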
    id2word = corpora.Dictionary(data_ready)
    from gensim.models import LdaMulticore
    
    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_ready]
    lda_model = LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=2, workers=2)
    
    pprint(lda_model.print_topics())
    #> [(0,
    #>   '0.017*"write" + 0.015*"people" + 0.014*"organization" + 0.014*"article" + '
    #>   '0.013*"time" + 0.008*"give" + 0.008*"first" + 0.007*"tell" + 0.007*"new" + '
    #>   '0.007*"question"'),
    #>  (1,
    #>   '0.008*"christian" + 0.008*"believe" + 0.007*"god" + 0.007*"law" + '
    #>   '0.006*"state" + 0.006*"israel" + 0.006*"israeli" + 0.005*"exist" + '
    #>   '0.005*"way" + 0.004*"bible"'),
    #>  (2,
    #>   '0.024*"armenian" + 0.012*"bike" + 0.006*"kill" + 0.006*"work" + '
    #>   '0.005*"well" + 0.005*"year" + 0.005*"sumgait" + 0.005*"soldier" + '
    #>   '0.004*"way" + 0.004*"ride"'),
    #>  (3,
    #>   '0.019*"team" + 0.019*"game" + 0.013*"hockey" + 0.010*"player" + '
    #>   '0.009*"play" + 0.009*"win" + 0.009*"nhl" + 0.009*"year" + 0.009*"hawk" + '
    #>   '0.009*"season"')]

Output:

    [(0,
      '0.340*"seriously" + 0.017*"time" + 0.015*"samsung" + 0.014*"day" + '
      '0.013*"phone" + 0.012*"order" + 0.012*"wait" + 0.011*"week" + 0.011*"damn" '
      '+ 0.011*"next"'),
     (1,
      '0.081*"puma" + 0.068*"shoe" + 0.046*"adida" + 0.017*"site" + 0.017*"como" + '
      '0.014*"wear" + 0.014*"ugly" + 0.011*"shirt" + 0.010*"era" + 0.009*"pumas"'),
     (2,
      '0.033*"watch" + 0.021*"hate" + 0.021*"wear" + 0.020*"shit" + 0.020*"buy" + '
      '0.016*"game" + 0.014*"man" + 0.014*"stop" + 0.014*"time" + 0.013*"still"'),
     (3,
      '0.037*"bad" + 0.014*"year" + 0.013*"pay" + 0.013*"feel" + 0.011*"thing" + '
      '0.011*"really" + 0.011*"last" + 0.011*"ever" + 0.009*"never" + '
      '0.009*"people"'),
     (4,
      '0.332*"com" + 0.173*"twitter" + 0.078*"pic" + 0.036*"status" + '
      '0.036*"https" + 0.029*"nintendo" + 0.015*"apple" + 0.008*"pue" + '
      '0.006*"photo" + 0.004*"iphone"'),
     (5,
      '0.162*"http" + 0.028*"pace" + 0.027*"low" + 0.019*"new" + 0.019*"price" + '
      '0.017*"crushed_km" + 0.017*"size" + 0.014*"video" + 0.012*"sale" + '
      '0.012*"dlvr"'),
     (6,
      '0.062*"nike" + 0.019*"phone" + 0.019*"drop" + 0.018*"work" + 0.013*"tell" + '
      '0.013*"hard" + 0.012*"call" + 0.011*"crazy" + 0.011*"lol" + 0.010*"ass"'),
     (7,
      '0.036*"sin" + 0.036*"die" + 0.024*"kill" + 0.018*"pero" + 0.012*"android" + '
      '0.012*"pro" + 0.009*"death" + 0.008*"igual" + 0.008*"final" + '
      '0.008*"problem"'),
     (8,
      '0.039*"black" + 0.036*"http" + 0.034*"netflix" + 0.020*"fire" + '
      '0.018*"dead" + 0.014*"son" + 0.013*"lose" + 0.011*"tv" + 0.011*"tinyurl" + '
      '0.010*"steal"'),
     (9,
      '0.299*"live" + 0.295*"alone" + 0.038*"seriously" + 0.013*"switch" + '
      '0.008*"mad" + 0.006*"screen" + 0.006*"wrong" + 0.006*"season" + '
      '0.005*"hour" + 0.005*"people"')]

Code for dominant topics:

# Sentence Coloring of N Sentences
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return(dominant_topics, topic_percentages)

dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)            

# Distribution of Dominant Topics in Each Document
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

# Total Topic Distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

# Top 3 Keywords for each Topic
topic_top3words = [(i, topic) for i, topics in lda_model.show_topics(formatted=False) 
                                 for j, (topic, wt) in enumerate(topics) if j < 3]

df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)  

Error:

<ipython-input-13-5ea2ada44643> in topics_per_document(model, corpus, start, end)
      5     topic_percentages = []
      6     for i, corp in enumerate(corpus_sel):
----> 7         topic_percs, wordid_topics, wordid_phivalues = model[corp]
      8         dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
      9         dominant_topics.append((i, dominant_topic))

ValueError: too many values to unpack (expected 3)

Many Thanks


Solution

  • model[corp] does not return the tuple (topic_percs, wordid_topics, wordid_phivalues) that your code expects. Instead, it returns the membership vector of corp, i.e. the probability, for each topic in your model, that corp was generated from that topic. Here corp is an individual document from corpus, since you are iterating over enumerate(corpus_sel), so you are asking for the membership vector of each selected document.

    This can be seen from the example given in the documentation (it is for the parent class LdaModel of LdaMulticore, but both return the same object):

    >>> from gensim.test.utils import common_corpus
    >>> from gensim.models.ldamodel import LdaModel
    >>> lda = LdaModel(common_corpus, num_topics=10, iterations=1)
    >>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
    >>> doc_lda = lda[doc_bow]
    >>> doc_lda
    [(0, 0.08579318),
     (1, 0.0858944),
     (2, 0.079572774),
     (3, 0.09752562),
     (4, 0.08426655),
     (5, 0.1231114),
     (6, 0.17063272),
     (7, 0.08766636),
     (8, 0.083353266),
     (9, 0.102183744)]
    

    It seems like you want to call model.get_document_topics(corp, per_word_topics=True) for each document bag of words (which you call corp) in corpus. With per_word_topics=True this returns the 3-tuple of the topic distribution for the whole document, the most probable topics per word, and the phi relevance values (multiplied by the feature length) for each word-topic combination.
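
    For example, a minimal sketch of that call, reusing corp from your loop (the per_word_topics=True flag is what makes it return three values):

    topic_percs, wordid_topics, wordid_phivalues = model.get_document_topics(
        corp, per_word_topics=True)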

    Otherwise, you could change

    topic_percs, wordid_topics, wordid_phivalues = model[corp]
    

    to

    topic_percs = model[corp]
    

    or, more clearly, to

    topic_percs = model.get_document_topics(corp)
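
    Putting that together, a corrected version of your topics_per_document could look like the sketch below. It keeps your variable and parameter names and only changes the unpacking; since the rest of the function only uses topic_percs, the other two values are not needed:

    def topics_per_document(model, corpus, start=0, end=1):
        corpus_sel = corpus[start:end]
        dominant_topics = []
        topic_percentages = []
        for i, corp in enumerate(corpus_sel):
            # membership vector: list of (topic_id, probability) pairs for this document
            topic_percs = model.get_document_topics(corp)
            # topic with the highest probability for this document
            dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
            dominant_topics.append((i, dominant_topic))
            topic_percentages.append(topic_percs)
        return (dominant_topics, topic_percentages)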
    

    If wordid_topics is supposed to be the probability of each word id in each topic, then you could call model.get_topic_terms(topicid) to get the (word_id, probability) pairs for the most relevant words generated by that topic, or model.get_topics() to get the full term-topic matrix.
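
    For illustration, a small sketch of those two calls using the lda_model from your code (topic id 0 is just an example):

    # (word_id, probability) pairs for the 10 most relevant words of topic 0
    print(lda_model.get_topic_terms(0, topn=10))

    # full term-topic matrix as a numpy array of shape (num_topics, vocabulary_size)
    print(lda_model.get_topics().shape)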