A practical example of GSDMM in Python?

I want to use GSDMM to assign topics to some tweets in my data set. The only examples I found (1 and 2) are not detailed enough. Do you know of a source (or would you be willing to write a small example) that shows how to use GSDMM in Python?

Solution

I finally put together my GSDMM code and am posting it here in full for others to use. I have tried to comment the important parts:

# Imports
import random

import numpy as np
from gensim.models.phrases import Phraser, Phrases
from gensim.utils import simple_preprocess
from gsdmm import MovieGroupProcess


# Input corpus: an iterable of raw document strings (placeholder, fill in)
data = ...

# Stop-word list (placeholder, fill in); it must be a list because it is
# extended in place further down
stop_words = ...

# Split every document on whitespace into a list of word tokens
data_words = [doc.split() for doc in data]

# Vocabulary (placeholder, fill in)
vocabulary = ...

# Extend the stop-word list with two extra tokens before filtering
stop_words.extend(['from', 'rt'])

def remove_stopwords(texts):
    """Tokenise each document and drop every stop word.

    Each item of ``texts`` is converted with ``str()``, tokenised with
    gensim's ``simple_preprocess`` and filtered against the module-level
    ``stop_words`` collection.

    Args:
        texts: Iterable of documents; ``str()`` is applied to each item.

    Returns:
        A list containing, for every input document, the list of tokens
        that are not stop words (token order is preserved).
    """
    # Snapshot the stop words into a set once per call so each membership
    # test is O(1) instead of a linear scan of the list for every token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

# NOTE(review): `vocabulary` is a placeholder in this snippet; both calls below
# iterate over it as a collection of documents -- confirm what it should hold.
data_words_nostops = remove_stopwords(vocabulary)

# building bi-grams
bigram = Phrases(vocabulary, min_count=5, threshold=100)
bigram_mod = Phraser(bigram)
print('done!')

# Form Bigrams
data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

# lemmatization: keep only the lemmas of tokens whose part of speech is listed
# NOTE(review): `nlp` is not defined anywhere in the visible source; it is
# presumably a loaded spaCy pipeline (tokens expose `.lemma_` / `.pos_`) --
# confirm and create it before this point.
pos_to_use = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = []
for sent in data_words_bigrams:
    doc = nlp(" ".join(sent))
    data_lemmatized.append(
        [token.lemma_ for token in doc if token.pos_ in pos_to_use]
    )

docs = data_lemmatized
# Distinct terms across all lemmatized documents (computed once)
vocab = {x for doc in docs for x in doc}

# Train a new model
# gsdmm draws its samples through NumPy's global random state, so seeding only
# the stdlib `random` module does not make runs reproducible; seed both.
random.seed(1000)
np.random.seed(1000)
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)

n_terms = len(vocab)
n_docs = len(docs)

# Fit the model on the data given the chosen seeds
# (`y` is whatever `fit` returns -- presumably one cluster label per document)
y = mgp.fit(docs, n_terms)

def top_words(cluster_word_distribution, top_cluster, values):
    """Print the most frequent words of each selected cluster.

    Args:
        cluster_word_distribution: Container indexed by cluster id whose
            items are ``{word: count}`` dictionaries.
        top_cluster: Iterable of cluster ids to report, in print order.
        values: Maximum number of (word, count) pairs shown per cluster.

    Returns:
        None. One line with the ranked pairs and one separator line are
        printed for every cluster in ``top_cluster``.
    """
    for cluster in top_cluster:
        # Read from the distribution that was passed in, not from a global
        # model object, so the function works for any fitted model.
        word_counts = cluster_word_distribution[cluster]
        # Highest counts first; ties keep their original insertion order.
        ranked = sorted(
            word_counts.items(),
            key=lambda item: item[1],
            reverse=True,
        )
        print('Cluster %s : %s' % (cluster, ranked[:values]))
        print(' — — — — — — — — — ')

# Per-cluster document counts taken from the fitted model
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*' * 20)

# Cluster indices ordered from most to fewest documents (at most ten)
top_index = np.argsort(doc_count)[::-1][:10]
print('Most important clusters (by number of docs inside):', top_index)
print('*' * 20)


# For each of those clusters, list its ten most frequent words
top_words(mgp.cluster_word_distribution, top_index, 10)

Links