Search code examples
python, svd

singular value decomposition changing results


I am trying to perform text summarization using svds, but the resulting summary changes every time I run the function. Can someone please explain why this happens and suggest a solution? I also checked the individual arrays u, s and v, and they change on every run as well. How can I make them deterministic? The sentence matrix is computed as shown below, followed by the svds code. The dataset is a legal document from an Australian supreme court.

def _compute_matrix(sentences, weighting, norm):
    if weighting.lower() == 'binary':
        vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1), 
        binary=True, stop_words=None)
    elif weighting.lower() == 'frequency':
        vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1), 
        binary=False, stop_words=None)
    elif weighting.lower() == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1), 
        stop_words=None)
    else:
        raise ValueError('Parameter "method" must take one of the values 
        "binary", "frequency" or "tfidf".')

    # Extract word features from sentences using sparse vectorizer
    frequency_matrix = vectorizer.fit_transform(sentences).astype(float)

    terms = vectorizer.get_feature_names()

    if norm in ('l1', 'l2'):
        frequency_matrix = normalize(frequency_matrix, norm=norm, axis=1)
    elif norm is not None:
        raise ValueError('Parameter "norm" can only take values "l1", "l2" 
        or None')

    return frequency_matrix, terms

# Latent-semantic-analysis style summary: decompose the term/sentence matrix
# with a truncated SVD and keep the sentences with the highest saliency.
processed_sentences = _createsentences(raw_content)
sentence_matrix, feature_names = _compute_matrix(
    processed_sentences, weighting='tfidf', norm='l2')
# Transpose so that columns correspond to sentences (the rows of v below
# are then indexed by sentence).
sentence_matrix = sentence_matrix.transpose()
# Keep only strictly positive entries; everything else becomes zero.
sentence_matrix = sentence_matrix.multiply(sentence_matrix > 0)
print(sentence_matrix.shape)

topic_sigma_threshold = 0.5
# Validate before the decomposition. The original guard
# `1 <= topic_sigma_threshold < 0` could never be true.
if not 0 <= topic_sigma_threshold <= 1:
    raise ValueError('Parameter topic_sigma_threshold must take a value '
                     'between 0 and 1')

# NOTE: without an explicit v0 the solver starts from a random vector, so
# u, s and v can differ between runs.
u, s, v = svds(sentence_matrix, k=20)
topic_averages = v.mean(axis=1)

# Within each topic (row of v), zero out sentences at or below the topic mean.
for topic_ndx, topic_avg in enumerate(topic_averages):
    v[topic_ndx, v[topic_ndx, :] <= topic_avg] = 0

# Discard topics whose singular value is small relative to the largest one.
sigma_threshold = max(s) * topic_sigma_threshold
s[s < sigma_threshold] = 0

# Per-sentence saliency: squared singular values weighting squared loadings.
saliency_vec = np.dot(np.square(s), np.square(v))

# Take the 25 most salient sentences, then restore document order.
top_sentences = saliency_vec.argsort()[-25:][::-1]
top_sentences.sort()

[processed_sentences[i] for i in top_sentences]

Solution

  • I found a solution by experimenting with the parameters of svds and reading its source code. By default, svds starts its iteration from a random initial vector whose length is taken from the dimensions of the sparse matrix, which is why the results differ between runs. To make the results reproducible, pass a fixed starting vector through the v0 parameter, as shown in the code below.

    # Draw the starting vector from a private, fixed-seed generator so the
    # decomposition is reproducible without reseeding NumPy's global RNG.
    rng = np.random.RandomState(0)
    v0 = rng.rand(min(sentence_matrix.shape))

    u, s, v = svds(sentence_matrix, k=20, v0=v0)