Search code examples
python dimensionality-reduction

How can I use t-SNE for dimensionality reduction to visualise my 300-dimensional word embeddings?


I am currently trying to visualize 300-dimensional word vectors in 2D. I tried t-SNE with different parameters and read the blog post at https://distill.pub/2016/misread-tsne/, but so far I have got no useful results.

I want a visualisation that reflects the nearest neighbors of a few selected word vectors, but the 2D visualisation is all over the place.

Is t-SNE unsuitable for my problem?

from sklearn.manifold import TSNE

# One vector per category: element 0 of each category's entry
# (presumably each entry is a one-row 2-D array -- TODO confirm).
arr = [category_embeddings[category][0] for category in category_embeddings]

# t-SNE hyper-parameters.
perplex = 30
tsne_steps = 50000
lr = 10

fig_tsne = plt.figure(figsize=(18, 18), dpi=800)

# t-SNE is run directly on the raw vectors here (no precomputed distances).
tsne = TSNE(
    perplexity=perplex,
    n_components=2,
    init='pca',
    n_iter=tsne_steps,
    learning_rate=lr,
    method="exact",
)

plot_only = len(category_embeddings.keys())
low_dim_embs = tsne.fit_transform(np.asarray(arr))

# Shared placement options for every point label.
label_style = dict(
    xytext=(5, 2),
    textcoords='offset points',
    ha='right',
    va='bottom',
)

# Rows of low_dim_embs follow the same key order that was used to build arr,
# so keys and rows can be walked in lockstep.
for title, (x, y) in zip(category_embeddings, low_dim_embs):
    plt.scatter(x, y)
    plt.annotate(title, xy=(x, y), **label_style)

Solution

  • Okay, solved.

    Creating a cosine distance matrix and passing it to TSNE (with metric="precomputed") results in a much better 2D visualisation.

    from sklearn.metrics.pairwise import cosine_distances
    
    # Pairwise cosine distances between all categories, stored as a nested
    # dict: c1_c2_cos_dist[c1][c2] is the distance between the embeddings of
    # category c1 and category c2.
    c1_c2_cos_dist = {}

    # Create distance matrix
    for c1 in category_embeddings:
        # cosine_distances returns a 2-D array; [0][0] picks its first entry
        # (assumes each embedding is a one-row 2-D array -- TODO confirm).
        # A fresh dict is built per row, so no explicit copy is needed.
        c1_c2_cos_dist[c1] = {
            c2: cosine_distances(category_embeddings[c1],
                                 category_embeddings[c2])[0][0]
            for c2 in category_embeddings
        }
    
    # --- 
    
    from sklearn.manifold import TSNE
    
    # t-SNE hyper-parameters.
    perplex = 30
    tsne_steps = 50000
    lr = 10

    fig_tsne = plt.figure(figsize=(18, 18), dpi=800)

    # metric="precomputed" makes fit_transform treat its input as a square
    # matrix of pairwise distances instead of feature vectors.
    # init is set to "random" explicitly: PCA initialisation cannot be used
    # with precomputed distances, and scikit-learn versions whose default
    # init is "pca" raise a ValueError without it.
    # NOTE(review): recent scikit-learn releases rename n_iter to max_iter --
    # confirm against the installed version.
    tsne = TSNE(perplexity=perplex,
                n_components=2,
                metric="precomputed",
                init="random",
                n_iter=tsne_steps,
                learning_rate=lr)

    # Flatten the nested distance dict into a square matrix: one row per outer
    # key, one column per inner key, both in dict iteration order.
    distMatrix = np.asarray(
        [list(distances.values()) for distances in c1_c2_cos_dist.values()]
    )
    low_dim_embs = tsne.fit_transform(distMatrix)

    # Not used anywhere else in this snippet; kept in case later code reads it.
    plot_only = len(category_embeddings.keys())

    # Row i of low_dim_embs is labelled with the i-th key of
    # category_embeddings (assumes c1_c2_cos_dist was built by iterating
    # category_embeddings in the same order -- TODO confirm).
    for i, title in enumerate(category_embeddings.keys()):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(
            title,
            xy=(x, y),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom')