Search code examples
pythonfunctionmatplotliblabelpca

How do I change the "str" labels in a function to "int" and return a plt.legend() that concatenates the two labels in an "int, str" format?


I have a function that allows me to display the circle of correlations of my pca.

The problem with this function is that the labels of my variables (column names) prevent me from reading my results correctly. To overcome this problem, I have to insert a line of code before my function to associate numbers with the labels of the variables (of the df used to make my pca):

n_labels = [value for value in range(1, (len(df.columns) + 1))]

I tried unsuccessfully to insert this line in my function:

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd


def display_circles(pcs,
                    n_comp,
                    pca,
                    axis_ranks,
                    labels=None,
                    label_rotation=0,
                    lims=None):
    """Draw one correlation circle per requested factorial plane.

    A new figure is created and shown with ``plt.show()`` for every
    ``(d1, d2)`` pair of ``axis_ranks`` whose ``d2`` is below ``n_comp``;
    the other pairs are skipped. Nothing is returned.

    Args:
        pcs: 2-D array indexed as ``pcs[component, variable]``
            (presumably ``pca.components_`` -- confirm with the caller).
        n_comp: exclusive upper bound on the second axis index of a plane.
        pca: object exposing ``explained_variance_ratio_``; used only for
            the axis titles.
        axis_ranks: iterable of ``(d1, d2)`` component index pairs.
        labels: optional sequence indexed by variable position; each entry
            is written at the coordinates of its variable when these fall
            inside the plot limits.
        label_rotation: rotation forwarded to ``plt.text``.
        lims: optional ``(xmin, xmax, ymin, ymax)`` overriding the
            automatic limits.
    """
    for d1, d2 in axis_ranks:
        # Planes whose second axis is not below n_comp are not drawn.
        if d2 >= n_comp:
            continue

        # One fresh figure per plane.
        _fig, ax = plt.subplots(figsize=(10, 8))

        n_vars = pcs.shape[1]
        # Arrowheads are drawn only for fewer than 30 variables.
        draw_arrowheads = n_vars < 30

        # Plot limits: explicit limits first, then the unit square for small
        # plots, otherwise the extent of the two components.
        if lims is not None:
            xmin, xmax, ymin, ymax = lims
        elif draw_arrowheads:
            xmin, xmax, ymin, ymax = -1, 1, -1, 1
        else:
            xmin = min(pcs[d1, :])
            xmax = max(pcs[d1, :])
            ymin = min(pcs[d2, :])
            ymax = max(pcs[d2, :])

        if draw_arrowheads:
            # Every arrow starts at the origin.
            origins = np.zeros(n_vars)
            plt.quiver(origins,
                       origins,
                       pcs[d1, :],
                       pcs[d2, :],
                       angles='xy',
                       scale_units='xy',
                       scale=1,
                       color="grey")
        else:
            # With 30 variables or more, plain faint segments replace arrows.
            segments = [[[0, 0], [x, y]] for x, y in pcs[[d1, d2]].T]
            ax.add_collection(
                LineCollection(segments, axes=ax, alpha=.1, color='black'))

        # Variable labels, restricted to points lying inside the limits.
        if labels is not None:
            for idx, (x, y) in enumerate(pcs[[d1, d2]].T):
                if xmin <= x <= xmax and ymin <= y <= ymax:
                    plt.text(x,
                             y,
                             labels[idx],
                             fontsize='22',
                             ha='center',
                             va='bottom',
                             rotation=label_rotation,
                             color="red",
                             alpha=0.7)

        # Unit circle.
        plt.gca().add_artist(
            plt.Circle((0, 0), 1, facecolor='none', edgecolor='b'))

        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)

        # Dashed horizontal and vertical axes through the origin.
        plt.plot([-1, 1], [0, 0], color='grey', ls='--')
        plt.plot([0, 0], [-1, 1], color='grey', ls='--')

        # Axis titles carry the percentage of inertia explained.
        inertia = pca.explained_variance_ratio_
        x_title = 'F{} ({}%)'.format(d1 + 1, round(100 * inertia[d1], 1))
        y_title = 'F{} ({}%)'.format(d2 + 1, round(100 * inertia[d2], 1))
        plt.xlabel(x_title, fontsize=14)
        plt.ylabel(y_title, fontsize=14)

        plt.title("Circle of correlations (F{} and F{})".format(
            d1 + 1, d2 + 1),
                  size=24)
        plt.show()

This is how I call my function:

import pandas as pd
from sklearn import decomposition, preprocessing


# Example dataset: eight numeric columns of eight observations each.
column_1 = [1, 2, 3, 4, 5, 6, 7, 8]
column_2 = [4, 2, 9, 23, 3, 52, 41, 4]
column_3 = [9, 8, 7, 6, 6, 9, 24, 11]
column_4 = [45, 36, 74, 35, 29, 45, 29, 39]
column_5 = [35, 84, 3, 54, 68, 78, 65, 97]
column_6 = [24, 96, 7, 54, 67, 69, 88, 95]
column_7 = [5, 39, 72, 42, 22, 41, 24, 41]
column_8 = [30, 98, 8, 67, 68, 41, 27, 87]

df = pd.DataFrame(
    dict(
        column_1=column_1,
        column_2=column_2,
        column_3=column_3,
        column_4=column_4,
        column_5=column_5,
        column_6=column_6,
        column_7=column_7,
        column_8=column_8,
    ))

# Scale the data, then fit a PCA that keeps all eight components.
pca_data = preprocessing.scale(df)
pca = decomposition.PCA(n_components=8)
pca.fit(pca_data)

# Number of components considered when choosing the planes to draw.
n_comp = 2
# Components of the fitted PCA object.
pcs = pca.components_
# One integer label per column, starting at 1.
n_labels = list(range(1, len(df.columns) + 1))
display_circles(pcs, n_comp, pca, [(0, 1), (0, 2)], labels=n_labels)

# Print the (number, column name) correspondence.
for pair in zip(n_labels, df.columns):
    print(pair)

Here is my obtained result: Circle of correlations

Edit 1: what I would like (UPD: after applying the answer from @Stef — thank you very much, and congratulations on this solution)

it's almost perfect but the problem is when I use this function:

n_comp = 3
pcs = pca.components_
# I always have to write this line to get a label number
n_labels=[value for value in range(1,(len(list_candidates.columns)+1))]
display_circles(pcs, n_comp, pca, [(0, 1), (0, 2)], labels=n_labels)

on my real dataframe, I run into two problems:

  1. I still have to include the line
n_labels=[value for value in range(1,(len(list_candidates.columns)+1))]

to obtain a label number instead of the name of my variables.

  2. I get the error message "NameError: name 'df' is not defined" when running
display_circles(pcs, n_comp, pca, [(0, 1), (0, 2)], labels=n_labels)

So I'm looking to define my display_circles() function so that when I set the labels="name_of_the_df" argument it returns me the same result as

n_labels=[value for value in range(1,(len("name_of_the_df".columns)+1))]

plus a plt.legend() like the one made by @Stef (thanks)

To get this (desired) result:

Circle of correlations that i want

I also have to modify "name_of_the_df" in the function definition:

   #legend
   plt.legend(n_labels,
              candidate_list.columns,
              handler_map={int: IntHandler()},
              bbox_to_anchor=(1, 1))

Solution

  • You can define your own legend handler for integers:

    from matplotlib.text import Text
    
    class IntHandler:
        """Legend handler that renders a handle as red text."""

        def legend_artist(self, legend, orig_handle, fontsize, handlebox):
            """Add ``str(orig_handle)`` to ``handlebox`` as red text.

            The text is anchored at the handle box's
            ``(xdescent, ydescent)`` position; ``legend`` and ``fontsize``
            are accepted but not used. Returns the created ``Text`` artist.
            """
            label = Text(handlebox.xdescent,
                         handlebox.ydescent,
                         str(orig_handle),
                         color='red')
            handlebox.add_artist(label)
            return label
    

    and then call

    plt.legend(n_labels, df.columns, handler_map={int: IntHandler()}, bbox_to_anchor=(1,1))
    

    before plt.show() in display_circles:

    enter image description here


    Full example as per comment below and edited question:

    import matplotlib.pyplot as plt
    from matplotlib.collections import LineCollection
    from matplotlib.text import Text
    import numpy as np
    import pandas as pd
    from sklearn import decomposition, preprocessing
    
    class IntHandler:
        """Legend handler that renders a handle as red text."""

        def legend_artist(self, legend, orig_handle, fontsize, handlebox):
            """Add ``str(orig_handle)`` to ``handlebox`` as red text.

            The text is anchored at the handle box's
            ``(xdescent, ydescent)`` position; ``legend`` and ``fontsize``
            are accepted but not used. Returns the created ``Text`` artist.
            """
            label = Text(handlebox.xdescent,
                         handlebox.ydescent,
                         str(orig_handle),
                         color='red')
            handlebox.add_artist(label)
            return label
    
    def display_circles(pcs,
                        n_comp,
                        pca,
                        axis_ranks,
                        labels=None,
                        label_rotation=0,
                        lims=None):
        """Draw one correlation circle, with a legend, per factorial plane.

        A new figure is created and shown with ``plt.show()`` for every
        ``(d1, d2)`` pair of ``axis_ranks`` whose ``d2`` is below ``n_comp``;
        the other pairs are skipped. Nothing is returned.

        Args:
            pcs: 2-D array indexed as ``pcs[component, variable]``
                (presumably ``pca.components_`` -- confirm with the caller).
            n_comp: exclusive upper bound on the second axis index of a plane.
            pca: object exposing ``explained_variance_ratio_`` (axis titles)
                and, optionally, ``feature_names_in_`` (legend entries).
            axis_ranks: iterable of ``(d1, d2)`` component index pairs.
            labels: optional sequence indexed by variable position. When
                omitted, the variables are numbered 1..``pcs.shape[1]``.
            label_rotation: rotation forwarded to ``plt.text``.
            lims: optional ``(xmin, xmax, ymin, ymax)`` overriding the
                automatic limits.

        The legend maps each label to the matching entry of
        ``pca.feature_names_in_``, drawing ``int`` labels through
        ``IntHandler``. It is skipped when ``pca`` has no
        ``feature_names_in_`` attribute.
        """
        # Identity test on purpose: ``labels == None`` is evaluated
        # element-wise for NumPy arrays and pandas indexes, which makes the
        # ``if`` raise ValueError.
        if labels is None:
            # One number per variable (column of ``pcs``), starting at 1.
            labels = range(1, pcs.shape[1] + 1)

        # scikit-learn defines this attribute only when the estimator was
        # fitted on data whose feature names are all strings; without it
        # there are no names to show in the legend.
        feature_names = getattr(pca, "feature_names_in_", None)

        for d1, d2 in axis_ranks:
            # Planes whose second axis is not below n_comp are not drawn.
            if d2 >= n_comp:
                continue

            # figure initialization
            fig, ax = plt.subplots(figsize=(10, 8))

            # determination of graph limits
            if lims is not None:
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30:
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else:
                xmin, xmax, ymin, ymax = min(pcs[d1, :]), max(pcs[d1, :]), min(
                    pcs[d2, :]), max(pcs[d2, :])

            # arrow display
            # with 30 arrows or more, the triangle is not displayed at their end
            if pcs.shape[1] < 30:
                plt.quiver(np.zeros(pcs.shape[1]),
                           np.zeros(pcs.shape[1]),
                           pcs[d1, :],
                           pcs[d2, :],
                           angles='xy',
                           scale_units='xy',
                           scale=1,
                           color="grey")
            else:
                lines = [[[0, 0], [x, y]] for x, y in pcs[[d1, d2]].T]
                ax.add_collection(
                    LineCollection(lines, axes=ax, alpha=.1, color='black'))

            # display of variable labels (always set at this point),
            # restricted to points lying inside the graph limits
            for i, (x, y) in enumerate(pcs[[d1, d2]].T):
                if xmin <= x <= xmax and ymin <= y <= ymax:
                    plt.text(x,
                             y,
                             labels[i],
                             fontsize='22',
                             ha='center',
                             va='bottom',
                             rotation=label_rotation,
                             color="red",
                             alpha=0.7)

            # circle display
            circle = plt.Circle((0, 0), 1, facecolor='none', edgecolor='b')
            plt.gca().add_artist(circle)

            # setting graph limits
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)

            # display of horizontal and vertical lines
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # names of the axes, with the percentage of inertia explained
            plt.xlabel('F{} ({}%)'.format(
                d1 + 1, round(100 * pca.explained_variance_ratio_[d1], 1)),
                       fontsize=14)
            plt.ylabel('F{} ({}%)'.format(
                d2 + 1, round(100 * pca.explained_variance_ratio_[d2], 1)),
                       fontsize=14)

            plt.title("Circle of correlations (F{} and F{})".format(
                d1 + 1, d2 + 1),
                      size=24)

            # legend: "label -> feature name", placed outside the axes
            if feature_names is not None:
                plt.legend(labels,
                           feature_names,
                           handler_map={int: IntHandler()},
                           bbox_to_anchor=(1, 1))

            plt.show()
    
    
    
    # Example dataset: eight numeric columns of eight observations each.
    column_1 = [1, 2, 3, 4, 5, 6, 7, 8]
    column_2 = [4, 2, 9, 23, 3, 52, 41, 4]
    column_3 = [9, 8, 7, 6, 6, 9, 24, 11]
    column_4 = [45, 36, 74, 35, 29, 45, 29, 39]
    column_5 = [35, 84, 3, 54, 68, 78, 65, 97]
    column_6 = [24, 96, 7, 54, 67, 69, 88, 95]
    column_7 = [5, 39, 72, 42, 22, 41, 24, 41]
    column_8 = [30, 98, 8, 67, 68, 41, 27, 87]

    df = pd.DataFrame(
        dict(
            column_1=column_1,
            column_2=column_2,
            column_3=column_3,
            column_4=column_4,
            column_5=column_5,
            column_6=column_6,
            column_7=column_7,
            column_8=column_8,
        ))

    # Scale the data, then fit the PCA on a DataFrame that carries the
    # original column names.
    pca_data = preprocessing.scale(df)
    scaled_df = pd.DataFrame(pca_data, columns=df.columns)

    pca = decomposition.PCA(n_components=8)
    pca.fit(scaled_df)

    # Number of components considered when choosing the planes to draw.
    n_comp = 2
    # Components of the fitted PCA object.
    pcs = pca.components_

    display_circles(pcs, n_comp, pca, [(0, 1), (0, 2)])