Search code examples
pythonpandasnetworkxtorch

Expected index while trying to use Label propagation


I am trying to turn my edge labels into node labels, in order to predict unlabeled nodes. Currently the dataset has edge_labels but I would need to have each node (ID) getting exactly one node_label:

The code I am using is the following:

import networkx as nx
import pandas as pd

data = {'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
            'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
            'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
            'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0}}
    
df = pd.DataFrame.from_dict(data)
    
G = nx.from_pandas_edgelist(df, source='ID', target='Target', edge_attr=['Weight', 'Label']) 
    
width = [d['Weight'] for (u, v, d) in G.edges(data=True)]
edge_color = [d['Label'] for (u, v, d) in G.edges(data=True)]
nx.draw_networkx(G, width=width, edge_color=edge_color)

This should return unique node_labels

df_to_use=df.drop_duplicates(['ID'])
df_to_use=df_to_use[['ID','Label']]
adj_matrix = nx.adjacency_matrix(G).toarray()

Building adjacency matrix

adj_matrix_t = torch.FloatTensor(adj_matrix)
labels_t = torch.LongTensor(df['Label'].tolist())
adj_matrix_t.shape

Using label propagation

label_propagation = LabelPropagation(adj_matrix_t)
print("Label Propagation: ", end="")
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()

The last step gives me the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-81-cf4f88a4bb12> in <module>
      2 label_propagation = LabelPropagation(adj_matrix_t)
      3 print("Label Propagation: ", end="")
----> 4 label_propagation.fit(labels_t)
      5 label_propagation_output_labels = label_propagation.predict_classes()
      6 

<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
    100 
    101     def fit(self, labels, max_iter=1000, tol=1e-3):
--> 102         super().fit(labels, max_iter, tol)
    103 
    104 ## Label spreading

<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
     58             Convergence tolerance: threshold to consider the system at steady state.
     59         """
---> 60         self._one_hot_encode(labels)
     61 
     62         self.predictions = self.one_hot_labels.clone()

<ipython-input-1-54a7dbc30bd1> in _one_hot_encode(self, labels)
     42         labels[unlabeled_mask] = 0
     43         self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
---> 44         self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
     45         self.one_hot_labels[unlabeled_mask, 0] = 0
     46 

RuntimeError: Expected index [8, 1] to be smaller than self [7, 2] apart from dimension 1 and to be smaller size than src [7, 2]

Do you know how I can fix it?


Solution

  • You have nodes that are only appearing in the Target column, so you need to incorporate that column when finding all unique nodes. I did this by concatenating the two columns (along with Label), grouping by node ID while summing the Label values, and then replacing summed labels with 1 if the sum was > 0:

    import networkx as nx
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    
    data = {'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
                'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
                'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
                'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0}}
        
    df = pd.DataFrame.from_dict(data)
        
    G = nx.from_pandas_edgelist(df, source='ID', target='Target', edge_attr=['Weight', 'Label']) 
        
    width = [10 * d['Weight'] for (u, v, d) in G.edges(data=True)]
    edge_color = [d['Label'] for (u, v, d) in G.edges(data=True)]
    
    df1 = df[['ID', 'Label']].rename(columns={'ID':'node'})
    df2 = df[['Target', 'Label']].rename(columns={'Target':'node'})
    df_to_use = pd.concat([df1, df2]).groupby('node').sum().reset_index()
    df_to_use['Label'] = df_to_use['Label'].apply(lambda x: 1 if x > 0 else 0)
    
    print(df_to_use)
    

    which gives

       node  Label
    0     1      1
    1     2      0
    2     4      1
    3    12      1
    4    13      1
    5    17      0
    6    24      0
    

    Couldn't help myself and had to try the scheme too see how it worked:

    node_labels = np.array([df_to_use[df_to_use['node'] == node]['Label'].item() for node in G.nodes()])
    
    idx = np.random.choice(range(len(node_labels)))
    node_labels_missing = node_labels.copy()
    node_labels_missing[idx] = -1
    
    adj_matrix_t = torch.FloatTensor(adj_matrix)
    labels_t = torch.LongTensor(node_labels_missing)
    
    label_propagation = LabelPropagation(adj_matrix_t)
    print("Label Propagation: ", end="")
    label_propagation.fit(labels_t)
    label_propagation_output_labels = label_propagation.predict_classes()
    
    pos = nx.spring_layout(G)
    
    fig = plt.figure(1, figsize=(15, 4)); plt.clf()
    fig, ax = plt.subplots(1, 3, num=1)
    
    ax[0].set_title("Actual Labels")
    ax[1].set_title("One Label Removed")
    ax[2].set_title("With Predicted Label")
    
    ax1 = nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=node_labels, ax=ax[0])
    ax2 = nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=[c if c in (0, 1) else 0.5 for c in labels_t], ax=ax[1])
    ax3 = nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=label_propagation_output_labels, ax=ax[2])
    

    which gives

    node_prediction


    NOTE: For readers without context, this user is trying to reimplement this code. The below custom class definitions must be executed before the code above will run.

    from abc import abstractmethod
    import torch
    
    class BaseLabelPropagation:
        """Base class for label propagation models.
        
        Parameters
        ----------
        adj_matrix: torch.FloatTensor
            Adjacency matrix of the graph.
        """
        def __init__(self, adj_matrix):
            self.norm_adj_matrix = self._normalize(adj_matrix)
            self.n_nodes = adj_matrix.size(0)
            self.one_hot_labels = None 
            self.n_classes = None
            self.labeled_mask = None
            self.predictions = None
    
        @staticmethod
        @abstractmethod
        def _normalize(adj_matrix):
            raise NotImplementedError("_normalize must be implemented")
    
        @abstractmethod
        def _propagate(self):
            raise NotImplementedError("_propagate must be implemented")
    
        def _one_hot_encode(self, labels):
            # Get the number of classes
            classes = torch.unique(labels)
            classes = classes[classes != -1]
            self.n_classes = classes.size(0)
    
            # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
            unlabeled_mask = (labels == -1)
            labels = labels.clone()  # defensive copying
            labels[unlabeled_mask] = 0
            self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
            self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
            self.one_hot_labels[unlabeled_mask, 0] = 0
    
            self.labeled_mask = ~unlabeled_mask
    
        def fit(self, labels, max_iter, tol):
            """Fits a semi-supervised learning label propagation model.
            
            labels: torch.LongTensor
                Tensor of size n_nodes indicating the class number of each node.
                Unlabeled nodes are denoted with -1.
            max_iter: int
                Maximum number of iterations allowed.
            tol: float
                Convergence tolerance: threshold to consider the system at steady state.
            """
            self._one_hot_encode(labels)
    
            self.predictions = self.one_hot_labels.clone()
            prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
    
            for i in range(max_iter):
                # Stop iterations if the system is considered at a steady state
                variation = torch.abs(self.predictions - prev_predictions).sum().item()
                
                if variation < tol:
                    print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                    break
    
                prev_predictions = self.predictions
                self._propagate()
    
        def predict(self):
            return self.predictions
    
        def predict_classes(self):
            return self.predictions.max(dim=1).indices
    
    
    class LabelPropagation(BaseLabelPropagation):
        def __init__(self, adj_matrix):
            super().__init__(adj_matrix)
    
        @staticmethod
        def _normalize(adj_matrix):
            """Computes D^-1 * W"""
            degs = adj_matrix.sum(dim=1)
            degs[degs == 0] = 1  # avoid division by 0 error
            return adj_matrix / degs[:, None]
    
        def _propagate(self):
            self.predictions = torch.matmul(self.norm_adj_matrix, self.predictions)
    
            # Put back already known labels
            self.predictions[self.labeled_mask] = self.one_hot_labels[self.labeled_mask]
    
        def fit(self, labels, max_iter=1000, tol=1e-3):
            super().fit(labels, max_iter, tol)