NN training error(the index of the batch data out of range)

I want to train a model but facing a problem: in every batch I will select some node for training, and the id of nodes range from 0 to 999. While training I find the code use the id of the node as the index and try to get the other data of the node so the index is out of range. my code is here

import torch
import torch.nn as nn
import torch.nn.functional as F
from import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from import DataLoader as loader
import pandas as pd
import torch.optim as optim
from import Dataset
from torch.nn.utils.rnn import pad_sequence

# feature
class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5,
        x = self.conv2(x, edge_index)
        return x

class MyDataset(Dataset):
    def __init__(self, nodes_file, edges_file, communities_file, features_file):
        self.nodes_data = pd.read_csv(nodes_file)
        self.edges_data = pd.read_csv(edges_file)
        self.communities_data = pd.read_csv(communities_file)
        self.features_data = pd.read_csv(features_file)

    def __len__(self):
        return len(self.nodes_data)

    def __getitem__(self, idx):
        node_id = self.nodes_data.iloc[idx]['ID']
        node_category = self.nodes_data.iloc[idx]['Category']
        node_community = self.communities_data[self.communities_data['ID'] == node_id]['Community'].values[0]
        edge_source = self.nodes_data.iloc[idx]['ID']
        edge_targets = eval(self.edges_data.iloc[idx]['neighbor'])

        # Get the corresponding features for the current node_id
        node_features = self.features_data[self.features_data['NodeID'] == node_id].iloc[:, 1:].values
        node_features = torch.tensor(node_features, dtype=torch.float)

        # You can process the node, edge, and community data as per your requirements
        # and return them as tensors
        return node_id, node_category, node_community, edge_source, edge_targets, node_features

def custom_collate_fn(batch):
    node_ids, node_categories, node_communities, edge_sources, edge_targets, node_features = zip(*batch)

    # Convert to PyTorch tensors
    node_ids = torch.tensor(node_ids, dtype=torch.long)
    node_categories = torch.tensor(node_categories, dtype=torch.long)
    node_communities = torch.tensor(node_communities, dtype=torch.long)
    edge_targets = [torch.tensor(targets) for targets in edge_targets]

    # Replicate edge_sources to have the same length as edge_targets
    edge_sources_replicated = [torch.tensor([source] * len(targets), dtype=torch.long) for source, targets in zip(edge_sources, edge_targets)]

    return node_ids, node_categories, node_communities, edge_sources_replicated, edge_targets, node_features

nodes_file = 'nodes.csv'
edges_file = 'RWR_features.csv'
communities_file = 'community.csv'
feature_file = 'node_features.csv'

dataset = MyDataset(nodes_file, edges_file, communities_file, feature_file)
dataloader =, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)

input_dim = 1000  # Replace with the actual input dimension
hidden_dim = 64
output_dim = 32
num_categories = 3  # Replace with the actual number of node categories
num_communities = 5  # Replace with the actual number of communities
num_epochs = 300

gcn_model = GCN(input_dim, hidden_dim, output_dim)

# classify

# train loop
for epoch in range(num_epochs):
    for batch_data in dataloader:
        node_ids, node_categories, node_communities, edge_sources_list, edge_targets_list, node_feature = batch_data
        # print('node_ids:', node_ids)
        # print('len:', len(node_ids))
        # print('node_categories:', node_categories)
        # print('len:', len(node_categories))
        # print('node_communities:', node_communities)
        # print('len:', len(node_communities))
        # print('edge_sources:', edge_sources)
        # print('len:', len(edge_sources))
        # print('edge_targets', edge_targets)
        # print('len:', len(edge_targets))
        node_feature =, dim=0)
        # print('node_feature', node_feature)
        # print('len:', len(node_feature))

        edge_sources =, dim=0)
        edge_targets =, dim=0)
        # print(edge_sources)
        # print(len(edge_sources))
        # print(edge_targets)
        # print(len(edge_targets))

        edge_index = torch.stack([edge_sources, edge_targets], dim=0)

        # print(edge_index)

        x = gcn_model(node_feature, edge_index)

How can I fix this? At first, I want to find his index in batch through the node's ID, and use this index to find the rest data I need, but I don't know how to achieve this, is this method feasible? Thanks a lot.

The error info is below:

RuntimeError                              Traceback (most recent call last)
Cell In[2], line 62
     57 edge_index = torch.stack([edge_sources, edge_targets], dim=0)
     59 # print(edge_index)
---> 62 x = gcn_model(node_feature, edge_index)
     64 # A
     65 classify_logits = classify_model(x)

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

Cell In[1], line 21, in GCN.forward(self, x, edge_index)
     20 def forward(self, x, edge_index):
---> 21     x = self.conv1(x, edge_index)
     22     x = F.relu(x)
     23     x = F.dropout(x, p=0.5,

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/nn/conv/, in GCNConv.forward(self, x, edge_index, edge_weight)
    208 cache = self._cached_edge_index
    209 if cache is None:
--> 210     edge_index, edge_weight = gcn_norm(  # yapf: disable
    211         edge_index, edge_weight, x.size(self.node_dim),
    212         self.improved, self.add_self_loops, self.flow, x.dtype)
    213     if self.cached:
    214         self._cached_edge_index = (edge_index, edge_weight)

File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/nn/conv/, in gcn_norm(edge_index, edge_weight, num_nodes, improved, add_self_loops, flow, dtype)
     98 row, col = edge_index[0], edge_index[1]
     99 idx = col if flow == 'source_to_target' else row
--> 100 deg = scatter(edge_weight, idx, dim=0, dim_size=num_nodes, reduce='sum')
    101 deg_inv_sqrt = deg.pow_(-0.5)
    102 deg_inv_sqrt.masked_fill_(deg_inv_sqrt == float('inf'), 0)

File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/utils/, in scatter(src, index, dim, dim_size, reduce)
     72 if reduce == 'sum' or reduce == 'add':
     73     index = broadcast(index, src, dim)
---> 74     return src.new_zeros(size).scatter_add_(dim, index, src)
     76 if reduce == 'mean':
     77     count = src.new_zeros(dim_size)

RuntimeError: index 332 is out of bounds for dimension 0 with size 64

the example of the data is below:

node.csv(total: 1000 node): enter image description here

RWR_features.csv(each node have 30 important neighbor, sampled from the edges file, the class is the class of the source node): enter image description here

community.csv: enter image description here

node_features.csv: enter image description here

07/30/2023 edit

Already fixed. The problem lies in the input of the GCN, I rewrote the code.

import pandas as pd
import numpy as np

def load_data():
    # Load nodes.csv
    nodes_df = pd.read_csv('nodes.csv')
    # Load community.csv
    community_df = pd.read_csv('community.csv')
    # Load node_features.csv
    node_features_df = pd.read_csv('node_features.csv')
    # Load RWR_features.csv
    rwr_features_df = pd.read_csv('RWR_features.csv', usecols=['node', 'neighbor'])
    rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(eval)
    rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(lambda x: [int(i) for i in x])
    # Create mapping from node ID to index
    node_to_index = {node_id: index for index, node_id in enumerate(nodes_df['ID'])}
    # Create adjacency matrix
    num_nodes = len(nodes_df)
    adj = np.zeros((num_nodes, num_nodes))
    for _, row in rwr_features_df.iterrows():
        node_id = row['node']
        neighbor_ids = row['neighbor']
        for neighbor_id in neighbor_ids:
            adj[node_to_index[node_id]][node_to_index[neighbor_id]] = 1
            adj[node_to_index[neighbor_id]][node_to_index[node_id]] = 1  # Assuming undirected graph
    # Create feature matrix
    feature_columns = node_features_df.columns[1:]  # Exclude the first column (ID)
    feature = node_features_df[feature_columns].values
    # Create class labels
    class_label = nodes_df['Category'].values
    # Create community labels
    com_label = community_df['Community'].values
    # Split indices for train, val, and test sets (You can modify this split ratio as needed)
    num_train = int(num_nodes * 0.6)
    num_val = int(num_nodes * 0.2)
    num_test = num_nodes - num_train - num_val
    # Generate random indices for train, val, and test sets
    indices = np.random.permutation(num_nodes)
    train_idx = indices[:num_train]
    val_idx = indices[num_train:num_train + num_val]
    test_idx = indices[num_train + num_val:]
    return adj, feature, class_label, com_label, train_idx, val_idx, test_idx

# Usage
adj, feature, class_label, com_label, train_idx, val_idx, test_idx = load_data()

class MyDataset(Dataset):
    def __init__(self, adj, feature, class_label, com_label):
        self.adj = adj
        self.feature = feature
        self.class_label = class_label
        self.com_label = com_label
    def __len__(self):
        return len(self.class_label)
    def __getitem__(self, idx):
        return idx, self.adj[idx] , self.feature[idx], self.class_label[idx], self.com_label[idx]

Then, I input the adjacent matrix and the feature matrix into the GCN instead of the edge_index.

gcn_feature = gcn_model(batch_adj, batch_feature)


