Search code examples
pythonpython-3.xmachine-learningregressiondata-science

Getting ValueError: All arrays must be of the same length


I have been trying to convert a dictionary into a dataframe but everytime i keep getting ValueError: All arrays must be of the same length. i Have checkde the length of each array and confirmed them to be the same but i am still getting the same error

def metrics_from_pipes(pipes_dict):
     for name, pipeline in pipes_dict.items():
        
        pipeline.fit(X_train, y_train)
        y_pred_val = pipeline.predict(X_val)
        y_pred_train = pipeline.predict(X_train)


train_metrics = {
            'model':list(pipes_dict.keys()),
            'MAE':train_mae,
            'MAPE':train_mape,
            'RMSE':train_rmse,
            'RSquared':train_rsquared
        }
        
        train_metrics_data = pd.DataFrame(train_metrics)
        val_metrics = {
            'model':list(pipes_dict.keys()),
            'MAE':val_mae,
            'MAPE':val_mape,
            'RMSE':val_rmse,
            'RSquared':val_rsquared            
        }
        
        val_metrics_data = pd.DataFrame(val_metrics,)

        #Merging metrics from train and test set
        train_val_metrics = train_metrics_data.merge(val_metrics_data,
                                               on = 'Model',
                                               how = 'left',
                                               suffixes = ('_train', '_val'))
        
        # sorting columns 
        train_val_metrics = train_val_metrics.reindex(columns = ['Model',
                                                               'MAE_train',
                                                                'MAPE_train',
                                                                'RMSE_train',
                                                                'RSquared_train',
                                                                'MAE_val',
                                                                'MAPE_val',
                                                                'RMSE_val',
                                                                'RSquared_val'])
        
    
    
    return train_val_metrics.set_index('Model').transpose()

# get the metrics table
metrics_table = metrics_from_pipes(pipelines)

running this code gives this error

ValueError                                Traceback (most recent call last)
Cell In[45], line 82
     80     return train_val_metrics.set_index('Model').transpose()
     81 # get the metrics table
---> 82 metrics_table = metrics_from_pipes(pipelines)
     83 #print('Table 1: Base Models Metrics')
     84 #metrics_table.style.background_gradient(cmap = Blues)
     85 metrics_table

Cell In[45], line 50, in metrics_from_pipes(pipes_dict)
     41 # aggregate the performance metric lists into seperate dataframes
     42 train_metrics = {
     43     'model':list(pipes_dict.keys()),
     44     'MAE':train_mae,
   (...)
     47     'RSquared':train_rsquared
     48 }
---> 50 train_metrics_data = pd.DataFrame(train_metrics)
     51 val_metrics = {
     52     'model':list(pipes_dict.keys()),
     53     'MAE':val_mae,
   (...)
     56     'RSquared':val_rsquared            
     57 }
     59 val_metrics_data = pd.DataFrame(val_metrics,)

ValueError: All arrays must be of the same length

when i checked for the result of the dictionary for both train_metrics and val metrics, i got this

({'model': ['Linear Regression',
   'Random Forest Regressor',
   'Gradient Boost Regression',
   'Extra Tree Regressor'],
  'MAE': [829.1023412412194,
   288.33455697065233,
   712.9637267872279,
   0.0010629575741748962],
  'MAPE': [1.0302372135902111,
   0.20937541440883897,
   0.538244903316323,
   6.306697580961048e-07],
  'RMSE': [1120.5542708017374,
   416.48933196590013,
   1012.399201767692,
   0.05804079289490426],
  'RSquared': [0.5598288286601083,
   0.9391916010838417,
   0.6406981997919169,
   0.9999999988190745]},
 {'model': ['Linear Regression',
   'Random Forest Regressor',
   'Gradient Boost Regression',
   'Extra Tree Regressor'],
  'MAE': [855.9254413559535,
   802.5902302175274,
   772.3140648475379,
   839.9018341377154],
  'MAPE': [1.0395487579496652,
   0.5607987708065988,
   0.5438627253681279,
   0.5852285872937784],
  'RMSE': [1148.6549900167981,
   1158.8411708570625,
   1109.6145558003204,
   1223.23337689915],
  'RSquared': [0.5876710102285392,
   0.5803255834810521,
   0.6152231339508221,
   0.5323905190373128]})

Solution

  • Print the contents of train_metrics, just before line 50;

    train_metrics_data = pd.DataFrame(train_metrics)
    

    Then you will see what the dict looked like just before it crashes. I ran a part of the faulty code, and it seems to work just fine.

    Using python 3.10.14 :

    import pandas as pd
    
    train_metrics = {'model': ['Linear Regression',
       'Random Forest Regressor',
       'Gradient Boost Regression',
       'Extra Tree Regressor'],
      'MAE': [829.1023412412194,
       288.33455697065233,
       712.9637267872279,
       0.0010629575741748962],
      'MAPE': [1.0302372135902111,
       0.20937541440883897,
       0.538244903316323,
       6.306697580961048e-07],
      'RMSE': [1120.5542708017374,
       416.48933196590013,
       1012.399201767692,
       0.05804079289490426],
      'RSquared': [0.5598288286601083,
       0.9391916010838417,
       0.6406981997919169,
       0.9999999988190745]}
    
    train_metrics_data = pd.DataFrame(train_metrics)
    
    print(train_metrics_data)
    

    prints:

                           model         MAE          MAPE         RMSE  RSquared
    0          Linear Regression  829.102341  1.030237e+00  1120.554271  0.559829
    1    Random Forest Regressor  288.334557  2.093754e-01   416.489332  0.939192
    2  Gradient Boost Regression  712.963727  5.382449e-01  1012.399202  0.640698
    3       Extra Tree Regressor    0.001063  6.306698e-07     0.058041  1.000000