Search code examples
python scikit-learn anaconda random-forest grid-search

Python TypeError: range() integer end argument expected, got float. with fit function


I am fairly new to this and have seen that others have had the same error, but I fail to see how I can apply their solutions. I am trying to write a Random Forest machine learning method using a randomised grid search from scikit-learn. It works fine with a standard grid search, but when I use the randomised grid search it fails with an odd error in scikit-learn's fit function. Any suggestions on how to address this would be great.

Here is an example that displays the error.

import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py

from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold

# Load the data set, fill missing values, and split the table into
# names (first column), descriptors X (middle columns) and targets y (last column).
data = pd.read_csv("data.csv", sep=",")
# NOTE(review): SubFeAll is not defined anywhere in this snippet; presumably this
# line was meant to operate on `data` (left over from a rename) -- confirm.
data = SubFeAll.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor
header = data.columns.values # Use the column headers as the descriptor labels
# The return value is discarded here, so this only shows a preview in an interactive session.
data.head()

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)  

# Random Forest results initialised (empty lists; not filled in the code shown here)
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

# Used below as the fold number written to the training file (metcount + 1)
metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
# Descriptor labels only: drops the first (name) and last (target) column headers
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
# datax = number of rows, datay = number of columns
datax, datay =  npArray.shape

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
# Standardise the descriptor matrix with sklearn's preprocessing.scale (default settings)
X = preprocessing.scale(X)
print X.shape

# Create the two output files and write their header lines.
# Mode 'w' truncates any existing file of the same name.
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

with open(train_name,'w') as ftrain:
        ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
        ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
        ftrain.write("Fold %d ,\n" %(metcount+1))
# Redundant: the with-statement above has already closed the file.
ftrain.close()

with open(fi_name,'w') as ffeatimp:
        ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
# Redundant: the with-statement above has already closed the file.
ffeatimp.close()

# Begin the K-fold cross validation over ten folds
# (datax is the number of rows; this is the old sklearn.cross_validation.KFold signature)
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
    # train/test are index arrays used to select the rows of this fold
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
    ytestdim = yTest.shape[0]
    i = 0
    # Append the true test-set values of this fold, rounded to 2 decimals, one per line
    with open (train_name, 'a') as ftrain:
        while i< ytestdim :
                 ftrain.write(str(round(yTest[i],2))+',\n')
                 i += 1
    # Redundant: the with-statement above has already closed the file.
    ftrain.close()

    print "\n"
    # random forest grid search parameters
    print "------------------- Begining Random Forest Grid Search -------------------"
    # NOTE: scipy.stats.expon is a continuous distribution, so the values sampled for
    # "n_estimators" (and "max_depth") are floats. RandomForestRegressor.fit then calls
    # range() with a float, which is the TypeError shown in the traceback below.
    rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
    rf = RandomForestRegressor(random_state=0,n_jobs=2)
    # n_iter=20: twenty parameter settings are sampled from rfparamgrid
    RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20)
    start = time()
    RfGridSearch.fit(XTrain,yTrain)

    # Get best random forest parameters
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
    # Because of the comma, RFtime is a tuple: (elapsed seconds, number of candidates)
    RFtime = time() - start,len(RfGridSearch.grid_scores_)
    # NOTE(review): report is not defined in this snippet; presumably a helper defined elsewhere -- confirm.
    report(RfGridSearch.grid_scores_)
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
    ne = RfGridSearch.best_params_['n_estimators']
    print("max_features = %s " % RfGridSearch.best_params_['max_features'])
    mf = RfGridSearch.best_params_['max_features']
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
    md = RfGridSearch.best_params_['max_depth']
    # Append the search time and the best parameters of this fold to the training file
    with open (train_name, 'a') as ftrain:
           # No trailing newline here, so the next write continues on the same line
           ftrain.write("Random Forest")
           ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
           ftrain.write("Number of Trees, %s ,\n" % str(ne))
           ftrain.write("Number of feature at split, %s ,\n" % str(mf))
           ftrain.write("Max depth of tree, %s ,\n" % str(md))
     # NOTE(review): this line is indented one column deeper than the loop body, so as
     # pasted it does not match any outer indentation level; the close() is also
     # redundant after the with-statement.
     ftrain.close()

The error that is given is below

Traceback (most recent call last):
  File "rgscv.py", line 81, in <module>
    RfGridSearch.fit(XTrain,yTrain)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit
    return self._fit(X, y, sampled_params)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
    for parameters in parameter_iterable
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
    while self.dispatch_one_batch(iterator):
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
    job = ImmediateComputeBatch(batch)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
    self.results = batch()
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit
    for i in range(n_more_estimators):
TypeError: range() integer end argument expected, got float.

At first I thought I had just missed a parameter, but this exact method with a straightforward grid search seems to work without any problem. The code for this is below. Can anyone suggest what is causing this error?

import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py

from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold

# Load the data set, fill missing values, and split the table into
# names (first column), descriptors X (middle columns) and targets y (last column).
data = pd.read_csv("data.csv", sep=",")
# NOTE(review): SubFeAll is not defined anywhere in this snippet; presumably
# `data.mean()` was intended (left over from a rename) -- confirm.
data = data.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor
header = data.columns.values # Use the column headers as the descriptor labels
# The return value is discarded here, so this only shows a preview in an interactive session.
data.head()

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)  

# Random Forest results initialised (empty lists; not filled in the code shown here)
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

# Used below as the fold number written to the training file (metcount + 1)
metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
# Descriptor labels only: drops the first (name) and last (target) column headers
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
# datax = number of rows, datay = number of columns
datax, datay =  npArray.shape

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
# Standardise the descriptor matrix with sklearn's preprocessing.scale (default settings)
X = preprocessing.scale(X)
print X.shape

# Create the two output files and write their header lines.
# Mode 'w' truncates any existing file of the same name.
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

with open(train_name,'w') as ftrain:
        ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
        ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
        ftrain.write("Fold %d ,\n" %(metcount+1))
# Redundant: the with-statement above has already closed the file.
ftrain.close()

with open(fi_name,'w') as ffeatimp:
        ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
# Redundant: the with-statement above has already closed the file.
ffeatimp.close()

# Begin the K-fold cross validation over ten folds
# (datax is the number of rows; this is the old sklearn.cross_validation.KFold signature)
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
    # train/test are index arrays used to select the rows of this fold
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
    ytestdim = yTest.shape[0]
    i = 0
    # Append the true test-set values of this fold, rounded to 2 decimals, one per line
    with open (train_name, 'a') as ftrain:
        while i< ytestdim :
              ftrain.write(str(round(yTest[i],2))+',\n')
              i += 1
    # Redundant: the with-statement above has already closed the file.
    ftrain.close()

    print "\n"
    # random forest grid search parameters
    print "------------------- Begining Random Forest Grid Search -------------------"
    # The continuous-distribution version (kept for comparison) is disabled here:
    #rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
    # Explicit integer candidates for n_estimators and max_depth; GridSearchCV tries every combination
    rfparamgrid = {"n_estimators": [10, 20, 25, 50, 100, 1000], "max_features": ["auto", "sqrt", "log2"], "max_depth": [1,2,3,5,7,10]}
    rf = RandomForestRegressor(random_state=0,n_jobs=2)
    RfGridSearch = GridSearchCV(rf,param_grid=rfparamgrid,scoring='mean_squared_error')
    start = time()
    RfGridSearch.fit(XTrain,yTrain)

    # Get best random forest parameters
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
     # NOTE(review): from here on the lines are indented one column deeper than the
     # loop body above, which is an unexpected indent as pasted.
     # Because of the comma, RFtime is a tuple: (elapsed seconds, number of candidates)
     RFtime = time() - start,len(RfGridSearch.grid_scores_)
     # NOTE(review): report is not defined in this snippet; presumably a helper defined elsewhere -- confirm.
     report(RfGridSearch.grid_scores_)
     print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
     ne = RfGridSearch.best_params_['n_estimators']
     print("max_features = %s " % RfGridSearch.best_params_['max_features'])
     mf = RfGridSearch.best_params_['max_features']
     print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
     md = RfGridSearch.best_params_['max_depth']
     # Append the search time and the best parameters of this fold to the training file
     with open (train_name, 'a') as ftrain:
                # No trailing newline here, so the next write continues on the same line
                ftrain.write("Random Forest")
                ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
                ftrain.write("Number of Trees, %s ,\n" % str(ne))
                ftrain.write("Number of feature at split, %s ,\n" % str(mf))
                ftrain.write("Max depth of tree, %s ,\n" % str(md))
     # Redundant: the with-statement above has already closed the file.
     ftrain.close()

Solution

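  • The number of estimators has to be an integer, but your code produces floats: `scipy.stats.expon` is a continuous distribution, so every value that RandomizedSearchCV samples from it for `n_estimators` is a float. Supply integer values instead, either as a list of integers (as in your GridSearchCV version) or as a discrete distribution such as `scipy.stats.randint(10, 1000)`, and it will be just fine. `max_depth` is documented as an integer as well, so give it integer values in the same way.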