Search code examples
pythonmachine-learningxgboostgridsearchcvmake-scorer

Error with precision_score of XGBoost classifier with RandomizedSearchCV


I'm trying to make a classifier with XGBoost, I fit it with RandomizedSearchCV.

Here is the code of my function:

def xgboost_classifier_rscv(x,y):
    from scipy import stats
    from xgboost import XGBClassifier
    from sklearn.metrics import fbeta_score, make_scorer, recall_score, accuracy_score, precision_score
    from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV

    #splitting the dataset into training and test parts
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    #bag of words implmentation
    cv = CountVectorizer()
    x_train = cv.fit_transform(x_train).toarray()

    #TF-IDF implementation
    vector = TfidfTransformer()
    x_train = vector.fit_transform(x_train).toarray()
    x_test = cv.transform(x_test)
    
    scorers = {
            'f1_score':make_scorer(f1_score),
            'precision_score': make_scorer(precision_score),
            'recall_score': make_scorer(recall_score),
            'accuracy_score': make_scorer(accuracy_score)
          }

    param_dist = {'n_estimators': stats.randint(150, 1000),
                  'learning_rate': stats.uniform(0.01, 0.59),
                  'subsample': stats.uniform(0.3, 0.6),
                  'max_depth': [3, 4, 5, 6, 7, 8, 9],
                  'colsample_bytree': stats.uniform(0.5, 0.4),
                  'min_child_weight': [1, 2, 3, 4]
                 }
 n_folds = numFolds)
    skf = StratifiedKFold(n_splits=3, shuffle = True)
    gridCV = RandomizedSearchCV(xgb_model, 
                             param_distributions = param_dist,
                             cv = skf,  
                             n_iter = 5,  
                             scoring = scorers, 
                             verbose = 3, 
                             n_jobs = -1,
                             return_train_score=True,
                             refit = precision_score)

    gridCV.fit(x_train,y_train)
    best_pars = gridCV.best_params_
    print("best params : ", best_pars)
    xgb_predict = gridCV.predict(x_test)
    xgb_pred_prob = gridCV.predict_proba(x_test)
    print('best scores : ', gridCV.grid_scores_)
    scores = [x[1] for x in gridCV.grid_scores_]
    print("best scores : ", scores)

    return y_test, xgb_predict, xgb_pred_prob

When I run the code, I get an error, reported below:

TypeError                                 Traceback (most recent call last)
<ipython-input-30-9adf84d48e5c> in <module>
      1 print("********** Xgboost classifier *************")
      2 start_time = time.monotonic()
----> 3 y_test, xgb_predict, xgb_pred_prob = xgboost_classifier_rscv(x,y)
      4 end_time = time.monotonic()
      5 print("the time consumed is : ", timedelta(seconds=end_time - start_time))

<ipython-input-29-e0c6ae026076> in xgboost_classifier_rscv(x, y)
     70 #                                 verbose=3, random_state=1001, refit='precision_score' )
     71 
---> 72     gridCV.fit(x_train,y_train)
     73     best_pars = gridCV.best_params_
     74     print("best params : ", best_pars)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    858             # parameter set.
    859             if callable(self.refit):
--> 860                 self.best_index_ = self.refit(results)
    861                 if not isinstance(self.best_index_, numbers.Integral):
    862                     raise TypeError('best_index_ returned is not an integer')

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

TypeError: precision_score() missing 1 required positional argument: 'y_pred'

When I do the same thing but with GridSearchCV instead of RandomizedSearchCV, the code runs without any problems!


Solution

  • It's not precision_score it's 'precision_score' (with ' '), like this-

    gridCV = RandomizedSearchCV(xgb_model, 
                             param_distributions = param_dist,
                             cv = skf,  
                             n_iter = 5,  
                             scoring = scorers, 
                             verbose = 3, 
                             n_jobs = -1,
                             return_train_score=True,
                             refit = 'precision_score')
    

    Another error:

    grid_scores_ has been removed, so changed it to cv_results_ (in the last 3rd and 4th line)

    print('best scores : ', gridCV.cv_results_)
    scores = [x[1] for x in gridCV.cv_results_]
    

    One more error:

    You have not defined that xgb_model, so add that.

    xgb_model = XGBClassifier(n_jobs = -1, random_state = 42)