Search code examples
Tags: python, scikit-learn, sklearn-pandas

Python sklearn df issue - Field Cady sample code issue


I'm working through Field Cady's "The Data Science Handbook", with sample code here: https://github.com/field-cady/the_data_science_handbook/blob/master/chapter08_classifiers/example.py

I get a syntax error from line 23 of this code, namely:

File "<ipython-input-4-02028cc326e3>", line 2
    X, Y = df[df.columns[:3]], (df['species']=='virginica') X_train, X_test, 
Y_train, Y_test = train_test_split(X, Y, test_size=.8)
                                                                  ^
SyntaxError: invalid syntax

I've googled around but can't find an answer. If anyone is able to shed any light on this, I'd really appreciate it.

Many thanks

Full code:

from matplotlib import pyplot as plt
import sklearn
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# Classifiers to compare, keyed by display name.
# name -> (line format, classifier)
CLASS_MAP = {
    'LogisticRegression':
        ('-', LogisticRegression()),
    'Naive Bayes': ('--', GaussianNB()),
    'Decision Tree':
        ('.-', DecisionTreeClassifier(max_depth=5)),
    'Random Forest':
        (':', RandomForestClassifier(
            max_depth=5, n_estimators=10,
            max_features=1)),
}
# Divide cols by independent/dependent, rows by test/ train
# NOTE(review): the next line is reproduced exactly as posted. Two assignment
# statements are run together on it (the `X, Y = ...` assignment is followed
# directly by `X_train, X_test,`), which is where the reported SyntaxError
# points.
# NOTE(review): `df` (and `pd`, used inside the loop below) are never defined
# or imported anywhere in this listing.
X, Y = df[df.columns[:3]], (df['species']=='virginica') X_train, X_test, 
Y_train, Y_test = \
    train_test_split(X, Y, test_size=.8)
# Fit each classifier on the training rows and draw its ROC curve.
for name, (line_fmt, model) in CLASS_MAP.items():
    model.fit(X_train, Y_train)
    # array w one col per label
    preds = model.predict_proba(X_test)
    # Keep only the second column of the probability array as the score.
    pred = pd.Series(preds[:,1])
    fpr, tpr, thresholds = roc_curve(Y_test, pred)
    auc_score = auc(fpr, tpr)
    label='%s: auc=%f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt,
        linewidth=5, label=label)
plt.legend(loc="lower right")
plt.title('Comparing Classifiers')
plt.plot([0, 1], [0, 1], 'k--') #x=y line.  Visual aid
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
plt.show()

Solution

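  • The SyntaxError occurs because two assignment statements ended up on the same line; each must be on its own line. You also have to load the iris dataset into `df` first. Here is your updated code.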

        # Compare several classifiers on the iris data by plotting one ROC
        # curve per model for the binary task "is the species virginica?".
        from matplotlib import pyplot as plt
        import sklearn
        from sklearn.metrics import roc_curve, auc
        try:
            # Current home of train_test_split.
            from sklearn.model_selection import train_test_split
        except ImportError:
            # Fallback for old scikit-learn installs that only provide the
            # legacy module.
            from sklearn.cross_validation import train_test_split
        from sklearn.linear_model import LogisticRegression
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.datasets import load_iris
        import pandas as pd

        # Load iris into a DataFrame: the feature columns plus a 'species'
        # label column.
        data = load_iris()
        df = pd.DataFrame(data['data'], columns=data['feature_names'])
        # The dataset object is bound to `data`; there is no `iris` variable
        # in this script, so referring to one raises NameError.
        df['species'] = pd.Categorical.from_codes(
            data['target'], data['target_names'])

        # name -> (line format, classifier)
        CLASS_MAP = {
            'LogisticRegression':
                ('-', LogisticRegression()),
            'Naive Bayes': ('--', GaussianNB()),
            'Decision Tree':
                ('.-', DecisionTreeClassifier(max_depth=5)),
            'Random Forest':
                (':', RandomForestClassifier(
                    max_depth=5, n_estimators=10,
                    max_features=1)),
        }

        # Divide cols by independent/dependent, rows by test/train.
        # These are two separate statements and must be on separate lines.
        X, Y = df[df.columns[:3]], (df['species'] == 'virginica')
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=.8)

        for name, (line_fmt, model) in CLASS_MAP.items():
            model.fit(X_train, Y_train)
            # Array with one column per label; the labels are False/True, so
            # column 1 holds the probability of the True (virginica) class.
            preds = model.predict_proba(X_test)
            pred = pd.Series(preds[:, 1])
            fpr, tpr, thresholds = roc_curve(Y_test, pred)
            auc_score = auc(fpr, tpr)
            label = '%s: auc=%f' % (name, auc_score)
            plt.plot(fpr, tpr, line_fmt,
                     linewidth=5, label=label)

        plt.legend(loc="lower right")
        plt.title('Comparing Classifiers')
        plt.plot([0, 1], [0, 1], 'k--')  # x=y line.  Visual aid
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()