Python sklearn df issue - Field Cady sample code issue

I'm working through Field Cady's "The Data Science Handbook", with sample code here: https://github.com/field-cady/the_data_science_handbook/blob/master/chapter08_classifiers/example.py

I get a syntax error from line 23 of this code, namely:

File "<ipython-input-4-02028cc326e3>", line 2
    X, Y = df[df.columns[:3]], (df['species']=='virginica') X_train, X_test, 
Y_train, Y_test = train_test_split(X, Y, test_size=.8)
                                                                  ^
SyntaxError: invalid syntax

I've googled around but can't find any answer - if anyone is able to shed any light I'd really appreciate it.

Many thanks

Full code:

# Script as posted in the question: fits several classifiers and plots one
# ROC curve per classifier. It is reproduced verbatim, including the line
# that triggers the SyntaxError.
from matplotlib import pyplot as plt
import sklearn
from sklearn.metrics import roc_curve, auc
# NOTE(review): newer scikit-learn releases expose train_test_split from
# sklearn.model_selection instead - confirm against the installed version.
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# name -> (line format, classifier)
CLASS_MAP = {
    'LogisticRegression':
        ('-', LogisticRegression()),
    'Naive Bayes': ('--', GaussianNB()),
    'Decision Tree':
        ('.-', DecisionTreeClassifier(max_depth=5)),
    'Random Forest':
        (':', RandomForestClassifier(
            max_depth=5, n_estimators=10,
            max_features=1)),
}
# Divide cols by independent/dependent, rows by test/ train
# NOTE(review): two statements are fused on the next line. The assignment to
# X, Y ends at the closing parenthesis; "X_train, X_test, ..." should begin a
# new statement on its own line. This is the position the SyntaxError marks.
# NOTE(review): df is never assigned anywhere in this snippet; it has to be
# created before this point.
X, Y = df[df.columns[:3]], (df['species']=='virginica') X_train, X_test, 
Y_train, Y_test = \
    train_test_split(X, Y, test_size=.8)
for name, (line_fmt, model) in CLASS_MAP.items():
    model.fit(X_train, Y_train)
    # array w one col per label
    preds = model.predict_proba(X_test)
    # Keep only the second column of the probability array.
    # NOTE(review): pd is used here but pandas is not imported in this snippet.
    pred = pd.Series(preds[:,1])
    fpr, tpr, thresholds = roc_curve(Y_test, pred)
    auc_score = auc(fpr, tpr)
    label='%s: auc=%f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt,
        linewidth=5, label=label)
plt.legend(loc="lower right")
plt.title('Comparing Classifiers')
plt.plot([0, 1], [0, 1], 'k--') #x=y line.  Visual aid
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
plt.show()

Solution

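The `SyntaxError` occurs because two statements ended up on one line: the assignment `X, Y = ...` must end after `(df['species']=='virginica')`, and `X_train, X_test, Y_train, Y_test = train_test_split(...)` must start on a new line. In addition, `df` is never defined in your snippet, so you have to load the iris dataset into a DataFrame first. Here is your updated code.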

    # Compare four classifiers on the iris data by plotting one ROC curve per
    # classifier for the binary task "is this flower virginica?".
    from matplotlib import pyplot as plt
    import sklearn
    from sklearn.metrics import roc_curve, auc
    try:
        # train_test_split lives in model_selection in current scikit-learn.
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Fallback for old installations that only ship cross_validation.
        from sklearn.cross_validation import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.datasets import load_iris
    import pandas as pd

    # Build the DataFrame the rest of the script expects: one column per
    # feature plus a categorical 'species' column holding the class names.
    data = load_iris()
    df = pd.DataFrame(data['data'], columns=data['feature_names'])
    # The dataset is bound to `data`; use it (not an undefined `iris`) here.
    df['species'] = pd.Categorical.from_codes(
        data['target'], data['target_names'])

    # name -> (line format, classifier)
    CLASS_MAP = {
        'LogisticRegression':
            ('-', LogisticRegression()),
        'Naive Bayes': ('--', GaussianNB()),
        'Decision Tree':
            ('.-', DecisionTreeClassifier(max_depth=5)),
        'Random Forest':
            (':', RandomForestClassifier(
                max_depth=5, n_estimators=10,
                max_features=1)),
    }
    # Divide cols by independent/dependent, rows by test/ train.
    # These must be two separate statements, each on its own line.
    X, Y = df[df.columns[:3]], (df['species'] == 'virginica')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.8)
    for name, (line_fmt, model) in CLASS_MAP.items():
        model.fit(X_train, Y_train)
        # array w one col per label
        preds = model.predict_proba(X_test)
        # Column 1 is the probability of the positive (True) class.
        pred = pd.Series(preds[:, 1])
        fpr, tpr, thresholds = roc_curve(Y_test, pred)
        auc_score = auc(fpr, tpr)
        label = '%s: auc=%f' % (name, auc_score)
        plt.plot(fpr, tpr, line_fmt,
                 linewidth=5, label=label)
    plt.legend(loc="lower right")
    plt.title('Comparing Classifiers')
    plt.plot([0, 1], [0, 1], 'k--')  # x=y line.  Visual aid
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()