I'm working through Field Cady's "The Data Science Handbook", with sample code here: https://github.com/field-cady/the_data_science_handbook/blob/master/chapter08_classifiers/example.py
I get a syntax error from line 23 of this code, namely:
File "<ipython-input-4-02028cc326e3>", line 2
X, Y = df[df.columns[:3]], (df['species']=='virginica') X_train, X_test,
Y_train, Y_test = train_test_split(X, Y, test_size=.8)
^
SyntaxError: invalid syntax
I've googled around but can't find any answer - if anyone is able to shed any light on this I'd really appreciate it.
Many thanks
Full code:
from matplotlib import pyplot as plt
import sklearn
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# Script purpose: fit several classifiers on the same split and overlay
# their ROC curves on one matplotlib figure.
#
# CLASS_MAP: display name -> (line format, classifier)
# The line format string is later handed to plt.plot; each classifier is
# constructed here and fitted inside the loop further down.
CLASS_MAP = {
'LogisticRegression':
('-', LogisticRegression()),
'Naive Bayes': ('--', GaussianNB()),
'Decision Tree':
('.-', DecisionTreeClassifier(max_depth=5)),
'Random Forest':
(':', RandomForestClassifier(
max_depth=5, n_estimators=10,
max_features=1)),
}
# Divide cols by independent/dependent, rows by test/ train
# X is the first three columns of df; Y is a boolean Series that is True
# where the species is 'virginica'.
# NOTE(review): `df` is not defined anywhere in this snippet - it has to be
# loaded before this point.
# NOTE(review): two separate assignments are run together below
# (`... =='virginica') X_train, X_test,`); this is the statement the
# reported SyntaxError points at. They need to be two statements.
X, Y = df[df.columns[:3]], (df['species']=='virginica') X_train, X_test,
Y_train, Y_test = \
train_test_split(X, Y, test_size=.8)
# For every entry: fit on the training rows, score the test rows, and plot
# the resulting ROC curve labelled with its AUC.
# NOTE(review): the loop body's indentation appears to have been lost in
# this paste; the statements up to and including plt.plot(...) presumably
# belong inside the loop.
for name, (line_fmt, model) in CLASS_MAP.items():
model.fit(X_train, Y_train)
# array w one col per label
preds = model.predict_proba(X_test)
# Column 1 is used as the score - presumably the probability of the True
# ('virginica') class; confirm against model.classes_.
# NOTE(review): `pd` is not imported in this snippet.
pred = pd.Series(preds[:,1])
fpr, tpr, thresholds = roc_curve(Y_test, pred)
auc_score = auc(fpr, tpr)
label='%s: auc=%f' % (name, auc_score)
plt.plot(fpr, tpr, line_fmt,
linewidth=5, label=label)
# Figure decoration, then display.
plt.legend(loc="lower right")
plt.title('Comparing Classifiers')
plt.plot([0, 1], [0, 1], 'k--') #x=y line. Visual aid
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
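You have to load the iris dataset first. Also note that the SyntaxError itself comes from two assignments being run together on one line: `X, Y = ...` and `X_train, X_test, Y_train, Y_test = train_test_split(...)` must be separate statements. Here is your updated code.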
from matplotlib import pyplot as plt
import sklearn
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# name -> (line format, classifier)
from sklearn.datasets import load_iris
import pandas as pd
# Load the iris measurements into a DataFrame and attach each row's species
# name as a categorical column.
data = load_iris()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
# The dataset object is bound to `data`; the earlier `iris.target` /
# `iris.target_names` raised NameError because no `iris` name exists.
df['species'] = pd.Categorical.from_codes(
    data['target'], data['target_names'])

# Display name -> (matplotlib line format, classifier to fit).
CLASS_MAP = {
    'LogisticRegression':
        ('-', LogisticRegression()),
    'Naive Bayes': ('--', GaussianNB()),
    'Decision Tree':
        ('.-', DecisionTreeClassifier(max_depth=5)),
    'Random Forest':
        (':', RandomForestClassifier(
            max_depth=5, n_estimators=10,
            max_features=1)),
}

# Divide cols by independent/dependent, rows by test/train.
# X: the first three feature columns. Y: True where the species is virginica.
X, Y = df[df.columns[:3]], (df['species'] == 'virginica')
# test_size=.8 holds out 80% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.8)

# Fit every classifier on the same split and overlay its ROC curve.
for name, (line_fmt, model) in CLASS_MAP.items():
    model.fit(X_train, Y_train)
    # predict_proba returns an array with one column per label; column 1 is
    # the score for the True (virginica) class.
    preds = model.predict_proba(X_test)
    pred = pd.Series(preds[:, 1])
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(Y_test, pred)
    auc_score = auc(fpr, tpr)
    label = '%s: auc=%f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt,
             linewidth=5, label=label)

# Figure decoration, then display.
plt.legend(loc="lower right")
plt.title('Comparing Classifiers')
plt.plot([0, 1], [0, 1], 'k--')  # x=y line. Visual aid
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()