I have a decision tree classifier that predicts the value of the last column in my dataset, which is either 'made' or 'missed'. I have run the program a few times, but the accuracy is always 100%; I would expect it to be about 95-100%. Any idea why? Here is a snippet of the dataset (the original one has over 74,000 rows):
Here is the code for the classifier:
import operator
from collections import Counter
from math import log
# Read the CSV file at `filename` with pandas and return a (data, labels) pair.
# NOTE(review): this listing is incomplete as shown -- the `headers` and `labels`
# list literals are never closed, the `while` loop has no visible body, and `pd`,
# `i`, `dataset` and `new_list` are not defined anywhere in the visible code.
# Restore the missing lines from the original file before relying on this function.
def load_csv(filename):
# Column names passed to pandas as `names=`, because the file is read with header=None.
headers = ["location","w","final_margin","shot_number","period","game_clock","shot_clock","dribbles","touch_time",
# Cells containing "?" are parsed as missing values (na_values="?").
df = pd.read_csv(filename, header=None, names=headers, na_values="?")
# NOTE(review): presumably this loop converts the parsed data into `new_list`; its body is not visible -- confirm.
while i<len(dataset):
labels = ["location","w","final_margin","shot_number","period","game_clock","shot_clock", "dribbles","touch_time",
# Returns a 2-tuple: `new_list` (presumably the data rows -- confirm) and `labels`.
return new_list, labels
def calcShannonEnt(dataSet):
    """Return the Shannon entropy, in bits, of the class labels in ``dataSet``.

    Args:
        dataSet: sequence of rows; the class label of each row is its last
            element (``row[-1]``).

    Returns:
        The entropy as a float. An empty ``dataSet`` yields ``0.0``.
    """
    numEntries = len(dataSet)
    # Count how often each class label occurs.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2 -> entropy in bits
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at ``axis`` equals ``value``, minus that feature.

    Args:
        dataSet: sequence of rows (lists) of feature values followed by a label.
        axis: index of the feature column to split on.
        value: feature value a row must have at ``axis`` to be kept.

    Returns:
        A new list of new rows; each kept row has the ``axis`` column removed.
        The input rows are not modified.
    """
    return [
        # Chop out the column used for splitting and keep everything else.
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split gives the largest information gain.

    The last element of every row is treated as the class label and is not
    considered as a split candidate. For each candidate column the entropy
    figures are printed as a trace.

    Returns:
        The integer index of the best column, or -1 when no column has a
        strictly positive information gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column holds the class labels
    rowCount = float(len(dataSet))
    baseEntropy = calcShannonEnt(dataSet)
    topGain, topIndex = 0.0, -1
    for col in range(featureCount):
        distinctVals = set(row[col] for row in dataSet)
        newEntropy = 0.0
        # Weighted sum of the entropies of the partitions induced by this column.
        for val in distinctVals:
            part = splitDataSet(dataSet, col, val)
            weight = len(part) / rowCount
            newEntropy += weight * calcShannonEnt(part)
        infoGain = baseEntropy - newEntropy  # reduction in entropy
        print("feature : " + str(col))
        print("baseEntropy : "+str(baseEntropy))
        print("newEntropy : " + str(newEntropy))
        print("infoGain : " + str(infoGain))
        if infoGain > topGain:
            topGain, topIndex = infoGain, col
    return topIndex
def majorityCnt(classList):
    """Return the most frequent class label in ``classList``.

    Ties are resolved in favour of the label that appears first in
    ``classList``.

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # ``items()`` works on both Python 2 and 3; ``iteritems()`` exists only on 2.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: non-empty list of rows; each row holds feature values
            followed by the class label in the last position.
        labels: feature names, one per feature column, in column order.
            This list is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  # stop splitting when there are no more features
        return majorityCnt(classList)
    # Use information gain to pick the split column.
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature reduces entropy. Indexing with -1 would split on the
        # class column itself, so fall back to a majority-vote leaf instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build the reduced label list as a new object so the caller's list
    # (and sibling branches) are left untouched.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)
    # Build one subtree per distinct value of the chosen feature.
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    """Walk ``inputTree`` for ``testVec`` and return the predicted class label.

    Args:
        inputTree: nested dict of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        featLabels: feature names; the position of a name in this list is the
            index of that feature's value in ``testVec``.
        testVec: feature values of the sample to classify.

    Returns:
        The leaf value reached by following ``testVec`` through the tree.

    Raises:
        ValueError: if a feature name in the tree is not in ``featLabels``.
        KeyError: if ``testVec`` holds a feature value the tree has no branch for.
    """
    firstStr = next(iter(inputTree))  # the feature this node splits on
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        # Internal node: keep descending.
        return classify(valueOfFeat, featLabels, testVec)
    # Leaf: the stored value is the class label.
    return valueOfFeat
def storeTree(inputTree, filename):
    """Pickle ``inputTree`` to the file at ``filename``, overwriting it."""
    import pickle
    # pickle writes bytes, so the file must be opened in binary mode; the
    # ``with`` block guarantees the file is flushed and closed.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load and return the pickled object stored in the file at ``filename``.

    SECURITY: unpickling can execute arbitrary code. Only call this on files
    you wrote yourself (e.g. with ``storeTree``), never on untrusted input.
    """
    import pickle
    # pickle reads bytes, so the file must be opened in binary mode; the
    # ``with`` block guarantees the file is closed.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` matches ``actual``.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, indexed in step with ``actual``.

    Returns:
        A float between 0.0 and 100.0. An empty ``actual`` yields 0.0 rather
        than raising ``ZeroDivisionError``.

    Raises:
        IndexError: if ``predicted`` is shorter than ``actual``.
    """
    total = len(actual)
    if total == 0:
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0
# Names of the feature columns, in the order their values appear in each row.
feature_names = ["location", "w", "final_margin", "shot_number", "period", "game_clock",
                 "shot_clock", "dribbles", "touch_time", "shot_dist", "pts_type",
                 "close_def_dist"]

# collect data
myDat, labels = load_csv('data/basketball.train.csv')
# build a tree (pass a copy so the list returned by load_csv is left intact)
mytree = createTree(myDat, labels[:])
# run test
# NOTE: these are the very rows the tree was built from, so the accuracy
# computed below measures how well the tree memorised its training data.
predictions = []
for row in myDat:
    # The leading len(feature_names) entries of a row are its feature values;
    # the class label is the last entry.
    prediction = classify(mytree, feature_names, row[:len(feature_names)])
    predictions.append(prediction)
actual = [row[-1] for row in myDat]
accuracy = accuracy_metric(actual, predictions)
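You don't appear to be splitting your dataset into separate training and testing datasets, so the accuracy is being measured on the same rows the tree was built from. A tree that keeps splitting until its leaves are (almost) pure largely memorises those rows, which is why you see 100%. The result of this is that your classifier is probably over-fitting the dataset, and may not work well with samples from outside the dataset.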
Try randomly selecting (say) 75% of the data for training, then testing the accuracy with the remaining 25%. For example, replacing the last part of your code:
import random

# Names of the feature columns, in the order their values appear in each row.
feature_names = ["location", "w", "final_margin", "shot_number", "period", "game_clock",
                 "shot_clock", "dribbles", "touch_time", "shot_dist", "pts_type",
                 "close_def_dist"]

dataset, labels = load_csv('data/basketball.train.csv')
# Shuffle first: without this the "random" 75/25 split would simply be the
# first and last parts of the file. Assumes load_csv returns a mutable list.
random.shuffle(dataset)
split_index = int(len(dataset) * 0.75)
train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]

# Build the tree from the training rows only (pass a copy of the labels so
# the list returned by load_csv is left intact).
mytree = createTree(train_dataset, labels[:])

# Evaluate on the held-out rows.
predictions = []
for row in test_dataset:
    try:
        prediction = classify(mytree, feature_names, row[:len(feature_names)])
    except KeyError:
        # The tree has no branch for a feature value that never occurred in
        # the training rows; record "no prediction", which counts as a miss.
        prediction = None
    predictions.append(prediction)
actual = [row[-1] for row in test_dataset]
accuracy = accuracy_metric(actual, predictions)
(Note: untested)