I want to create predict
and predict_proba
methods in my DecisionTreeClassifier implementation, but it gives the error
Traceback (most recent call last):
File "c:\Users\Nijat\project.py", line 136, in <module>
print(model.predict(X))
^^^^^^^^^^^^^^^^
File "c:\Users\Nijat\project.py", line 128, in predict
return [1 if p[0] > 0.5 else 0 for p in self.predict_proba(X)]
^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\Nijat\project.py", line 121, in predict_proba
class1_proba = self.bypass_tree(self.tree, sample)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\Nijat\project.py", line 105, in bypass_tree
while node['type'] == 'node':
~~~~^^^^^^^^
KeyError: 'type'
Here's my code:
import numpy as np
import pandas as pd
class MyTreeClf:
def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.max_leafs = max_leafs
self.tree = None
self.leafs_cnt = 0
def node_entropy(self, probs):
return -np.sum([p * np.log2(p) for p in probs if p > 0])
def node_ig(self, x_col, y, split_value):
left_mask = x_col <= split_value
right_mask = x_col > split_value
if len(x_col[left_mask]) == 0 or len(x_col[right_mask]) == 0:
return 0
left_counts = np.bincount(y[left_mask])
right_counts = np.bincount(y[right_mask])
left_probs = left_counts / len(y[left_mask]) if len(y[left_mask]) > 0 else np.zeros_like(left_counts)
right_probs = right_counts / len(y[right_mask]) if len(y[right_mask]) > 0 else np.zeros_like(right_counts)
entropy_after = (len(y[left_mask]) / len(y) * self.node_entropy(left_probs) +
len(y[right_mask]) / len(y) * self.node_entropy(right_probs))
entropy_before = self.node_entropy(np.bincount(y) / len(y))
return entropy_before - entropy_after
def get_best_split(self, X: pd.DataFrame, y: pd.Series):
best_col, best_split_value, best_ig = None, None, -np.inf
for col in X.columns:
sorted_unique_values = np.sort(X[col].unique())
for i in range(1, len(sorted_unique_values)):
split_value = (sorted_unique_values[i - 1] + sorted_unique_values[i]) / 2
ig = self.node_ig(X[col], y, split_value)
if ig > best_ig:
best_ig = ig
best_col = col
best_split_value = split_value
return best_col, best_split_value
def fit(self, X: pd.DataFrame, y: pd.Series, depth=1, node=None):
if self.max_leafs < 2:
self.leafs_cnt = 2
return
if node is None:
node = {}
self.tree = node
best_col, best_split_value = self.get_best_split(X, y)
node['type'] = None
node['feature'] = best_col
node['threshold'] = best_split_value
if len(y.unique()) == 1:
self.leafs_cnt += 1
node['type'] = 'leaf'
node['class_counts'] = {y.unique()[0]: len(y)}
return
if len(y) == 1:
self.leafs_cnt += 1
node['type'] = 'leaf'
node['class_counts'] = {y.values[0]: 1}
return
if depth >= self.max_depth or len(y) < self.min_samples_split or (self.leafs_cnt + 2 > self.max_leafs):
self.leafs_cnt += 1
node['type'] = 'leaf'
node['class_counts'] = y.value_counts().to_dict()
return
if best_col is None:
node['type'] = 'leaf'
node['class_counts'] = y.value_counts().to_dict()
self.leafs_cnt += 1
return
node['type'] = 'node'
node['feature'] = best_col
node['threshold'] = best_split_value
left_mask = X[best_col] <= best_split_value
right_mask = X[best_col] > best_split_value
node['left'] = {}
node['right'] = {}
self.fit(X[left_mask], y[left_mask], depth + 1, node['left'])
self.fit(X[right_mask], y[right_mask], depth + 1, node['right'])
def bypass_tree(self, node, sample):
while node['type'] == 'node':
feature_value = sample[node['feature']]
if feature_value <= node['threshold']:
node = node['left']
else:
node = node['right']
total_count = sum(node['class_counts'].values())
class_1_count = node['class_counts'].get(1, 0)
class1_proba = class_1_count / total_count if total_count > 0 else 0
return class1_proba
def predict_proba(self, X: pd.DataFrame):
proba = []
for _, sample in X.iterrows():
class1_proba = self.bypass_tree(self.tree, sample)
proba.append(class1_proba)
return np.array(proba)
def predict(self, X: pd.DataFrame):
return [1 if p[0] > 0.5 else 0 for p in self.predict_proba(X)]
df = pd.read_csv('c:\\Users\\Nijat\\Downloads\\banknote+authentication.zip', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:, :4], df['target']
model = MyTreeClf(max_depth=3, min_samples_split=2, max_leafs=1)
model.fit(X, y)
print(model.predict(X))
The predict
and predict_proba
methods take a matrix of features in the form of a pandas dataframe.
For each row from the dataframe:
predict_proba
- returns probabilities (for the first class).
predict
- translates probabilities into binary classes by threshold > 0.5Validation:
Input data: two sets of parameters for the decision tree
Output: returned predictions (their sum) of probabilities and labels
I exactly don't know about datasets that used for checking the code but I think that it's "Banknote authentication"
Sample input:
{"max_depth": 3, "min_samples_split": 2, "max_leafs": 1}
Sample output:
11.2443438914
Two points here:
model = MyTreeClf(max_depth=3, min_samples_split=2, max_leafs=1)
will trigger a return in the fit due to the max_leafs <2
condition
putting it to two will allow the tree to be build
i do not have your data set, so we'll test with a random one
df = pd.DataFrame(columns=['variance', 'skewness', 'curtosis', 'entropy', 'target'],data=np.random.random(size=(500, 5)))
df['target'] =df['target'].apply(lambda x: 0 if x<0.5 else 1)
X, y = df.iloc[:, :4], df['target']
and then, look at the tree of you model to see if it is built
model = MyTreeClf(max_depth=3, min_samples_split=2, max_leafs=2)
model.fit(X, y)
print(model.tree)
it gives us a tree and as such the node structure will work
you can now run the model.predict_proba(X)
which gives an array, hence the p is a value and not a list, you need to modify the function predict:
def predict(self, X: pd.DataFrame):
return [1 if p > 0.5 else 0 for p in self.predict_proba(X)]