Tags: python, scikit-learn, decision-tree, scipy.stats, contingency

ValueError when using scipy.stats.chi2_contingency


import numpy as np
from scipy.stats import chi2_contingency

class Node_chi2:
    def __init__(self, chi2, num_samples, num_samples_per_class, predicted_class):
        self.chi2 = chi2
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

class DecisionTree_chi2():

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y, depth=0):
        classes = list(set(y))
        num_samples_per_class = [np.sum(y == i) for i in classes]
        predicted_class = classes[np.argmax(num_samples_per_class)]
        node = Node_chi2(
              chi2=0,  # chi2 is 0 at the start
              num_samples=len(y),
              num_samples_per_class=num_samples_per_class,
              predicted_class=predicted_class,
          )

        if self.max_depth is None or depth < self.max_depth:
            idx, thr = self.best_split(X, y, classes)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self.fit(X_left, y_left, depth + 1)
                node.right = self.fit(X_right, y_right, depth + 1)

        self.node = node  # the outermost call finishes last, so self.node ends up holding the root
        return self.node

    def best_split(self, X, y, classes):
        m, n = X.shape
        if m <= 1:
            return None, None

        best_chi2 = 0  # Best chi2 score
        best_idx, best_thr = None, None

        for idx in range(n):
            thresholds, classes_sorted = zip(*sorted(zip(X[:, idx], y)))
            for i in range(1, m):  # iterate through each threshold
                if thresholds[i] == thresholds[i - 1]:
                    continue
                y_left = np.array(classes_sorted[:i])
                y_right = np.array(classes_sorted[i:])
                left_counts = np.bincount(y_left, minlength=len(classes)) 
                right_counts = np.bincount(y_right, minlength=len(classes))
                chi2, p, _, _ = chi2_contingency(np.array([left_counts, right_counts]))

                if chi2 > best_chi2:
                    best_chi2 = chi2
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2

        return best_idx, best_thr

    def predict(self, X):
        yhat = []
        for sample in X: 
            node = self.node
            while node.left:
                if sample[node.feature_index] < node.threshold:
                    node = node.left
                else:
                    node = node.right
            yhat.append(node.predicted_class)
        return np.array(yhat)
    



import matplotlib.pyplot as plt  # needed by plot_decision_boundaries below
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=100, noise=0.1, random_state=42)

def plot_decision_boundaries(clf, X, y, label=''):
    # Generate a grid of points to make predictions:
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.1), np.arange(x2_min, x2_max, 0.1))

    # Use the classifier to make predictions on the grid:
    Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    # Here is where the magic happens! Above we have created the (x1, x2) grid,
    # and now we calculate the class for each and every point.

    Z = Z.reshape(xx1.shape)

    colors = {0: 'C0', 1: 'C1', 2: 'C2'}
    # colors dictionary indexed by class label

    # Create a contour plot to display the decision boundaries:
    fig = plt.figure(figsize=(4, 4))

    from matplotlib.colors import ListedColormap
    n_classes = len(np.unique(y))
    custom_cmap = ListedColormap(list(colors.values())[:n_classes])

    plt.contourf(xx1, xx2, Z, cmap=custom_cmap, alpha=0.4)

    plt.scatter(X[y==0,0], X[y==0,1], c=colors[0], marker='o', edgecolors='k', alpha=0.5)
    plt.scatter(X[y==1,0], X[y==1,1], c=colors[1], marker='o', edgecolors='k', alpha=0.5)
    plt.scatter(X[y==2,0], X[y==2,1], c=colors[2], marker='o', edgecolors='k', alpha=0.5)

    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('Decision Boundaries - '+label+' DT')
    plt.show()


clf_chi2 = DecisionTree_chi2(max_depth=5)
clf_chi2.fit(X, y)

plot_decision_boundaries(clf_chi2, X, y, label='Chi2')

The error that I am getting is "ValueError: The internally computed table of expected frequencies has a zero element at (0, 0)". The only solution that I have found is to use:

left_counts = np.bincount(y_left, minlength=len(classes)) + 1e-10    # add small constant to avoid zero frequencies
right_counts = np.bincount(y_right, minlength=len(classes)) + 1e-10  # add small constant to avoid zero frequencies

but I am not satisfied with that workaround and I would like to fix it without having to add a small constant. It does not seem correct to me.


Solution

  • The input that is causing the error is essentially

    from scipy import stats
    stats.chi2_contingency([[0, 1], [0, 31]])
    

    I don't know what your use case is, but let's adapt the example from the documentation of chi2_contingency to your data.

    The following table summarizes the results of the experiment in which participants took aspirin or a placebo on a regular basis for several years. Cases of ischemic stroke were recorded:

                      Aspirin   Control/Placebo
    Ischemic stroke     0           1
    No stroke           0          31
    

    Is there evidence that aspirin reduces the risk of ischemic stroke?

    The test cannot be expected to return a meaningful result if none of the participants were given aspirin.

    You'll need to consider what this means in the context of your problem and reassess whether you want to provide that input. Instead of an error, the only correct result that the function could provide would have a statistic of np.nan. Technically, the p-value would also be np.nan, but practically speaking, you could consider it to be 1.0 if you prefer (regardless of your significance threshold, it would not be interpreted as evidence against the null hypothesis).
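
    If you want to keep the chi-square splitting criterion, a cleaner alternative to the +1e-10 workaround is to skip degenerate tables altogether. The zero column typically arises because np.bincount indexes counts by label value, so a class that no longer occurs at the current node still gets a zero slot in both rows of the table. Here is a minimal sketch of that idea (safe_chi2 is a helper name introduced here, not part of SciPy), assuming the left_counts/right_counts arrays from your best_split:

    import numpy as np
    from scipy.stats import chi2_contingency

    def safe_chi2(left_counts, right_counts):
        """Chi-square statistic for a candidate split, or 0.0 when the
        2 x K contingency table is degenerate."""
        table = np.array([left_counts, right_counts])
        table = table[:, table.sum(axis=0) > 0]  # drop classes absent from both children
        if table.shape[1] < 2 or table.sum(axis=1).min() == 0:
            return 0.0  # pure node or empty child: the split carries no evidence
        chi2, _, _, _ = chi2_contingency(table)
        return chi2

    In best_split you would then call chi2 = safe_chi2(left_counts, right_counts) instead of chi2_contingency directly. Because best_chi2 starts at 0, such splits are simply never selected, which is consistent with treating the p-value as 1.0 for those inputs as described above.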