Search code examples
pythonpython-3.xmathstatisticsnormal-distribution

A Normal Distribution Calculator


so im trying to make a program to solve various normal distribution questions with pure python (no modules other than math) to 4 decimal places only for A Levels, and there is this problem that occurs in the function get_z_less_than_a_equal(0.75):. Apparently, without the assert statement in the except clause, the variables all get messed up, and change. The error, i'm catching is the recursion error. Anyways, if there is an easier and more efficient way to do things, it'd be appreciated.

import math

mean = 0
standard_dev = 1
percentage_points = {0.5000: 0.0000, 0.4000: 0.2533, 0.3000: 0.5244, 0.2000: 0.8416, 0.1000: 1.2816, 0.0500: 1.6440, 0.0250: 1.9600, 0.0100: 2.3263, 0.0050: 2.5758, 0.0010: 3.0902, 0.0005: 3.2905}

def get_z_less_than(x):
    """
    P(Z < x)
    """
    return round(0.5 * (1 + math.erf((x - mean)/math.sqrt(2 * standard_dev**2))), 4)

def get_z_greater_than(x):
    """
    P(Z > x)
    """
    return round(1 - get_z_less_than(x), 4)

def get_z_in_range(lower_bound, upper_bound):
    """
    P(lower_bound < Z < upper_bound)
    """
    return round(get_z_less_than(upper_bound) - get_z_less_than(lower_bound), 4)

def get_z_less_than_a_equal(x):
    """
    P(Z < a) = x
    acquires a, given x


    """
    # first trial: brute forcing
    for i in range(401):
        a = i/100
        p = get_z_less_than(a)
        if x == p:
            return a
        elif p > x:
            break
    # second trial: using symmetry
    try: 
        res = -get_z_less_than_a_equal(1 - x)
    except:
    # third trial: using estimation
        assert a, "error"
        prev = get_z_less_than(a-0.01)
        p = get_z_less_than(a)
        if abs(x - prev) > abs(x - p):
            res = a
        else:
            res = a - 0.01
    return res

def get_z_greater_than_a_equal(x):
    """
    P(Z > a) = x
    """
    if x in percentage_points:
        return percentage_points[x]
    else:
        return get_z_less_than_a_equal(1-x)

    
print(get_z_in_range(-1.20, 1.40))
print(get_z_less_than_a_equal(0.7517))
print(get_z_greater_than_a_equal(0.1000))
print(get_z_greater_than_a_equal(0.0322))
print(get_z_less_than_a_equal(0.1075))
print(get_z_less_than_a_equal(0.75))

Solution

  • Since python3.8, the statistics module in the standard library has a NormalDist class, so we could use that to implement our functions "with pure python" or at least for testing:

    import math
    from statistics import NormalDist
    
    normal_dist = NormalDist(mu=0, sigma=1)
    
    for i in range(-2000, 2000):
        test_val = i / 1000
        assert get_z_less_than(test_val) == round(normal_dist.cdf(test_val), 4)
    
    

    Doesn't throw an error, so that part probably works fine

    Your get_z_less_than_a_equal seems to be the equivalent of NormalDist.inv_cdf

    There are very efficient ways to compute it accurately using the inverse of the error function (see Wikipedia and Python implementation), but we don't have that in the standard library

    Since you only care about the first few digits and get_z_less_than is monotonic, we can use a simple bisection method to find our solution

    Newton's method would be much faster, and not too hard to implement since we know that the derivative of the cdf is just the pdf, but still probably more complex than what we need

    def get_z_less_than_a_equal(x):
        """
        P(Z < a) = x
        acquires a, given x
        """
        if x <= 0.0 or x >= 1.0:
            raise ValueError("x must be >0.0 and <1.0")
        min_res, max_res = -10, 10
        while max_res - min_res > 1e-7:
            mid = (max_res + min_res) / 2
            if get_z_less_than(mid) < x:
                min_res = mid
            else:
                max_res = mid
        return round((max_res + min_res) / 2, 4)
    

    Let's test this:

    for i in range(1, 2000):
        test_val = i / 2000
        left_val = get_z_less_than_a_equal(test_val)
        right_val = round(normal_dist.inv_cdf(test_val), 4)
        assert left_val == right_val, f"{left_val} != {right_val}"
     
    # AssertionError: -3.3201 != -3.2905
    

    We see that we are losing some precision, that's because the error introduced by get_z_less_than (which rounds to 4 digits) gets propagated and amplified when we use it to estimate its inverse (see Wikipedia - error propagation for details)

    So let's add a "digits" parameter to get_z_less_than and change our functions slightly:

    def get_z_less_than(x, digits=4):
        """
        P(Z < x)
        """
        res = 0.5 * (1 + math.erf((x - mean) / math.sqrt(2 * standard_dev ** 2)))
        return round(res, digits)
    
    
    def get_z_less_than_a_equal(x, digits=4):
        """
        P(Z < a) = x
        acquires a, given x
        """
        if x <= 0.0 or x >= 1.0:
            raise ValueError("x must be >0.0 and <1.0")
        min_res, max_res = -10, 10
        while max_res - min_res > 10 ** -(digits * 2):
            mid = (max_res + min_res) / 2
            if get_z_less_than(mid, digits * 2) < x:
                min_res = mid
            else:
                max_res = mid
        return round((max_res + min_res) / 2, digits)
    

    And now we can try the same test again and see it passes