Search code examples
pythonstatisticsgaussianbootstrappingstatistics-bootstrap

Bootstrapping a gaussian and best fit data


I have the following code which provides the best fit curve for my given data. I am not sure how to bootstrap it and use that to find the 95% confidence interval.

This is my code:

import numpy as np
import matplotlib.pyplot as plt
import pylab as plb
from scipy.optimize import curve_fit

data = np.loadtxt('gaussian.dat')
x = data[:, 0]
y = data[:, 1]

n = len(x)                          
mean = sum(x*y)/n                   
sigma = sum(y*(x-mean)**2)/n        

def gauss(x,a,x0,sigma):
    return a*np.exp(-(x-x0)**2/(2*sigma**2))

popt,pcov = curve_fit(gauss,x,y,p0=[1,mean,sigma])

plt.plot(x,y,'b+:',label='data')
plt.plot(x,gauss(x,*popt),'ro:',label='fit')
plt.legend()
plt.title('Gaussian Fit vs Actual Data')
plt.xlabel('x-values')
plt.ylabel('y-values')
plt.show()

Solution

  • Let's say that x, y are generated from the gauss with an error:

    # -- generating custom random dataset 
    # skip this part if you are using data from the file
    x = np.arange(-100,100,step=0.1)
    
    a = 2.4
    x0 = 0.1
    sigma = 3
    
    def gauss(x,a,x0,sigma):
        return a*np.exp(-(x-x0)**2/(2*sigma**2))
    
    # y is a gauss process with random error introduced from the measurement
    y = gauss(x, a, x0, sigma) + 0.1 * np.random.normal(size = x.shape[0])
    
    # <--- end of generating dataset
    
    
    # fitting the curve
    n = len(x)                          
    mean = sum(x*y)/n                   
    sigma = sum(y*(x-mean)**2)/n
    
    popt,pcov = curve_fit(gauss,x,y,p0=[1,mean,sigma])
    
    

    Then to generate confidence interval in any point (for example in x=0.2) you can sample parameters, apply gauss function with this parameters and get y in this point. Then get 2.5% and 97.5% quantile to plot 95% confidence interval:

    def bootstrap_gauss(x: float, popt: np.ndarray, pcov: np.ndarray):
        arr_bootstrap_y = []
    
        # bootstrapping parameters 100 times
        for t in range(100):
            a, mean, sigma = np.random.multivariate_normal(popt, pcov)
            y = gauss(x,a,x0,sigma)
            arr_bootstrap_y.append(y)
    
        return arr_bootstrap_y
    
    
    conf_interval = bootstrap_gauss(0.2, popt, pcov)
    # 95% confidence interval
    q1, q2 = np.quantile(conf_interval, [0.025,0.975])
    print("Confidence interval:", q1, q2)
    

    The remaining question is how to vectorize this calculations to make it more efficient...

    def get_conf_gaus(x: float,
                      popt: np.ndarray, 
                      pcov: np.ndarray,
                      n_boostrap:int = 100):
        
        params = np.random.multivariate_normal(popt, pcov, size = [n_boostrap])
        a = params[:,0]
        mu = params[:,1]
        sigma = params[:,2]
        
        bootstrap_y = gauss(0.2,a,mu,sigma)
        conf = np.quantile(bootstrap_y, [0.025,0.975])
        return conf
    
    conf = get_conf_gaus(0.2, popt, pcov)
    # outputs [2.37079969, 2.41727759]