python, numpy, machine-learning, pca

Generate 200 data points drawn from a multivariate normal distribution with mean μ and covariance matrix S, where

    μ = (−1, −2, −3)ᵀ,    S = [[2, 2,  0],
                               [2, 3,  1],
                               [0, 1, 19]]

    import numpy as np
    from numpy import sin, cos, pi
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(42)
    N = 200

    # leftover parameters from a 2-D version of the exercise (unused below)
    center = 10, 15
    sigmas = 10, 2
    theta = 20 / 180 * pi
    rotmat = np.array([[cos(theta), -sin(theta)], [sin(theta), cos(theta)]])
    diagmat = np.diagflat(sigmas)

    mean = np.array([-1, -2, -3])
    # covar = rotmat @ diagmat @ rotmat.T
    covar = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
    print('covariance matrix:')
    print(covar)
    eigval, eigvec = np.linalg.eigh(covar)
    print(f'eigenvalues: {eigval}\neigenvectors:\n{eigvec}')
    # (also left over from the 2-D version: for the 3x3 covar, column 1 of
    #  eigvec belongs to the middle eigenvalue, not the largest one)
    print('angle of eigvector corresponding to larger eigenvalue:',
          180 / pi * np.arctan2(eigvec[1, 1], eigvec[0, 1]))
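    # NOTE: no `data` array is ever created anywhere above, so everything
    # below fails with a NameError; a sampling step presumably belongs here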

    # PCA
    mean = data.mean(axis=0)
    print('mean:', mean)

    # S1: explicit sum
    S1 = np.zeros((2, 2), dtype=float)  # note: 2x2, although the problem is 3-D
    print(len(data))
    for i in range(len(data)):
        S1 += np.outer(data[i] - mean, data[i] - mean)
    S1 /= len(data)
    print(f'S1 (explicit sum):\n{S1}')

    # S2: the same estimate via np.cov
    S2 = np.cov(data, rowvar=False, bias=True)
    print(f'S2 (np.cov):\n{S2}')

    # PCA:
    lambdas, u = np.linalg.eigh(S2)
    print(f'\nPCA\nlambda={lambdas}\nu=\n{u}')
    u1 = u[:, 1]  # largest (eigh sorts eigenvalues in ascending order)
    print('u1=\n', u1)
    print(f'first principal component angle: {180/pi*np.arctan2(u1[1], u1[0])}')

I get an error after that. I need to perform PCA on the above data, projecting onto one principal component and onto two, and find the fractional explained variance in these two cases.


Solution

  • For generating the data, you need two tricks (a short derivation follows this list):

    • Compute a "square root" of the covariance matrix S using its eigenvalue-eigenvector factorization.
    • Use the standard formula for generating a random normal variable with a given mean and covariance. With NumPy it works on vectors (quoting from help(np.random.randn)):

    For random samples from :math:`N(\mu, \sigma^2)`, use:

    ``sigma * np.random.randn(...) + mu``
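    Why the square-root trick works, as a minimal sketch in standard linear algebra (nothing here is specific to this answer): a symmetric positive semi-definite S factors as

        S = V \Lambda V^{\mathsf{T}},  \qquad  \sigma := V \Lambda^{1/2}  \;\Rightarrow\;  \sigma \sigma^{\mathsf{T}} = V \Lambda V^{\mathsf{T}} = S,

    so for z ~ N(0, I) the vector x = σz + μ has E[x] = μ and Cov(x) = σIσᵀ = S, i.e. exactly the requested distribution.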
    

    Example:

    import numpy as np
    import matplotlib.pyplot as plt
    
    # Part 1: generating random normal data with the given mean and covariance
    N = 200
    
    # covariance matrix
    S = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
    
    # mean
    mu = np.array([[-1, -2, -3]]).T
    
    # get "square root" of covariance matrix via eigenfactorization
    w, v = np.linalg.eig(S)
    sigma = np.sqrt(w) * v
    
    # ready, set, go!
    A = sigma @ np.random.randn(3, N) + mu
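    # (why this works: for z ~ N(0, I), sigma @ z + mu is normal with mean mu
    #  and covariance sigma @ sigma.T, which equals S by construction)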
    
    print(f'sample covariance:\n{np.cov(A)}')
    
    # sample covariance:
    # [[ 1.70899164  1.74288639  0.21190326]
    #  [ 1.74288639  2.59595547  1.2822817 ]
    #  [ 0.21190326  1.2822817  22.04077608]]
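    # (np.cov treats rows as variables by default, matching A's 3-by-N layout;
    #  with only N = 200 samples, deviations from S of order 1/sqrt(N) are expected)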
    
    print(f'sample mean:\n{A.mean(axis=1)}')
    
    # sample mean:
    # [-1.02385787 -1.87783415 -2.96077204]
    
    # --------------------------------------------
    # Part 2: principal component analysis on random data A
    
    # estimate the sample covariance
    R = np.cov(A)
    # do the PCA
    lam, u = np.linalg.eig(R)
    
    # fractional explained variance is the relative magnitude of
    # the accumulated eigenvalues
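    # (with eigenvalues sorted lam_1 >= ... >= lam_d, the fraction explained by
    #  the first k components is (lam_1 + ... + lam_k) / (lam_1 + ... + lam_d))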
    
    # reorder the eigenvalues & vectors with hottest eigenvalues first
    col_order = np.argsort(lam)[::-1]
    lam = lam[col_order]
    u = u[:, col_order]
    
    print(f'eigenvalues: {lam}')
    # eigenvalues: [22.13020272  3.87946467  0.3360558 ]
    
    var_explained = lam.cumsum() / lam.sum()
    print(f'fractional explained variance: {var_explained}')
    # fractional explained variance: [0.83999223 0.98724439 1.        ]
    #                                  ^^ 84% in first dimension alone,
    #                                     99% in first two dimensions
    
    # do the projection
    B = u.T @ A
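    # (note: A is not mean-centered before this rotation, so B carries the
    #  rotated mean along; np.cov below centers internally, so the printed
    #  covariance is unaffected, but the reconstruction error at the end is not)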
    
    # now the variance in B is concentrated in the first two dimensions
    print(f'covariance after PCA projection:\n{np.cov(B)}')

    # covariance after PCA projection:
    # [[ 2.21302027e+01 -2.68545720e-15 -1.60675493e-15]
    #  [-2.68545720e-15  3.87946467e+00 -1.19613978e-15]
    #  [-1.60675493e-15 -1.19613978e-15  3.36055802e-01]]
    
    # scatter plot
    plt.plot(B[0], B[1], '.')
    plt.axis('equal')
    plt.grid(True)
    plt.xlabel('principal axis 0')
    plt.ylabel('principal axis 1')
    plt.title('Random data projected onto two principal axes')
    plt.show()
    
    
    # project back using ONLY a two dimensional subspace of B
    #  i.e. drop the last eigenvector
    A_approx = u[:,:2] @ B[:2,:]
    
    # error analysis
    err3 = A - A_approx
    mse = (err3**2).sum(axis=0).mean()
    
    print(f'predicted error variance: {lam[-1]}')
    print(f'measured error variance: {mse}')
    # predicted error variance: 0.3360558019705344
    # measured error variance: 0.41137559916273914
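    # the gap between predicted and measured is not a numerical accident:
    # because B was formed without centering A, the residual keeps the mean
    # component along the dropped eigenvector; one can check that exactly
    #   mse = (N-1)/N * lam[-1] + (u[:,-1] @ A.mean(axis=1))**2
    # (lam[-1] is a centered, ddof=1 variance estimate from np.cov)

    For comparison only (not part of the answer above): NumPy's Generator.multivariate_normal draws such a sample directly, and, assuming scikit-learn is available, sklearn.decomposition.PCA reports the explained-variance ratios without the manual eigenvalue bookkeeping. A minimal sketch:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)  # arbitrary seed, only for reproducibility
    S = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
    mu = np.array([-1, -2, -3])

    # direct sampler: no explicit square root of S needed
    X = rng.multivariate_normal(mu, S, size=200)   # shape (200, 3), samples in rows

    pca = PCA(n_components=3).fit(X)               # PCA centers X internally
    print(pca.explained_variance_ratio_)           # per-component fractions
    print(pca.explained_variance_ratio_.cumsum())  # cumulative: 1, 2, then all 3 components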