python, numpy, machine-learning, pca

Generate 200 data points drawn from a multivariate normal distribution with mean μ and covariance matrix S, where

    μ = (−1, −2, −3)ᵀ,    S = [[2, 2,  0],
                               [2, 3,  1],
                               [0, 1, 19]]

    import numpy as np
    from numpy import sin, cos, pi
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(42)
    N = 200

    # leftover parameters from a 2-D version of the exercise (unused below)
    center = 10, 15
    sigmas = 10, 2
    theta = 20 / 180 * pi
    rotmat = np.array([[cos(theta), -sin(theta)], [sin(theta), cos(theta)]])
    diagmat = np.diagflat(sigmas)

    mean = np.array([-1, -2, -3])
    # covar = rotmat @ diagmat @ rotmat.T
    covar = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
    print('covariance matrix:')
    print(covar)
    eigval, eigvec = np.linalg.eigh(covar)
    print(f'eigenvalues: {eigval}\neigenvectors:\n{eigvec}')
    # (also left over from the 2-D version: for the 3x3 covar, column 1 of
    #  eigvec belongs to the middle eigenvalue, not the largest one)
    print('angle of eigvector corresponding to larger eigenvalue:',
          180 / pi * np.arctan2(eigvec[1, 1], eigvec[0, 1]))
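    # NOTE: no `data` array is ever created anywhere above, so everything
    # below fails with a NameError; a sampling step presumably belongs here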

    # PCA
    mean = data.mean(axis=0)
    print('mean:', mean)

    # S1: explicit sum
    S1 = np.zeros((2, 2), dtype=float)  # note: 2x2, although the problem is 3-D
    print(len(data))
    for i in range(len(data)):
        S1 += np.outer(data[i] - mean, data[i] - mean)
    S1 /= len(data)
    print(f'S1 (explicit sum):\n{S1}')

    # S2: the same estimate via np.cov
    S2 = np.cov(data, rowvar=False, bias=True)
    print(f'S2 (np.cov):\n{S2}')

    # PCA:
    lambdas, u = np.linalg.eigh(S2)
    print(f'\nPCA\nlambda={lambdas}\nu=\n{u}')
    u1 = u[:, 1]  # largest (eigh sorts eigenvalues in ascending order)
    print('u1=\n', u1)
    print(f'first principal component angle: {180/pi*np.arctan2(u1[1], u1[0])}')

I get an error after that. I need to perform PCA on the above data, projecting onto one principal component and onto two, and find the fractional explained variance in these two cases.


Solution

  • For generating the data, you need two tricks (a short derivation follows this list):

    • Compute a "square root" of the covariance matrix S using its eigenvalue-eigenvector factorization.
    • Use the standard formula for generating a random normal variable with a given mean and covariance. With NumPy it works on vectors (quoting from help(np.random.randn)):

    For random samples from :math:`N(\mu, \sigma^2)`, use:

    ``sigma * np.random.randn(...) + mu``
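    Why the square-root trick works, as a minimal sketch in standard linear algebra (nothing here is specific to this answer): a symmetric positive semi-definite S factors as

        S = V \Lambda V^{\mathsf{T}},  \qquad  \sigma := V \Lambda^{1/2}  \;\Rightarrow\;  \sigma \sigma^{\mathsf{T}} = V \Lambda V^{\mathsf{T}} = S,

    so for z ~ N(0, I) the vector x = σz + μ has E[x] = μ and Cov(x) = σIσᵀ = S, i.e. exactly the requested distribution.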
    

    Example:

    import numpy as np
    import matplotlib.pyplot as plt
    
    # Part 1: generating random normal data with the given mean and covariance
    N = 200
    
    # covariance matrix
    S = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
    
    # mean
    mu = np.array([[-1, -2, -3]]).T
    
    # get "square root" of covariance matrix via eigenfactorization
    w, v = np.linalg.eig(S)
    sigma = np.sqrt(w) * v
    
    # ready, set, go!
    A = sigma @ np.random.randn(3, N) + mu
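    # (why this works: for z ~ N(0, I), sigma @ z + mu is normal with mean mu
    #  and covariance sigma @ sigma.T, which equals S by construction)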
    
    print(f'sample covariance:\n{np.cov(A)}')
    
    # sample covariance:
    # [[ 1.70899164  1.74288639  0.21190326]
    #  [ 1.74288639  2.59595547  1.2822817 ]
    #  [ 0.21190326  1.2822817  22.04077608]]
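    # (np.cov treats rows as variables by default, matching A's 3-by-N layout;
    #  with only N = 200 samples, deviations from S of order 1/sqrt(N) are expected)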
    
    print(f'sample mean:\n{A.mean(axis=1)}')
    
    # sample mean:
    # [-1.02385787 -1.87783415 -2.96077204]
    
    # --------------------------------------------
    # Part 2: principal component analysis on random data A
    
    # estimate the sample covariance
    R = np.cov(A)
    # do the PCA
    lam, u = np.linalg.eig(R)
    
    # fractional explained variance is the relative magnitude of
    # the accumulated eigenvalues
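    # (with eigenvalues sorted lam_1 >= ... >= lam_d, the fraction explained by
    #  the first k components is (lam_1 + ... + lam_k) / (lam_1 + ... + lam_d))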
    
    # reorder the eigenvalues & vectors with hottest eigenvalues first
    col_order = np.argsort(lam)[::-1]
    lam = lam[col_order]
    u = u[:, col_order]
    
    print(f'eigenvalues: {lam}')
    # eigenvalues: [22.13020272  3.87946467  0.3360558 ]
    
    var_explained = lam.cumsum() / lam.sum()
    print(f'fractional explained variance: {var_explained}')
    # fractional explained variance: [0.83999223 0.98724439 1.        ]
    #                                  ^^ 84% in first dimension alone,
    #                                     99% in first two dimensions
    
    # do the projection
    B = u.T @ A
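    # (note: A is not mean-centered before this rotation, so B carries the
    #  rotated mean along; np.cov below centers internally, so the printed
    #  covariance is unaffected, but the reconstruction error at the end is not)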
    
    # now the variance in B is concentrated in the first two dimensions
    print(f'covariance after PCA projection:\n{np.cov(B)}')

    # covariance after PCA projection:
    # [[ 2.21302027e+01 -2.68545720e-15 -1.60675493e-15]
    #  [-2.68545720e-15  3.87946467e+00 -1.19613978e-15]
    #  [-1.60675493e-15 -1.19613978e-15  3.36055802e-01]]
    
    # scatter plot
    plt.plot(B[0], B[1], '.')
    plt.axis('equal')
    plt.grid(True)
    plt.xlabel('principal axis 0')
    plt.ylabel('principal axis 1')
    plt.title('Random data projected onto two principal axes')
    plt.show()
    
    
    # project back using ONLY a two dimensional subspace of B
    #  i.e. drop the last eigenvector
    A_approx = u[:,:2] @ B[:2,:]
    
    # error analysis
    err3 = A - A_approx
    mse = (err3**2).sum(axis=0).mean()
    
    print(f'predicted error variance: {lam[-1]}')
    print(f'measured error variance: {mse}')
    # predicted error variance: 0.3360558019705344
    # measured error variance: 0.41137559916273914
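    # the gap between predicted and measured is not a numerical accident:
    # because B was formed without centering A, the residual keeps the mean
    # component along the dropped eigenvector; one can check that exactly
    #   mse = (N-1)/N * lam[-1] + (u[:,-1] @ A.mean(axis=1))**2
    # (lam[-1] is a centered, ddof=1 variance estimate from np.cov)

    For comparison only (not part of the answer above): NumPy's Generator.multivariate_normal draws such a sample directly, and, assuming scikit-learn is available, sklearn.decomposition.PCA reports the explained-variance ratios without the manual eigenvalue bookkeeping. A minimal sketch:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)  # arbitrary seed, only for reproducibility
    S = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
    mu = np.array([-1, -2, -3])

    # direct sampler: no explicit square root of S needed
    X = rng.multivariate_normal(mu, S, size=200)   # shape (200, 3), samples in rows

    pca = PCA(n_components=3).fit(X)               # PCA centers X internally
    print(pca.explained_variance_ratio_)           # per-component fractions
    print(pca.explained_variance_ratio_.cumsum())  # cumulative: 1, 2, then all 3 components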