import numpy as np
from numpy import sin, cos, pi
import matplotlib.pyplot as plt
rng = np.random.default_rng(42)
N = 200
center = 10, 15
sigmas = 10, 2
theta = 20 / 180 * pi
# covariance matrix
rotmat = np.array([[cos(theta), -sin(theta)],[sin(theta), cos(theta)]])
diagmat = np.diagflat(sigmas)
mean = np.array([-1, -2, -3])
# covar = rotmat @ diagmat @ rotmat.T
covar = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
print('covariance matrix:')
print(covar)
eigval, eigvec = np.linalg.eigh(covar)
print(f'eigenvalues: {eigval}\neigenvectors:\n{eigvec}')
print('angle (in the x-y plane) of the eigenvector with the largest eigenvalue:',
      180 / pi * np.arctan2(eigvec[1, -1], eigvec[0, -1]))
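# assumed missing step (my addition): the PCA code below uses `data`,
# which this snippet never defines; draw N samples from N(mean, covar)
data = rng.multivariate_normal(mean, covar, size=N)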
# PCA
mean = data.mean(axis=0)
print('mean:', mean)
# S1: explicit sum
S1 = np.zeros((data.shape[1], data.shape[1]))  # sized to the data dimension
print(len(data))
for i in range(len(data)):
    S1 += np.outer(data[i] - mean, data[i] - mean)
S1 /= len(data)
print(f'S1= (explicit sum)\n{S1}')
# S2:
S2 = np.cov(data, rowvar=False, bias=True)
print(f'S2= (np.cov)\n{S2}')
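# quick consistency check (my addition): the explicit sum and
# np.cov(..., bias=True) both normalize by 1/N, so they should agree
assert np.allclose(S1, S2)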
# PCA:
lambdas, u = np.linalg.eigh(S2)
print(f'\nPCA\nlambda={lambdas}\nu=\n{u}')
u1 = u[:, -1]  # np.linalg.eigh sorts eigenvalues ascending, so the last column is the largest
print('u1=\n', u1)
print(f'first principal component angle: {180/pi*np.arctan2(u1[1], u1[0])}')
After that I need to perform PCA on the above data, reducing it to one principal component and to two principal components. What is the fractional explained variance in these two cases?
For generating the data, you need two tricks: shifting/scaling standard normal samples, and taking a matrix "square root" of the covariance. For random samples from N(mu, sigma^2), use sigma * np.random.randn(...) + mu; for a full covariance matrix S, replace the scalar sigma with a matrix square root of S, obtained below by eigendecomposition.
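In the scalar case, for instance, that is the whole recipe (a minimal illustration, assuming mu = 5 and sigma = 2):

import numpy as np
samples = 2.0 * np.random.randn(1000) + 5.0  # draws from N(5, 2**2)
print(samples.mean(), samples.std())         # should be close to 5 and 2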
Example:
import numpy as np
import matplotlib.pyplot as plt
# Part 1: generating random normal data with the given mean and covariance
N = 200
# covariance matrix
S = np.array([[2, 2, 0], [2, 3, 1], [0, 1, 19]])
# mean
mu = np.array([[-1, -2, -3]]).T
# get "square root" of covariance matrix via eigenfactorization
w, v = np.linalg.eig(S)
sigma = np.sqrt(w) * v
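# sanity check (my addition): sigma = v @ diag(sqrt(w)), so
# sigma @ sigma.T reconstructs S = v @ diag(w) @ v.T
assert np.allclose(sigma @ sigma.T, S)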
# ready, set, go!
A = sigma @ np.random.randn(3, N) + mu
print(f'sample covariance:\n{np.cov(A)}')
# sample covariance:
# [[ 1.70899164 1.74288639 0.21190326]
# [ 1.74288639 2.59595547 1.2822817 ]
# [ 0.21190326 1.2822817 22.04077608]]
print(f'sample mean:\n{A.mean(axis=1)}')
# sample mean:
# [-1.02385787 -1.87783415 -2.96077204]
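# alternative (my addition, not part of this answer): NumPy can also draw
# the samples directly, without building the square root by hand
A_alt = np.random.multivariate_normal(mu.ravel(), S, size=N).T  # shape (3, N)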
# --------------------------------------------
# Part 2: principal component analysis on random data A
# estimate the sample covariance
R = np.cov(A)
# do the PCA
lam, u = np.linalg.eig(R)
# fractional explained variance is the relative magnitude of
# the accumulated eigenvalues
# reorder the eigenvalues & vectors, largest eigenvalues first
col_order = np.argsort(lam)[::-1]
lam = lam[col_order]
u = u[:, col_order]
print(f'eigenvalues: {lam}')
# eigenvalues: [22.13020272 3.87946467 0.3360558 ]
var_explained = lam.cumsum() / lam.sum()
print(f'fractional explained variance: {var_explained}')
# fractional explained variance: [0.83999223 0.98724439 1. ]
# ^^ 84% in first dimension alone,
# 99% in first two dimensions
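# worked example for the first entry:
# 22.13020272 / (22.13020272 + 3.87946467 + 0.3360558) ≈ 0.840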
# do the projection
B = u.T @ A
# now the variance in B is concentrated in the first two dimensions
print(f'covariance after PCA projection:\n{np.cov(B)}')
# covariance after PCA projection:
# [[ 2.21302027e+01 -2.68545720e-15 -1.60675493e-15]
#  [-2.68545720e-15  3.87946467e+00 -1.19613978e-15]
#  [-1.60675493e-15 -1.19613978e-15  3.36055802e-01]]
# scatter plot
plt.plot(B[0], B[1], '.')
plt.axis('equal')
plt.grid(True)
plt.xlabel('principal axis 0')
plt.ylabel('principal axis 1')
plt.title('Random data projected onto two principal axes')
plt.show()
# project back using ONLY a two dimensional subspace of B
# i.e. drop the last eigenvector
A_approx = u[:,:2] @ B[:2,:]
# error analysis
err3 = A - A_approx
mse = (err3**2).sum(axis=0).mean()
print(f'predicted error variance: {lam[-1]}')
print(f'measured error variance: {mse}')
# predicted error variance: 0.3360558019705344
# measured error variance: 0.41137559916273914
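The measured error variance lands a bit above lam[-1] largely because A was projected without first subtracting its sample mean (np.cov centers the data internally, the projection above does not), plus ordinary sampling noise. A minimal sketch of the centered version (my adjustment, not part of the original answer); it should match lam[-1] up to the (N-1)/N factor of the biased variance:

A0 = A - A.mean(axis=1, keepdims=True)   # center the data first
B0 = u.T @ A0
A0_approx = u[:, :2] @ B0[:2, :]
mse0 = ((A0 - A0_approx)**2).sum(axis=0).mean()
print(f'measured error variance (centered): {mse0}')  # ≈ lam[-1] * (N-1)/N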