Correlation Analysis
Correlation and Covariance
import numpy as np
from numba import njit
@njit
def pearson_correlation(x, y):
dx, dy = x - np.mean(x), y - np.mean(y)
return np.sum(dx * dy) / np.sqrt(np.sum(dx**2) * np.sum(dy**2))
@njit
def correlation_matrix(X):
n_samples, n_features = X.shape
corr = np.zeros((n_features, n_features))
means, stds = np.mean(X, axis=0), np.std(X, axis=0)
for i in range(n_features):
corr[i, i] = 1.0
for j in range(i + 1, n_features):
cov = np.sum((X[:, i] - means[i]) * (X[:, j] - means[j]))
corr[i, j] = corr[j, i] = cov / (n_samples * stds[i] * stds[j])
return corr
@njit
def covariance_matrix(X):
X_c = X - np.mean(X, axis=0)
return (X_c.T @ X_c) / (X.shape[0] - 1)
Canonical Correlation Analysis
def canonical_correlation_analysis(X, Y):
"""CCA: maximize correlation between linear combinations of X and Y"""
from scipy.linalg import sqrtm
n = X.shape[0]
X_c, Y_c = X - X.mean(axis=0), Y - Y.mean(axis=0)
Cxx = X_c.T @ X_c / (n - 1) + np.eye(X_c.shape[1]) * 1e-10
Cyy = Y_c.T @ Y_c / (n - 1) + np.eye(Y_c.shape[1]) * 1e-10
Cxy = X_c.T @ Y_c / (n - 1)
Cxx_inv_sqrt = sqrtm(np.linalg.inv(Cxx))
Cyy_inv_sqrt = sqrtm(np.linalg.inv(Cyy))
U, s, Vt = np.linalg.svd(Cxx_inv_sqrt @ Cxy @ Cyy_inv_sqrt)
return s, Cxx_inv_sqrt @ U, Cyy_inv_sqrt @ Vt.T
Cross-Correlation and Rank Correlation
@njit
def cross_correlation(x, y, max_lag):
"""Cross-correlation for lags from -max_lag to +max_lag"""
n, result = len(x), np.zeros(2 * max_lag + 1)
x_norm, y_norm = (x - np.mean(x)) / np.std(x), (y - np.mean(y)) / np.std(y)
for lag_idx, lag in enumerate(range(-max_lag, max_lag + 1)):
corr, count = 0.0, 0
for i in range(n):
j = i + lag
if 0 <= j < n:
corr += x_norm[i] * y_norm[j]
count += 1
result[lag_idx] = corr / count if count > 0 else 0.0
return result
@njit
def spearman_correlation(x, y):
"""Spearman rank correlation"""
rx = (np.argsort(np.argsort(x)) + 1).astype(np.float64)
ry = (np.argsort(np.argsort(y)) + 1).astype(np.float64)
return pearson_correlation(rx, ry)