# Statistics & Mathematics

Mathematical foundations for data science, machine learning, and statistical analysis.
## Quick Start
```python
import numpy as np
import scipy.stats as stats

# Descriptive statistics
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Std Dev: {np.std(data, ddof=1):.2f}")  # ddof=1 for the sample standard deviation
print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")

# Hypothesis testing
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Reject null hypothesis: significant difference")
else:
    print("Fail to reject null hypothesis")
```
## Core Concepts
### 1. Probability Distributions
```python
import numpy as np
import scipy.stats as stats

# Normal distribution
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)

# PDF/CDF calculations
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {normal_dist.sf(115):.4f}")  # survival function = 1 - CDF
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")

# Binomial distribution (discrete: successes in n trials)
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")

# Poisson distribution (event counts per unit time)
lambda_param = 5
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")

# Central Limit Theorem demonstration: means of samples drawn from a
# skewed population are approximately normally distributed
population = np.random.exponential(scale=10, size=100000)
sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
```
### 2. Hypothesis Testing Framework
```python
import numpy as np
from scipy import stats

class HypothesisTest:
    """Framework for common statistical hypothesis tests."""

    @staticmethod
    def two_sample_ttest(group_a, group_b, alpha=0.05):
        """Independent-samples t-test with Cohen's d effect size."""
        t_stat, p_value = stats.ttest_ind(group_a, group_b)
        # Cohen's d using the pooled sample variance (ddof=1)
        effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(
            (np.var(group_a, ddof=1) + np.var(group_b, ddof=1)) / 2
        )
        return {
            "t_statistic": t_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
            "effect_size_cohens_d": effect_size,
        }

    @staticmethod
    def chi_square_test(observed, expected=None, alpha=0.05):
        """Chi-square test for categorical data.

        Without expected counts, `observed` is treated as a contingency
        table (test of independence); otherwise a goodness-of-fit test runs.
        """
        if expected is None:
            chi2, p_value, dof, expected = stats.chi2_contingency(observed)
        else:
            chi2, p_value = stats.chisquare(observed, expected)
            dof = len(observed) - 1
        return {
            "chi2_statistic": chi2,
            "p_value": p_value,
            "degrees_of_freedom": dof,
            "significant": p_value < alpha,
        }

    @staticmethod
    def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
        """Two-proportion z-test for A/B testing."""
        p_a = conversions_a / total_a
        p_b = conversions_b / total_b
        p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total_a + 1 / total_b))
        z_stat = (p_a - p_b) / se
        p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))  # two-sided
        return {
            "conversion_a": p_a,
            "conversion_b": p_b,
            "lift": (p_b - p_a) / p_a * 100,
            "z_statistic": z_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
        }

# Usage
result = HypothesisTest.ab_test_proportion(
    conversions_a=120, total_a=1000,
    conversions_b=150, total_b=1000,
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
### 3. Linear Algebra Essentials
```python
import numpy as np

# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Basic operations
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))

# Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")

# Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")

# Solving linear systems: Ax = b
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")

# Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
```
### 4. Regression Analysis
```python
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Multiple linear regression with statsmodels
X = np.random.randn(100, 3)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100) * 0.5
X_with_const = sm.add_constant(X)  # statsmodels does not add an intercept by default
ols_model = sm.OLS(y, X_with_const).fit()
print(ols_model.summary())
print(f"R-squared: {ols_model.rsquared:.4f}")
print(f"Coefficients: {ols_model.params}")
print(f"P-values: {ols_model.pvalues}")

# Regularization comparison
X_train, y_train = X[:80], y[:80]
X_test, y_test = X[80:], y[80:]
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={rmse:.4f}")
```
Tools & Technologies
| Tool |
Purpose |
Version (2025) |
| NumPy |
Numerical computing |
1.26+ |
| SciPy |
Scientific computing |
1.12+ |
| pandas |
Data manipulation |
2.2+ |
| statsmodels |
Statistical models |
0.14+ |
| scikit-learn |
ML algorithms |
1.4+ |
## Troubleshooting Guide

| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| Low p-value, small effect | Statistically significant but practically meaningless | Large sample size | Check effect size |
| High variance | Unstable estimates | Small sample, outliers | More data, robust methods |
| Multicollinearity | Inflated standard errors, unstable coefficients | Correlated features | VIF check, remove features |
| Heteroscedasticity | Invalid inference | Non-constant error variance | Weighted least squares |
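For the multicollinearity row, a minimal sketch of the VIF check with statsmodels, reusing `X_with_const` from the regression section; values above roughly 5-10 are the usual warning signs:

```python
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per predictor; column 0 is the intercept, so start at 1
for i in range(1, X_with_const.shape[1]):
    vif = variance_inflation_factor(X_with_const, i)
    print(f"Feature {i}: VIF = {vif:.2f}")
```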
## Best Practices

```python
# ✅ DO: Check assumptions before testing
from scipy.stats import shapiro
stat, p = shapiro(data)
if p > 0.05:
    print("No evidence against normality (fail to reject)")

# ✅ DO: Use effect sizes, not just p-values
# ✅ DO: Correct for multiple comparisons (e.g., Bonferroni)
# ✅ DO: Report confidence intervals

# ❌ DON'T: p-hack by trying many tests
# ❌ DON'T: Confuse correlation with causation
# ❌ DON'T: Ignore sample size requirements
```
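As a concrete version of the multiple-comparisons rule, a minimal sketch with statsmodels; the p-values here are made-up placeholders:

```python
from statsmodels.stats.multitest import multipletests

# Hypothetical p-values from several tests on the same data
p_values = [0.012, 0.034, 0.041, 0.22, 0.003]

# Bonferroni correction controls the family-wise error rate
reject, p_adjusted, _, _ = multipletests(p_values, alpha=0.05, method="bonferroni")
print(f"Adjusted p-values: {p_adjusted}")
print(f"Still significant after correction: {reject}")
```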
## Resources
Skill Certification Checklist: