Matrix Algorithms
Matrix Multiplication
Cache-Optimized (ikj ordering)
@njit
def matmul_ikj(A, B):
m, k = A.shape
k, n = B.shape
C = np.zeros((m, n))
for i in range(m):
for p in range(k):
for j in range(n): # Inner loop: contiguous access
C[i, j] += A[i, p] * B[p, j]
return C
Blocked/Tiled
@njit
def matmul_blocked(A, B, block_size=64):
m, k = A.shape
n = B.shape[1]
C = np.zeros((m, n))
for ii in range(0, m, block_size):
for jj in range(0, n, block_size):
for kk in range(0, k, block_size):
for i in range(ii, min(ii + block_size, m)):
for j in range(jj, min(jj + block_size, n)):
total = C[i, j]
for p in range(kk, min(kk + block_size, k)):
total += A[i, p] * B[p, j]
C[i, j] = total
return C
LU Decomposition
@njit
def lu_decomposition(A):
n = A.shape[0]
L = np.eye(n)
U = A.copy()
for k in range(n - 1):
for i in range(k + 1, n):
L[i, k] = U[i, k] / U[k, k]
for j in range(k, n):
U[i, j] -= L[i, k] * U[k, j]
return L, U
Triangular Solvers
@njit
def forward_sub(L, b):
"""Solve Lx = b (lower triangular)"""
n = len(b)
x = np.zeros(n)
for i in range(n):
x[i] = (b[i] - sum(L[i, j] * x[j] for j in range(i))) / L[i, i]
return x
@njit
def back_sub(U, b):
"""Solve Ux = b (upper triangular)"""
n = len(b)
x = np.zeros(n)
for i in range(n - 1, -1, -1):
x[i] = (b[i] - sum(U[i, j] * x[j] for j in range(i + 1, n))) / U[i, i]
return x
Matrix-Vector Parallel
@njit(parallel=True)
def matvec_parallel(A, x):
m, n = A.shape
y = np.zeros(m)
for i in prange(m):
y[i] = sum(A[i, j] * x[j] for j in range(n))
return y
| Technique |
When to Use |
| Loop reordering (ikj) |
Always - better cache locality |
| Blocking/Tiling |
Large matrices (>1000x1000) |
| Parallelization |
Independent row operations |