Deep Learning
Production-grade deep learning with PyTorch: modern network architectures, training best practices, and model deployment.
Quick Start
# PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=1024, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling over the sequence dimension
        return self.classifier(x)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()

# Training loop with mixed precision
# Assumes train_loader is a DataLoader yielding dicts with "input_ids" and "labels"
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()  # Step once per epoch to match T_max=10
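The loop above only trains; a matching validation pass is sketched below, assuming a val_loader DataLoader that yields the same batch format as train_loader.

# Validation pass (sketch): no gradients, model in eval mode
@torch.no_grad()
def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        logits = model(input_ids)
        total_loss += criterion(logits, labels).item() * labels.size(0)
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)
    return total_loss / total, correct / total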
Core Concepts
1. Modern Neural Network Architectures
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention."""
    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
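As a usage sketch (SmallEncoder and the tensor shapes are illustrative, not part of the original), the attention block above stacks into a small encoder:

# Illustrative sketch: stacking AttentionBlock into a depth-N encoder
class SmallEncoder(nn.Module):
    def __init__(self, d_model: int = 256, depth: int = 4):
        super().__init__()
        self.blocks = nn.ModuleList([AttentionBlock(d_model) for _ in range(depth)])

    def forward(self, x, mask=None):
        for block in self.blocks:
            x = block(x, mask=mask)
        return x

encoder = SmallEncoder()
tokens = torch.randn(2, 16, 256)   # (batch, seq_len, d_model)
out = encoder(tokens)              # same shape: (2, 16, 256)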
2. Training Best Practices
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR

# Gradient clipping and accumulation
def train_epoch(model, loader, optimizer, scaler, accumulation_steps=4):
    """Assumes the model returns a scalar loss when called on a batch."""
    model.train()
    optimizer.zero_grad()
    for i, batch in enumerate(loader):
        with torch.cuda.amp.autocast():
            loss = model(batch) / accumulation_steps
        scaler.scale(loss).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# Early stopping
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience

# Learning rate finder
# train_step runs one forward/backward/update and returns the loss (see the sketch below)
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    for i, batch in enumerate(loader):
        if i >= num_iter:
            break
        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        loss = train_step(model, batch, optimizer)
        lrs.append(lr)
        losses.append(loss)
    return lrs, losses
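find_lr and the profiler snippet in Debug Commands rely on a train_step helper that is not defined here; a minimal sketch, assuming the model returns a scalar loss when called on a batch:

# Minimal sketch of the assumed train_step helper: one forward/backward/update, returns the loss value
def train_step(model, batch, optimizer):
    optimizer.zero_grad()
    loss = model(batch)      # assumed: model(batch) -> scalar loss
    loss.backward()
    optimizer.step()
    return loss.item()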
3. Model Deployment
import torch.onnx
import onnxruntime as ort

# Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )

# ONNX Runtime inference
class ONNXPredictor:
    def __init__(self, model_path: str):
        self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]

# TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
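Before serving an exported model, it is worth confirming numerical parity between PyTorch and ONNX Runtime; a hedged sketch (the helper name and tolerance are illustrative):

import numpy as np

# Sanity-check the export: PyTorch and ONNX Runtime outputs should agree within tolerance
def check_onnx_parity(model, sample_input, onnx_path="model.onnx", atol=1e-4):
    model.eval()
    with torch.no_grad():
        torch_out = model(sample_input).cpu().numpy()
    ort_out = ONNXPredictor(onnx_path).predict(sample_input.cpu().numpy())
    assert np.allclose(torch_out, ort_out, atol=atol), "ONNX output diverges from PyTorch"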
Tools & Technologies
| Tool | Purpose | Version (2025) |
|------|---------|----------------|
| PyTorch | Deep learning framework | 2.2+ |
| PyTorch Lightning | Training framework | 2.2+ |
| Hugging Face | Transformers, datasets | 4.38+ |
| ONNX Runtime | Model inference | 1.17+ |
| TensorRT | GPU optimization | 8.6+ |
| Weights & Biases | Experiment tracking | Latest |
| Ray | Distributed training | 2.9+ |
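wandb is imported in the Quick Start but never wired in; a minimal Weights & Biases tracking sketch (project name, config values, and metric keys are placeholders):

# Minimal experiment-tracking sketch with Weights & Biases
wandb.init(project="text-classifier", config={"lr": 1e-4, "epochs": 10, "d_model": 256})
wandb.watch(model, log="gradients", log_freq=100)   # optional: log gradient/weight histograms

# inside the training loop, after each optimizer step:
wandb.log({"train/loss": loss.item(), "lr": scheduler.get_last_lr()[0]})

wandb.finish()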
Troubleshooting Guide
| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| Vanishing gradient | Loss not decreasing | Deep network, saturating activations | Use ReLU/GELU, residual connections |
| Exploding gradient | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| Overfitting | Train accuracy >> val accuracy | Model too complex | Dropout, regularization, data augmentation |
| OOM error | CUDA out of memory | Batch size too large | Reduce batch size, gradient accumulation |
| Slow training | Low GPU utilization | Data loading bottleneck | More DataLoader workers, prefetching (sketch below) |
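For the data-loading bottleneck row, a sketch of a tuned DataLoader (train_dataset and the worker/prefetch values are illustrative and hardware-dependent):

# Tuned DataLoader for the "Slow training" row
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,            # parallel CPU workers for data loading
    pin_memory=True,          # faster host-to-GPU transfer
    prefetch_factor=4,        # batches prefetched per worker
    persistent_workers=True,  # keep workers alive between epochs
)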
Debug Commands
# Check GPU memory
print(torch.cuda.memory_summary())

# Profile training
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Gradient flow check (run after loss.backward())
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")
Best Practices
# ✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
    output = model(input)

# ✅ DO: Initialize weights properly
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
model.apply(init_weights)

# ✅ DO: Use gradient checkpointing for large models
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x, use_reentrant=False)

# ✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
    param.requires_grad = False

# ❌ DON'T: Leave dropout active during inference — call model.eval() first
model.eval()

# ❌ DON'T: Forget to move data to the same device as the model
batch = {k: v.to(device) for k, v in batch.items()}
Resources
Skill Certification Checklist: