| name | pytorch |
| description | Building and training neural networks with PyTorch. Use when implementing deep learning models, training loops, data pipelines, model optimization with torch.compile, distributed training, or deploying PyTorch models. |
# Using PyTorch
PyTorch is a deep learning framework with dynamic computation graphs, strong GPU acceleration, and Pythonic design. This skill covers practical patterns for building production-quality neural networks.
## Table of Contents
- Core Concepts
- Model Architecture
- Training Loop
- Data Loading
- Performance Optimization
- Distributed Training
- Saving and Loading
## Core Concepts

### Tensors
```python
import torch

# Create tensors
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
x = torch.zeros(3, 4)
x = torch.randn(3, 4)  # Normal distribution

# Device management
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)

# Operations (all return new tensors)
y = x + 1
y = x @ x.T       # Matrix multiplication
y = x.view(2, 6)  # Reshape
```
### Autograd
```python
# Enable gradient tracking
x = torch.randn(3, requires_grad=True)
y = x ** 2
loss = y.sum()

# Compute gradients
loss.backward()
print(x.grad)  # d(loss)/dx = 2 * x

# Disable gradients for inference
with torch.no_grad():
    pred = model(x)

# Or use inference mode (more efficient)
with torch.inference_mode():
    pred = model(x)
```
## Model Architecture

### nn.Module Pattern
```python
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)
```
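A quick usage sketch for the module above (the dimensions are arbitrary examples, not from the original):

```python
# Hypothetical dimensions, chosen only to illustrate the call
model = Model(input_dim=128, hidden_dim=256, output_dim=10)
logits = model(torch.randn(32, 128))  # batch of 32 samples
print(logits.shape)                   # torch.Size([32, 10])
```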
### Common Layers
```python
# Convolution
nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

# Normalization
nn.BatchNorm2d(num_features)
nn.LayerNorm(normalized_shape)

# Attention
nn.MultiheadAttention(embed_dim, num_heads)

# Recurrent
nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
```
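Note that `nn.MultiheadAttention` expects sequence-first inputs by default; a minimal self-attention sketch with `batch_first=True` (shapes are illustrative):

```python
attn = nn.MultiheadAttention(embed_dim=64, num_heads=8, batch_first=True)
x = torch.randn(32, 10, 64)   # (batch, seq_len, embed_dim)
out, weights = attn(x, x, x)  # self-attention: query = key = value
print(out.shape)              # torch.Size([32, 10, 64])
```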
### Weight Initialization
```python
def init_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=0.02)

model.apply(init_weights)
```
## Training Loop

### Standard Pattern
```python
model = Model(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        # Optional: gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

    scheduler.step()  # T_max=num_epochs, so step once per epoch

    # Validation
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            ...  # validation logic (a sketch follows below)
```
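The validation body is elided above; as one possible sketch, a classification accuracy loop could look like the following (metric and variable names are illustrative, not part of the original):

```python
# Hypothetical validation body for a classification model
correct, total, val_loss = 0, 0, 0.0
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        val_loss += criterion(outputs, targets).item() * targets.size(0)
        correct += (outputs.argmax(dim=1) == targets).sum().item()
        total += targets.size(0)
print(f"val loss {val_loss / total:.4f}, accuracy {correct / total:.2%}")
```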
### Mixed Precision Training
```python
# torch.cuda.amp is deprecated in recent PyTorch; use the device-agnostic torch.amp API
from torch.amp import autocast, GradScaler

scaler = GradScaler("cuda")

for batch in train_loader:
    inputs, targets = batch
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()
    with autocast("cuda"):
        outputs = model(inputs)
        loss = criterion(outputs, targets)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```
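If the gradient clipping from the standard loop is combined with AMP, the gradients must be unscaled before clipping; a sketch of the backward/step portion:

```python
scaler.scale(loss).backward()

# Gradients are currently scaled; unscale them in place before clipping
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

scaler.step(optimizer)  # skips the step if inf/NaN gradients were found
scaler.update()
```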
### Gradient Accumulation
```python
# Requires the setup from Mixed Precision Training above:
# scaler, model, criterion, optimizer, device
accumulation_steps = 4

for i, batch in enumerate(train_loader):
    inputs, targets = batch
    inputs, targets = inputs.to(device), targets.to(device)

    with autocast("cuda"):
        outputs = model(inputs)
        loss = criterion(outputs, targets) / accumulation_steps

    scaler.scale(loss).backward()

    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
```
## Data Loading

### Dataset and DataLoader
```python
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        if self.transform:
            x = self.transform(x)
        return x, self.labels[idx]

train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,  # Faster GPU transfer
    drop_last=True,   # Consistent batch sizes
)
```
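`pin_memory=True` pays off when the host-to-GPU copies are made non-blocking, so they can overlap with compute; a sketch of the transfer step inside the training loop:

```python
for inputs, targets in train_loader:
    # Asynchronous copy from pinned host memory to the GPU
    inputs = inputs.to(device, non_blocking=True)
    targets = targets.to(device, non_blocking=True)
    ...
```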
### Collate Functions
```python
def collate_fn(batch):
    """Custom batching for variable-length sequences."""
    inputs, targets = zip(*batch)
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    targets = torch.stack(targets)
    return inputs, targets

loader = DataLoader(dataset, collate_fn=collate_fn)
```
## Performance Optimization

### torch.compile (PyTorch 2.0+)
```python
# Basic compilation
model = torch.compile(model)

# With options
model = torch.compile(
    model,
    mode="reduce-overhead",  # Options: default, reduce-overhead, max-autotune
    fullgraph=True,          # Enforce no graph breaks
)

# Compile specific functions
@torch.compile
def train_step(model, inputs, targets):
    outputs = model(inputs)
    return criterion(outputs, targets)
```
Compilation modes:

- `default`: Good balance of compile time and speedup
- `reduce-overhead`: Minimizes framework overhead, good for small models
- `max-autotune`: Maximum performance, longer compile time
### Memory Optimization
```python
# Activation checkpointing (trade compute for memory)
from torch.utils.checkpoint import checkpoint

class Model(nn.Module):
    def forward(self, x):
        # Recompute activations during backward
        x = checkpoint(self.expensive_layer, x, use_reentrant=False)
        return self.output_layer(x)

# Clear cache
torch.cuda.empty_cache()

# Monitor memory
print(torch.cuda.memory_allocated() / 1e9, "GB")
print(torch.cuda.max_memory_allocated() / 1e9, "GB")
```
## Distributed Training

### DistributedDataParallel (DDP)
```python
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

def train(rank, world_size):
    setup(rank, world_size)

    model = Model().to(rank)
    model = DDP(model, device_ids=[rank])

    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, sampler=sampler)

    for epoch in range(num_epochs):
        sampler.set_epoch(epoch)  # Important for shuffling
        # ... training loop

    cleanup()

# Launch with: torchrun --nproc_per_node=4 train.py
```
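When launched with `torchrun`, each process receives its rank and world size through environment variables; a minimal single-node entry point might look like the following sketch (for multi-node jobs, the global `RANK` differs from the per-node `LOCAL_RANK`):

```python
import os

if __name__ == "__main__":
    # torchrun sets RANK, LOCAL_RANK, and WORLD_SIZE for each process
    rank = int(os.environ["LOCAL_RANK"])      # single-node assumption
    world_size = int(os.environ["WORLD_SIZE"])
    train(rank, world_size)
```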
### FullyShardedDataParallel (FSDP)
```python
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision

mp_policy = MixedPrecision(
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.bfloat16,
    buffer_dtype=torch.bfloat16,
)

model = FSDP(
    model,
    mixed_precision=mp_policy,
    use_orig_params=True,  # Required for torch.compile compatibility
)
```
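FSDP shards at the granularity of wrapped submodules, so an auto-wrap policy is usually supplied as well; a sketch using the size-based policy (the parameter threshold is an arbitrary example):

```python
import functools
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

model = FSDP(
    model,
    mixed_precision=mp_policy,
    use_orig_params=True,
    # Example: shard any submodule with more than ~100M parameters
    auto_wrap_policy=functools.partial(
        size_based_auto_wrap_policy, min_num_params=int(1e8)
    ),
)
```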
## Saving and Loading

### Checkpoints
```python
# Save
torch.save({
    "epoch": epoch,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "loss": loss,
}, "checkpoint.pt")

# Load
checkpoint = torch.load("checkpoint.pt", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
```
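For inference-only use it is common to save just the weights and load them with `weights_only=True`, which restricts unpickling to tensors and other safe types; a brief sketch with an illustrative filename:

```python
# Save weights only
torch.save(model.state_dict(), "model_weights.pt")

# Load for inference
state = torch.load("model_weights.pt", map_location=device, weights_only=True)
model.load_state_dict(state)
model.eval()
```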
### Export for Deployment
```python
# TorchScript
scripted = torch.jit.script(model)
scripted.save("model.pt")

# ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
)
```
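After exporting, it is worth validating the exported graph before shipping it; a minimal check using the `onnx` Python package (assumed to be available as a separate dependency):

```python
import onnx

onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed
```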
## Best Practices
- **Always set model mode**: Use `model.train()` and `model.eval()` appropriately
- **Use `inference_mode` over `no_grad`**: More efficient for inference
- **Pin memory for GPU training**: Set `pin_memory=True` in the DataLoader
- **Profile before optimizing**: Use `torch.profiler` to find bottlenecks (see the sketch after this list)
- **Prefer bfloat16 over float16**: Better numerical stability on modern GPUs
- **Use `torch.compile`**: Significant speedups with minimal code changes
- **Set deterministic mode for reproducibility**:

  ```python
  torch.manual_seed(42)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  ```
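A minimal `torch.profiler` sketch for the profiling recommendation above (the region label and sort key are arbitrary examples):

```python
from torch.profiler import ProfilerActivity, profile, record_function

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("forward_and_loss"):  # label a region of interest
        outputs = model(inputs)
        loss = criterion(outputs, targets)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```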
## References

See `reference/` for detailed documentation:

- `training-patterns.md` - Advanced training techniques
- `debugging.md` - Debugging and profiling tools