Claude Code Plugins

Community-maintained marketplace


TensorFlow, PyTorch, CNNs, RNNs, transformers, neural networks

Install Skill

1. Download skill
2. Enable skills in Claude

Open claude.ai/settings/capabilities and find the "Skills" section

3. Upload to Claude

Click "Upload skill" and select the downloaded ZIP file

Note: Please review the skill's instructions and verify them before using it.

SKILL.md

name: deep-learning
description: PyTorch, TensorFlow, neural networks, CNNs, transformers, and deep learning for production
sasmp_version: 1.3.0
bonded_agent: 06-ml-ai-engineer
bond_type: PRIMARY_BOND
skill_version: 2.0.0
last_updated: 2025-01
complexity: advanced
estimated_mastery_hours: 200
prerequisites: python-programming, machine-learning
unlocks: llms-generative-ai, mlops

Deep Learning

Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.

Quick Start

# PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=1024, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()

# Training loop with mixed precision (train_loader is an assumed DataLoader
# yielding dicts with "input_ids" and "labels")
scaler = torch.cuda.amp.GradScaler()
wandb.init(project="transformer-classifier")  # experiment tracking; project name is illustrative

for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        wandb.log({"train/loss": loss.item(), "epoch": epoch})  # log each step to W&B
    scheduler.step()
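
A matching evaluation pass, as a minimal sketch: val_loader is an assumed DataLoader with the same batch format as train_loader, and criterion/model/device are reused from the loop above.

# Validation pass (sketch); val_loader mirrors train_loader's batch format
model.eval()
correct, total, val_loss = 0, 0, 0.0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        logits = model(input_ids)
        val_loss += criterion(logits, labels).item() * labels.size(0)
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)
print(f"val_loss={val_loss / total:.4f}  val_acc={correct / total:.4f}")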

Core Concepts

1. Modern Neural Network Architectures

import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention."""
    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
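
A quick shape check for the two blocks above; the tensor sizes are illustrative.

# Illustrative shape check for ResidualBlock and AttentionBlock
res_block = ResidualBlock(channels=64)
images = torch.randn(8, 64, 32, 32)     # (batch, channels, height, width)
print(res_block(images).shape)          # torch.Size([8, 64, 32, 32])

attn_block = AttentionBlock(d_model=256, n_heads=8)
tokens = torch.randn(8, 128, 256)       # (batch, seq_len, d_model)
print(attn_block(tokens).shape)         # torch.Size([8, 128, 256])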

2. Training Best Practices

import torch

# Gradient clipping and accumulation (the model here is assumed to return the
# loss directly from its forward pass; adapt if yours returns logits)
def train_epoch(model, loader, optimizer, scaler, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()

    for i, batch in enumerate(loader):
        with torch.cuda.amp.autocast():
            loss = model(batch) / accumulation_steps

        scaler.scale(loss).backward()

        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)  # unscale first so the clip threshold applies to true gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# Early stopping
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience

# Learning rate finder (train_step is an assumed helper that runs one
# optimization step and returns the loss value)
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)

    for i, batch in enumerate(loader):
        if i >= num_iter:
            break

        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        loss = train_step(model, batch, optimizer)
        lrs.append(lr)
        losses.append(loss)

    return lrs, losses
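
A sketch of how these pieces compose across epochs. validate_epoch is an assumed helper that returns the mean validation loss for one pass over an assumed val_loader; train_loader and optimizer come from the Quick Start setup.

# Sketch: wiring train_epoch and EarlyStopping together
scaler = torch.cuda.amp.GradScaler()
early_stopping = EarlyStopping(patience=5)

for epoch in range(50):
    train_epoch(model, train_loader, optimizer, scaler)
    val_loss = validate_epoch(model, val_loader)  # assumed validation helper
    if early_stopping(val_loss):
        print(f"Stopping early at epoch {epoch}")
        break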

3. Model Deployment

import torch.onnx
import onnxruntime as ort

# Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )

# ONNX Runtime inference
class ONNXPredictor:
    def __init__(self, model_path: str):
        self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]

# TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
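
Putting the export and inference paths together, as a sketch: the input shape and dtype assume the TransformerClassifier from the Quick Start section, and the sequence length of 128 is illustrative.

# Sketch: export the Quick Start classifier and run it through ONNX Runtime
sample = torch.randint(0, 30000, (1, 128))   # dummy token ids (batch=1, seq_len=128)
export_to_onnx(model.cpu(), sample)          # export on CPU to avoid device-bound constants
predictor = ONNXPredictor("model.onnx")
logits = predictor.predict(sample.numpy())   # ONNX Runtime expects NumPy inputs
print(logits.shape)                          # (1, n_classes)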

Tools & Technologies

| Tool | Purpose | Version (2025) |
|------|---------|----------------|
| PyTorch | Deep learning framework | 2.2+ |
| PyTorch Lightning | Training framework | 2.2+ |
| Hugging Face | Transformers, datasets | 4.38+ |
| ONNX Runtime | Model inference | 1.17+ |
| TensorRT | GPU optimization | 8.6+ |
| Weights & Biases | Experiment tracking | Latest |
| Ray | Distributed training | 2.9+ |
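
As a taste of how PyTorch Lightning wraps the raw loop from Quick Start, here is a minimal sketch assuming the lightning 2.x package; the module name and hyperparameters are illustrative.

# Minimal PyTorch Lightning sketch (assumes the lightning 2.x package)
import lightning as L
import torch
import torch.nn as nn

class LitClassifier(L.LightningModule):
    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.criterion = nn.CrossEntropyLoss()

    def training_step(self, batch, batch_idx):
        logits = self.model(batch["input_ids"])
        loss = self.criterion(logits, batch["labels"])
        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-4)

# The trainer handles device placement, mixed precision, and checkpointing
trainer = L.Trainer(max_epochs=10, precision="16-mixed")
# trainer.fit(LitClassifier(model), train_loader)  # train_loader assumed as above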

Troubleshooting Guide

| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| Vanishing gradient | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| Exploding gradient | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| Overfitting | Train >> val accuracy | Model too complex | Dropout, regularization, data augmentation |
| OOM error | CUDA out of memory | Batch too large | Reduce batch size, gradient accumulation |
| Slow training | Low GPU utilization | Data loading bottleneck | More workers, prefetching (see the sketch below) |
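
For the data-loading bottleneck above, a sketch of a faster DataLoader configuration; train_dataset is an assumed torch Dataset, and the worker/prefetch values are illustrative starting points to tune against your CPU count.

from torch.utils.data import DataLoader

# Illustrative DataLoader settings to keep the GPU fed
train_loader = DataLoader(
    train_dataset,            # assumed torch Dataset
    batch_size=64,
    shuffle=True,
    num_workers=8,            # tune to available CPU cores
    pin_memory=True,          # faster host-to-GPU copies when training on CUDA
    prefetch_factor=4,
    persistent_workers=True,  # avoid re-spawning workers every epoch
)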

Debug Commands

# Check GPU memory
print(torch.cuda.memory_summary())

# Profile one training step (train_step is an assumed helper that runs a single forward/backward pass)
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Gradient flow check
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")
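
When loss goes NaN, autograd's anomaly mode can point at the offending operation; a sketch reusing the model, criterion, batch, and device from above. It is slow, so enable it only while debugging.

# Locate the op that produced NaN/Inf gradients (debug only: very slow)
with torch.autograd.set_detect_anomaly(True):
    logits = model(batch["input_ids"].to(device))
    loss = criterion(logits, batch["labels"].to(device))
    loss.backward()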

Best Practices

# ✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
    output = model(input)

# ✅ DO: Initialize weights properly
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

# ✅ DO: Use gradient checkpointing for large models (inside a module's forward)
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x, use_reentrant=False)

# ✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
    param.requires_grad = False

# ❌ DON'T: Leave dropout/batchnorm in training mode at inference
model.eval()

# ❌ DON'T: Forget to move data to the same device as the model
inputs = batch["input_ids"].to(device)

Skill Certification Checklist:

  • Can build and train neural networks in PyTorch
  • Can implement attention mechanisms and transformers
  • Can use mixed precision and gradient accumulation
  • Can export models to ONNX/TorchScript
  • Can debug training issues (gradients, memory)