| name | ml-integration-patterns |
| description | Machine learning integration patterns for rRNA-Phylo covering three use cases: rRNA sequence classification (supervised learning with sklearn/PyTorch), multi-tree consensus (ensemble methods), and generative tree synthesis (GNNs/transformers). Includes feature engineering, model training, hyperparameter tuning, model serving, versioning, and evaluation metrics for bioinformatics ML workflows. |
# ML Integration Patterns

## Purpose

Provide comprehensive patterns for integrating machine learning into the rRNA-Phylo project across three distinct use cases: rRNA classification, phylogenetic tree consensus, and generative tree synthesis.

## When to Use

This skill activates when:

- Training ML models for rRNA detection
- Implementing feature engineering for sequences
- Building ensemble methods for tree consensus
- Working with graph neural networks for trees
- Serving models and running inference
- Evaluating models and choosing metrics
- Tuning hyperparameters
- Versioning and deploying models
## Three ML Use Cases

### 1. rRNA Sequence Classification (Supervised Learning)

**Goal**: Classify sequences into rRNA types (16S, 18S, 23S, etc.)

### 2. Multi-Tree Consensus (Ensemble Methods)

**Goal**: Combine multiple phylogenetic trees into a reliable consensus

### 3. Generative Tree Synthesis (Deep Learning)

**Goal**: Generate optimized phylogenetic trees using generative models
## Use Case 1: rRNA Sequence Classification

### Overview

Train a supervised classifier to identify the rRNA type from a DNA/RNA sequence. This complements traditional methods (HMM search, BLAST) and can be faster at inference time with comparable accuracy.

### Feature Engineering
#### Pattern 1: K-mer Frequency Vectors

```python
# app/services/ml/features/kmers.py
from collections import Counter
from itertools import product
from typing import Dict

import numpy as np


class KmerFeatureExtractor:
    """Extract k-mer frequency features from sequences."""

    def __init__(self, k: int = 6, normalize: bool = True):
        """
        Initialize k-mer extractor.

        Args:
            k: K-mer size (default 6)
            normalize: Whether to normalize frequencies
        """
        self.k = k
        self.normalize = normalize
        self.vocab = self._build_vocab()

    def _build_vocab(self) -> Dict[str, int]:
        """Build k-mer vocabulary."""
        bases = ['A', 'C', 'G', 'T']
        kmers = [''.join(p) for p in product(bases, repeat=self.k)]
        return {kmer: idx for idx, kmer in enumerate(kmers)}

    def extract(self, sequence: str) -> np.ndarray:
        """
        Extract k-mer features from sequence.

        Args:
            sequence: DNA or RNA sequence

        Returns:
            Feature vector of shape (4^k,)
        """
        sequence = sequence.upper().replace('U', 'T')

        # Count k-mers
        kmers = [
            sequence[i:i + self.k]
            for i in range(len(sequence) - self.k + 1)
        ]
        counts = Counter(kmers)

        # Build feature vector (k-mers with ambiguous bases are skipped)
        features = np.zeros(len(self.vocab))
        for kmer, count in counts.items():
            if kmer in self.vocab:
                features[self.vocab[kmer]] = count

        # Normalize counts to frequencies
        if self.normalize and features.sum() > 0:
            features = features / features.sum()

        return features
```
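A minimal usage sketch (the sequences and labels below are placeholders, not project data) showing how the extractor produces a feature matrix suitable for the classifiers further down:

```python
# Hypothetical usage sketch: build an (n_samples, 4^k) feature matrix.
import numpy as np

from app.services.ml.features.kmers import KmerFeatureExtractor

extractor = KmerFeatureExtractor(k=6)

sequences = ["ACGUACGUACGUACGU", "GGCCTTAAGGCCTTAA"]  # placeholder sequences
labels = ["16S", "18S"]                               # placeholder rRNA types

X = np.vstack([extractor.extract(seq) for seq in sequences])
print(X.shape)  # (2, 4096) for k=6
```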
#### Pattern 2: Positional Encoding + One-Hot

```python
# app/services/ml/features/sequence_encoding.py
import numpy as np


class SequenceEncoder:
    """Encode sequences for deep learning models."""

    def __init__(self, max_length: int = 2000):
        self.max_length = max_length
        self.base_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}

    def one_hot_encode(
        self,
        sequence: str,
        pad: bool = True
    ) -> np.ndarray:
        """
        One-hot encode sequence.

        Args:
            sequence: DNA sequence
            pad: Whether to pad to max_length

        Returns:
            Array of shape (length, 5) or (max_length, 5) if padded
        """
        sequence = sequence.upper().replace('U', 'T')

        # Truncate if needed
        if len(sequence) > self.max_length:
            sequence = sequence[:self.max_length]

        # Encode
        encoded = np.zeros((len(sequence), 5))
        for i, base in enumerate(sequence):
            idx = self.base_to_idx.get(base, 4)  # 4 for unknown/ambiguous bases
            encoded[i, idx] = 1

        # Pad
        if pad and len(sequence) < self.max_length:
            padding = np.zeros((self.max_length - len(sequence), 5))
            encoded = np.vstack([encoded, padding])

        return encoded

    def positional_encode(
        self,
        sequence: str
    ) -> np.ndarray:
        """
        Add positional encoding (for transformers).

        Returns:
            Array of shape (length, 5 + positional_dims)
        """
        # One-hot encode (may be truncated to max_length)
        one_hot = self.one_hot_encode(sequence, pad=False)

        # Add a single sinusoidal positional channel (simplified)
        positions = np.arange(one_hot.shape[0])[:, np.newaxis]
        pos_encoding = np.sin(positions / 10000)

        return np.hstack([one_hot, pos_encoding])
```
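A quick sketch (placeholder sequences) of how the one-hot output lines up with the CNN defined below, which expects a `(batch, seq_length, 5)` layout:

```python
# Hypothetical usage sketch: one-hot encode a small batch for the CNN below.
import numpy as np

from app.services.ml.features.sequence_encoding import SequenceEncoder

encoder = SequenceEncoder(max_length=2000)
batch = np.stack([
    encoder.one_hot_encode(seq)            # each is (2000, 5) after padding
    for seq in ["ACGUACGU", "GGCCTTAA"]    # placeholder sequences
])
print(batch.shape)  # (2, 2000, 5) -- matches the CNN's expected input layout
```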
### Model Training (Classical ML)

#### Pattern: sklearn Pipeline

```python
# app/services/ml/models/rrna_classifier.py
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


class RRNAClassifier:
    """Classical ML classifier for rRNA type prediction."""

    def __init__(self, model_type: str = "random_forest"):
        self.model_type = model_type
        self.pipeline = self._build_pipeline()
        self.classes_ = None

    def _build_pipeline(self) -> Pipeline:
        """Build sklearn pipeline."""
        if self.model_type == "random_forest":
            return Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(
                    n_estimators=200,
                    max_depth=20,
                    min_samples_split=5,
                    random_state=42,
                    n_jobs=-1
                ))
            ])
        # Add other models (XGBoost, SVM, etc.) here
        raise ValueError(f"Unknown model type: {self.model_type}")

    def train(
        self,
        X: np.ndarray,
        y: np.ndarray,
        test_size: float = 0.2,
        tune_hyperparameters: bool = False
    ) -> dict:
        """
        Train the classifier.

        Args:
            X: Feature matrix (n_samples, n_features)
            y: Labels (n_samples,)
            test_size: Test set proportion
            tune_hyperparameters: Whether to run GridSearch

        Returns:
            Training metrics
        """
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Hyperparameter tuning
        if tune_hyperparameters:
            self.pipeline = self._tune_hyperparameters(X_train, y_train)
        else:
            self.pipeline.fit(X_train, y_train)

        # Evaluate on the held-out test set
        y_pred = self.pipeline.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        conf_matrix = confusion_matrix(y_test, y_pred)

        self.classes_ = self.pipeline.classes_

        return {
            "accuracy": report["accuracy"],
            "classification_report": report,
            "confusion_matrix": conf_matrix.tolist(),
            "train_size": len(X_train),
            "test_size": len(X_test)
        }

    def _tune_hyperparameters(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray
    ) -> Pipeline:
        """Grid search for best hyperparameters."""
        param_grid = {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10]
        }

        grid_search = GridSearchCV(
            self.pipeline,
            param_grid,
            cv=5,
            scoring='f1_weighted',
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train, y_train)

        print(f"Best parameters: {grid_search.best_params_}")
        return grid_search.best_estimator_

    def predict(
        self,
        X: np.ndarray,
        return_proba: bool = False
    ) -> np.ndarray:
        """Make predictions."""
        if return_proba:
            return self.pipeline.predict_proba(X)
        return self.pipeline.predict(X)

    def save(self, path: str):
        """Save model to disk."""
        joblib.dump(self.pipeline, path)

    @classmethod
    def load(cls, path: str):
        """Load model from disk."""
        instance = cls()
        instance.pipeline = joblib.load(path)
        instance.classes_ = instance.pipeline.classes_
        return instance
```
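A minimal end-to-end training sketch, assuming a labeled dataset of sequences already exists; `load_labeled_sequences` is a hypothetical placeholder for whatever loads that data:

```python
# Hypothetical training sketch: k-mer features -> RandomForest -> saved model.
import numpy as np

from app.services.ml.features.kmers import KmerFeatureExtractor
from app.services.ml.models.rrna_classifier import RRNAClassifier

sequences, labels = load_labeled_sequences()  # hypothetical: list[str], list[str]

extractor = KmerFeatureExtractor(k=6)
X = np.vstack([extractor.extract(s) for s in sequences])
y = np.array(labels)  # e.g. "16S", "18S", "23S"

clf = RRNAClassifier(model_type="random_forest")
metrics = clf.train(X, y, tune_hyperparameters=False)
print(f"held-out accuracy: {metrics['accuracy']:.3f}")

clf.save("models/rrna_classifier_v1.joblib")
```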
### Model Training (Deep Learning)

#### Pattern: PyTorch CNN for Sequences

```python
# app/services/ml/models/rrna_cnn.py
from typing import List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


class SequenceDataset(Dataset):
    """Dataset for sequence classification."""

    def __init__(self, sequences: List[np.ndarray], labels: List[int]):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return (
            torch.FloatTensor(self.sequences[idx]),
            torch.LongTensor([self.labels[idx]])[0]
        )


class RRNACNNClassifier(nn.Module):
    """CNN for rRNA sequence classification."""

    def __init__(
        self,
        num_classes: int = 6,
        seq_length: int = 2000,
        num_filters: int = 128
    ):
        super().__init__()

        # Convolutional layers
        self.conv1 = nn.Conv1d(5, num_filters, kernel_size=7, padding=3)
        self.conv2 = nn.Conv1d(num_filters, num_filters * 2, kernel_size=5, padding=2)
        self.conv3 = nn.Conv1d(num_filters * 2, num_filters * 4, kernel_size=3, padding=1)

        # Pooling
        self.pool = nn.MaxPool1d(2)

        # Global pooling
        self.global_pool = nn.AdaptiveAvgPool1d(1)

        # Fully connected
        self.fc1 = nn.Linear(num_filters * 4, 256)
        self.fc2 = nn.Linear(256, num_classes)

        # Regularization
        self.dropout = nn.Dropout(0.5)
        self.batch_norm1 = nn.BatchNorm1d(num_filters)
        self.batch_norm2 = nn.BatchNorm1d(num_filters * 2)

    def forward(self, x):
        # x shape: (batch, seq_length, 5)
        x = x.transpose(1, 2)  # (batch, 5, seq_length)

        # Conv blocks
        x = self.pool(torch.relu(self.batch_norm1(self.conv1(x))))
        x = self.pool(torch.relu(self.batch_norm2(self.conv2(x))))
        x = self.pool(torch.relu(self.conv3(x)))

        # Global pooling
        x = self.global_pool(x).squeeze(-1)

        # FC layers
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.fc2(x)

        return x


class RRNACNNTrainer:
    """Trainer for CNN model."""

    def __init__(
        self,
        model: RRNACNNClassifier,
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        self.model = model.to(device)
        self.device = device

    def train(
        self,
        train_loader: DataLoader,
        val_loader: DataLoader,
        epochs: int = 50,
        lr: float = 0.001
    ) -> dict:
        """Train the model."""
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', patience=5, factor=0.5
        )

        history = {"train_loss": [], "val_loss": [], "val_acc": []}

        for epoch in range(epochs):
            # Training
            self.model.train()
            train_loss = 0
            for sequences, labels in train_loader:
                sequences = sequences.to(self.device)
                labels = labels.to(self.device)

                optimizer.zero_grad()
                outputs = self.model(sequences)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            train_loss /= len(train_loader)

            # Validation
            val_loss, val_acc = self._validate(val_loader, criterion)
            scheduler.step(val_loss)

            history["train_loss"].append(train_loss)
            history["val_loss"].append(val_loss)
            history["val_acc"].append(val_acc)

            print(f"Epoch {epoch+1}/{epochs}: "
                  f"Train Loss={train_loss:.4f}, "
                  f"Val Loss={val_loss:.4f}, "
                  f"Val Acc={val_acc:.4f}")

        return history

    def _validate(
        self,
        loader: DataLoader,
        criterion: nn.Module
    ) -> Tuple[float, float]:
        """Validate the model."""
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for sequences, labels in loader:
                sequences = sequences.to(self.device)
                labels = labels.to(self.device)

                outputs = self.model(sequences)
                loss = criterion(outputs, labels)
                total_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        return total_loss / len(loader), correct / total

    def save(self, path: str):
        """Save model checkpoint."""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'model_config': {
                'num_classes': self.model.fc2.out_features,
                # Add other config (seq_length, num_filters) as needed
            }
        }, path)

    @classmethod
    def load(cls, path: str):
        """Load model checkpoint."""
        checkpoint = torch.load(path)
        model = RRNACNNClassifier(**checkpoint['model_config'])
        model.load_state_dict(checkpoint['model_state_dict'])
        return cls(model)
```
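A training sketch wiring the encoder, dataset, and trainer together; `load_labeled_sequences` is again a hypothetical loader, here returning sequences plus integer class indices:

```python
# Hypothetical training sketch: SequenceEncoder + SequenceDataset + RRNACNNTrainer.
from torch.utils.data import DataLoader

from app.services.ml.features.sequence_encoding import SequenceEncoder
from app.services.ml.models.rrna_cnn import (
    RRNACNNClassifier, RRNACNNTrainer, SequenceDataset,
)

sequences, labels = load_labeled_sequences()  # hypothetical; labels are ints (class indices)
encoder = SequenceEncoder(max_length=2000)

encoded = [encoder.one_hot_encode(s) for s in sequences]  # each is (2000, 5)
split = int(0.8 * len(encoded))

train_ds = SequenceDataset(encoded[:split], labels[:split])
val_ds = SequenceDataset(encoded[split:], labels[split:])

trainer = RRNACNNTrainer(RRNACNNClassifier(num_classes=6))
history = trainer.train(
    DataLoader(train_ds, batch_size=32, shuffle=True),
    DataLoader(val_ds, batch_size=32),
    epochs=50,
)
trainer.save("models/rrna_cnn_v1.pt")
```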
### Model Serving

#### Pattern: FastAPI Integration

```python
# app/services/ml/inference/rrna_predictor.py
from typing import Dict

import numpy as np

from app.services.ml.features.kmers import KmerFeatureExtractor
from app.services.ml.models.rrna_classifier import RRNAClassifier


class RRNAMLPredictor:
    """ML-based rRNA predictor (production)."""

    def __init__(self, model_path: str, feature_extractor: str = "kmer"):
        self.model = RRNAClassifier.load(model_path)
        # Only the k-mer extractor is wired up so far
        self.feature_extractor = KmerFeatureExtractor(k=6)

    async def predict(
        self,
        sequence: str,
        return_confidence: bool = True
    ) -> Dict:
        """
        Predict rRNA type.

        Args:
            sequence: DNA sequence
            return_confidence: Return confidence scores

        Returns:
            Prediction with confidence
        """
        # Extract features
        features = self.feature_extractor.extract(sequence)
        features = features.reshape(1, -1)

        # Predict
        if return_confidence:
            proba = self.model.predict(features, return_proba=True)[0]
            pred_idx = np.argmax(proba)
            pred_class = self.model.classes_[pred_idx]

            return {
                "predicted_type": pred_class,
                "confidence": float(proba[pred_idx]),
                "all_probabilities": {
                    self.model.classes_[i]: float(p)
                    for i, p in enumerate(proba)
                }
            }
        else:
            pred = self.model.predict(features)[0]
            return {"predicted_type": pred}
```

```python
# app/api/v1/rrna.py (API endpoint)
from fastapi import APIRouter

from app.services.ml.inference.rrna_predictor import RRNAMLPredictor

router = APIRouter()
ml_predictor = RRNAMLPredictor("models/rrna_classifier_v1.joblib")


@router.post("/ml-detect")
async def ml_detect_rrna(sequence: str):
    """Detect rRNA using ML model."""
    result = await ml_predictor.predict(sequence)
    return result
```
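A request sketch for local testing; the mount prefix and the application entrypoint (`app.main:app`, `/api/v1/rrna`) are assumptions that depend on how the project actually includes this router:

```python
# Hypothetical request sketch using FastAPI's TestClient.
from fastapi.testclient import TestClient

from app.main import app  # assumed FastAPI application entrypoint

client = TestClient(app)
resp = client.post(
    "/api/v1/rrna/ml-detect",            # assumed prefix for this router
    params={"sequence": "ACGTACGTACGT"},  # scalar args become query params
)
print(resp.json())  # predicted type plus per-class probabilities
```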
## Use Case 2: Multi-Tree Consensus (Ensemble)

### Overview

Combine multiple phylogenetic trees (built with different methods) into a single consensus tree. This is an ensemble approach, analogous to model ensembling in ML.

### Tree Representation
```python
# app/services/ml/phylo/tree_representation.py
from typing import Dict, List, Set

from ete3 import Tree


class TreeEnsemble:
    """Ensemble of phylogenetic trees."""

    def __init__(self, trees: List[Tree], methods: List[str]):
        """
        Initialize tree ensemble.

        Args:
            trees: List of ete3 Tree objects
            methods: Method used for each tree (e.g. "ml", "nj", "parsimony")
        """
        self.trees = trees
        self.methods = methods
        self.taxa = self._get_taxa()

    def _get_taxa(self) -> Set[str]:
        """Get all taxa across trees."""
        taxa = set()
        for tree in self.trees:
            taxa.update([leaf.name for leaf in tree.get_leaves()])
        return taxa

    def get_bipartitions(self) -> List[Dict]:
        """
        Extract bipartitions from all trees.

        Returns:
            List of bipartitions with frequencies
        """
        bipart_counts = {}

        for tree in self.trees:
            for node in tree.traverse():
                if not node.is_leaf():
                    # Get leaves under this node
                    leaves = set([leaf.name for leaf in node.get_leaves()])
                    # Create bipartition (as frozenset for hashing)
                    bipart = frozenset(leaves)
                    # Count
                    if bipart not in bipart_counts:
                        bipart_counts[bipart] = 0
                    bipart_counts[bipart] += 1

        # Convert to list with frequencies
        bipartitions = [
            {
                "bipartition": list(bipart),
                "frequency": count / len(self.trees),
                "count": count
            }
            for bipart, count in bipart_counts.items()
        ]

        return sorted(bipartitions, key=lambda x: x["frequency"], reverse=True)
```
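A small usage sketch with three toy Newick trees (placeholder topologies and method labels), showing how often each clade appears across the ensemble:

```python
# Hypothetical usage sketch: build an ensemble and inspect clade frequencies.
from ete3 import Tree

from app.services.ml.phylo.tree_representation import TreeEnsemble

newicks = [
    "((A,B),(C,D));",
    "((A,C),(B,D));",
    "((A,B),(C,D));",
]
ensemble = TreeEnsemble(
    trees=[Tree(n) for n in newicks],
    methods=["ml", "nj", "parsimony"],  # illustrative method labels
)

for b in ensemble.get_bipartitions():
    print(sorted(b["bipartition"]), f"{b['frequency']:.2f}")
```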
### Consensus Methods

#### Pattern: Majority-Rule Consensus

```python
# app/services/ml/phylo/consensus.py
from typing import Dict, List

from ete3 import Tree

from app.services.ml.phylo.tree_representation import TreeEnsemble


class ConsensusTreeBuilder:
    """Build consensus trees from multiple input trees."""

    def __init__(self, ensemble: TreeEnsemble):
        self.ensemble = ensemble

    def strict_consensus(self) -> Tree:
        """
        Build strict consensus tree.

        Only includes bipartitions present in ALL trees.
        """
        bipartitions = self.ensemble.get_bipartitions()

        # Keep only bipartitions with 100% frequency
        strict_biparts = [
            b for b in bipartitions
            if b["frequency"] == 1.0
        ]

        return self._build_tree_from_bipartitions(strict_biparts)

    def majority_rule_consensus(self, threshold: float = 0.5) -> Tree:
        """
        Build majority-rule consensus tree.

        Args:
            threshold: Minimum frequency to include a bipartition

        Returns:
            Consensus tree
        """
        bipartitions = self.ensemble.get_bipartitions()

        # Filter by threshold
        majority_biparts = [
            b for b in bipartitions
            if b["frequency"] >= threshold
        ]

        tree = self._build_tree_from_bipartitions(majority_biparts)

        # Annotate internal nodes with support values (bipartition frequencies)
        for node in tree.traverse():
            if not node.is_leaf():
                leaves = set([leaf.name for leaf in node.get_leaves()])
                # Find matching bipartition
                for bipart in majority_biparts:
                    if set(bipart["bipartition"]) == leaves:
                        node.support = bipart["frequency"]
                        break

        return tree

    def weighted_consensus(self, weights: List[float]) -> Tree:
        """
        Build weighted consensus tree.

        Args:
            weights: Weight for each input tree (e.g. by method reliability)

        Returns:
            Consensus tree with weighted support
        """
        # Same idea as majority-rule, but each tree contributes its weight
        # (rather than 1) to a bipartition's frequency.
        raise NotImplementedError

    def _build_tree_from_bipartitions(
        self,
        bipartitions: List[Dict]
    ) -> Tree:
        """
        Construct a tree from compatible bipartitions.

        This is non-trivial! Use a greedy insertion algorithm or an exact
        method. The version below is a simplified placeholder, not a working
        implementation.
        """
        # Start with a star tree over all taxa
        taxa = list(self.ensemble.taxa)
        tree = Tree()
        for taxon in taxa:
            tree.add_child(name=taxon)

        # Add bipartitions greedily, from most to least frequent, skipping
        # any that conflict with clades already placed.
        for bipart in bipartitions:
            leaves_in_bipart = set(bipart["bipartition"])
            # Group the matching leaves under a new internal node (omitted)
            pass

        return tree
```
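Bipartition-to-tree reconstruction is easy to get wrong, so an off-the-shelf implementation is useful as a sanity check or fallback. A sketch using Biopython's consensus functions, assuming Biopython is installed and the input trees are available as Newick strings (toy topologies below):

```python
# Sketch using Biopython's consensus implementations as a reference/fallback.
from io import StringIO

from Bio import Phylo
from Bio.Phylo.Consensus import majority_consensus, strict_consensus

newicks = ["((A,B),(C,D));", "((A,C),(B,D));", "((A,B),(C,D));"]
trees = [Phylo.read(StringIO(n), "newick") for n in newicks]

maj = majority_consensus(trees, cutoff=0.5)  # majority-rule consensus
strict = strict_consensus(trees)             # strict consensus
Phylo.draw_ascii(maj)
```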
### Conflict Detection

```python
# app/services/ml/phylo/conflict.py
from typing import Dict, List

from ete3 import Tree

from app.services.ml.phylo.tree_representation import TreeEnsemble


class TreeConflictAnalyzer:
    """Analyze conflicts between trees."""

    def __init__(self, ensemble: TreeEnsemble):
        self.ensemble = ensemble

    def find_conflicts(self) -> List[Dict]:
        """
        Find conflicting bipartitions across trees.

        Returns:
            List of conflicts with details
        """
        bipartitions = self.ensemble.get_bipartitions()
        conflicts = []

        for i, b1 in enumerate(bipartitions):
            for b2 in bipartitions[i + 1:]:
                if self._are_conflicting(b1, b2):
                    conflicts.append({
                        "bipartition1": b1,
                        "bipartition2": b2,
                        "conflict_type": self._classify_conflict(b1, b2)
                    })

        return conflicts

    def _are_conflicting(
        self,
        b1: Dict,
        b2: Dict
    ) -> bool:
        """Check if two bipartitions conflict."""
        set1 = set(b1["bipartition"])
        set2 = set(b2["bipartition"])

        # Two clades are incompatible (cannot occur in the same tree) if they
        # overlap without one containing the other.
        intersect = set1 & set2
        return (
            len(intersect) > 0 and
            len(intersect) < len(set1) and
            len(intersect) < len(set2)
        )

    def _classify_conflict(self, b1: Dict, b2: Dict) -> str:
        """Crude conflict label based on overlap size (simple placeholder)."""
        set1 = set(b1["bipartition"])
        set2 = set(b2["bipartition"])
        overlap = len(set1 & set2) / min(len(set1), len(set2))
        return "major" if overlap >= 0.5 else "minor"

    def robinson_foulds_distance(
        self,
        tree1: Tree,
        tree2: Tree
    ) -> int:
        """
        Calculate Robinson-Foulds distance.

        Measures topological difference between trees.
        """
        result = tree1.robinson_foulds(tree2)
        return result[0]  # RF distance is the first element of the result tuple
```
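A usage sketch computing pairwise Robinson-Foulds distances across the ensemble built in the earlier example (`ensemble` is assumed to be in scope):

```python
# Hypothetical usage sketch: pairwise RF distances between the ensemble's trees.
from itertools import combinations

from app.services.ml.phylo.conflict import TreeConflictAnalyzer

analyzer = TreeConflictAnalyzer(ensemble)  # `ensemble` built as in the example above

for (i, t1), (j, t2) in combinations(enumerate(ensemble.trees), 2):
    rf = analyzer.robinson_foulds_distance(t1, t2)
    print(f"RF({ensemble.methods[i]}, {ensemble.methods[j]}) = {rf}")
```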
## Use Case 3: GenAI Tree Synthesis (Advanced)

### Overview

Use generative AI (graph neural networks or transformers) to synthesize phylogenetic trees from multiple input trees. This is experimental, research-level work.

### Graph Representation
```python
# app/services/ml/genai/tree_graph.py
import torch
from ete3 import Tree
from torch_geometric.data import Data


class TreeToGraphConverter:
    """Convert phylogenetic trees to graph representation."""

    def __init__(self):
        self.taxa_to_idx = {}

    def tree_to_graph(self, tree: Tree) -> Data:
        """
        Convert tree to PyTorch Geometric graph.

        Returns:
            Graph data object
        """
        # Assign indices to all nodes
        nodes = list(tree.traverse())
        node_to_idx = {id(node): i for i, node in enumerate(nodes)}

        # Build edge list
        edges = []
        for node in nodes:
            if node.up:
                # Add edge from parent to child (and back, for an undirected graph)
                parent_idx = node_to_idx[id(node.up)]
                child_idx = node_to_idx[id(node)]
                edges.append([parent_idx, child_idx])
                edges.append([child_idx, parent_idx])

        edge_index = torch.tensor(edges, dtype=torch.long).t()

        # Node features (simplified)
        node_features = []
        for node in nodes:
            if node.is_leaf():
                # Leaf node: one-hot encode taxon
                features = self._encode_taxon(node.name)
            else:
                # Internal node: aggregate features
                features = torch.zeros(100)  # Placeholder
            node_features.append(features)

        x = torch.stack(node_features)

        return Data(x=x, edge_index=edge_index)

    def _encode_taxon(self, taxon: str) -> torch.Tensor:
        """Encode taxon name."""
        if taxon not in self.taxa_to_idx:
            self.taxa_to_idx[taxon] = len(self.taxa_to_idx)

        idx = self.taxa_to_idx[taxon]

        # One-hot encoding
        encoding = torch.zeros(100)  # Assumes at most 100 taxa
        encoding[idx] = 1
        return encoding
```
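A quick sketch converting a toy tree and inspecting the resulting graph object:

```python
# Hypothetical usage sketch: convert a small tree and inspect the graph.
from ete3 import Tree

from app.services.ml.genai.tree_graph import TreeToGraphConverter

converter = TreeToGraphConverter()
graph = converter.tree_to_graph(Tree("((A,B),(C,D));"))

print(graph.num_nodes)         # leaves + internal nodes
print(graph.edge_index.shape)  # (2, 2 * num_edges): each edge is stored both ways
```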
### GNN Model (Experimental)

```python
# app/services/ml/genai/tree_gnn.py
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, global_mean_pool


class TreeGNN(nn.Module):
    """Graph Neural Network for tree encoding."""

    def __init__(
        self,
        input_dim: int = 100,
        hidden_dim: int = 128,
        embedding_dim: int = 64
    ):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, embedding_dim)

    def forward(self, data):
        # data.batch is set when Data objects are batched (e.g. Batch.from_data_list)
        x, edge_index = data.x, data.edge_index

        # Graph convolutions
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)

        # Global pooling (tree-level embedding)
        embedding = global_mean_pool(x, data.batch)

        return embedding


class TreeGenerator(nn.Module):
    """Generate trees from embeddings (very experimental)."""

    def __init__(self, embedding_dim: int = 64, max_nodes: int = 20):
        super().__init__()
        self.max_nodes = max_nodes

        # Decoder: embedding -> tree structure (as a dense adjacency matrix)
        self.fc1 = nn.Linear(embedding_dim, 256)
        self.fc2 = nn.Linear(256, max_nodes * max_nodes)

    def forward(self, embedding):
        x = torch.relu(self.fc1(embedding))
        adjacency_logits = self.fc2(x)

        # Reshape to adjacency matrix
        adj_matrix = adjacency_logits.view(-1, self.max_nodes, self.max_nodes)

        # Apply sigmoid to get edge probabilities
        adj_matrix = torch.sigmoid(adj_matrix)

        # Make symmetric (undirected tree)
        adj_matrix = (adj_matrix + adj_matrix.transpose(1, 2)) / 2

        return adj_matrix


# A training loop would learn to:
# 1. Encode multiple input trees into embeddings
# 2. Combine the embeddings (e.g. average them)
# 3. Decode the combined embedding into a new tree
# 4. Optimize for tree validity + likelihood
```
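A hedged sketch of steps 1-3 above; loss design (step 4) and decoding the adjacency matrix back into a valid phylogenetic tree are the hard, open parts and are intentionally omitted. The `ensemble` variable is the one from the consensus examples:

```python
# Hypothetical sketch of steps 1-3: encode input trees, combine, decode.
import torch
from torch_geometric.data import Batch

from app.services.ml.genai.tree_gnn import TreeGenerator, TreeGNN
from app.services.ml.genai.tree_graph import TreeToGraphConverter

converter = TreeToGraphConverter()
encoder = TreeGNN(input_dim=100, embedding_dim=64)
generator = TreeGenerator(embedding_dim=64, max_nodes=20)

graphs = [converter.tree_to_graph(t) for t in ensemble.trees]  # ensemble as above
batch = Batch.from_data_list(graphs)   # sets .batch so global_mean_pool pools per tree

embeddings = encoder(batch)                       # (num_trees, 64)
combined = embeddings.mean(dim=0, keepdim=True)   # (1, 64) combined embedding

adj_probs = generator(combined)                   # (1, 20, 20) edge probabilities
```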
## Model Versioning & Deployment

```python
# app/services/ml/versioning.py
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict


class ModelRegistry:
    """Track and version ML models."""

    def __init__(self, registry_path: str = "models/registry.json"):
        self.registry_path = Path(registry_path)
        self.registry = self._load_registry()

    def _load_registry(self) -> Dict:
        """Load model registry."""
        if self.registry_path.exists():
            with open(self.registry_path) as f:
                return json.load(f)
        return {"models": []}

    def register_model(
        self,
        name: str,
        version: str,
        model_path: str,
        metrics: Dict[str, Any],
        metadata: Dict[str, Any]
    ):
        """Register a new model version."""
        entry = {
            "name": name,
            "version": version,
            "model_path": model_path,
            "metrics": metrics,
            "metadata": metadata,
            "registered_at": datetime.now().isoformat()
        }
        self.registry["models"].append(entry)
        self._save_registry()

    def get_latest_model(self, name: str) -> Dict:
        """Get latest version of a model."""
        models = [m for m in self.registry["models"] if m["name"] == name]
        return max(models, key=lambda m: m["registered_at"])

    def _save_registry(self):
        """Save registry to disk."""
        self.registry_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.registry_path, 'w') as f:
            json.dump(self.registry, f, indent=2)
```
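A usage sketch registering the classifier trained in the earlier example (`metrics` is assumed to be the dict returned by `RRNAClassifier.train`):

```python
# Hypothetical usage sketch: register a freshly trained classifier.
from app.services.ml.versioning import ModelRegistry

registry = ModelRegistry("models/registry.json")
registry.register_model(
    name="rrna_classifier",
    version="v1",
    model_path="models/rrna_classifier_v1.joblib",
    metrics={"accuracy": metrics["accuracy"]},  # from RRNAClassifier.train()
    metadata={"features": "6-mer frequencies", "model_type": "random_forest"},
)

latest = registry.get_latest_model("rrna_classifier")
print(latest["version"], latest["metrics"])
```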
## Best Practices

### ✅ DO
- Version all models with metrics
- Use cross-validation for evaluation (see the sketch after this list)
- Monitor for model drift
- Keep training/inference code separate
- Use feature stores for consistency
- Validate model outputs
- Log all predictions for analysis
- Use ensemble methods when possible
- Test models on held-out data
- Document feature engineering steps
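A minimal cross-validation sketch for the classical classifier, assuming the k-mer feature matrix `X` and labels `y` from the training example above are in scope:

```python
# Hypothetical sketch: stratified k-fold evaluation of the RandomForest pipeline.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring="f1_weighted")
print(f"F1 (weighted): {scores.mean():.3f} +/- {scores.std():.3f}")
```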
### ❌ DON'T
- Train on test data
- Skip data normalization
- Ignore class imbalance
- Hardcode feature extraction
- Deploy without evaluation
- Mix training/serving code
- Skip model validation
- Ignore computational costs
- Overfit to small datasets
- Skip ablation studies
Related Skills: rRNA-prediction-patterns, project-architecture-patterns