---
name: model-evaluation
description: "Evaluates machine learning models for performance, fairness, and reliability using appropriate metrics and validation techniques. Trigger keywords: model evaluation, metrics, accuracy, precision, recall, F1, ROC, AUC, cross-validation, ML testing."
allowed-tools: Read, Grep, Glob, Edit, Write, Bash
---
# Model Evaluation

## Overview

This skill focuses on comprehensive evaluation of machine learning models. It covers metric selection, validation strategies, fairness assessment, and production monitoring to ensure model quality and reliability.
## Instructions

### 1. Define Evaluation Criteria

- Identify business objectives
- Select appropriate metrics
- Define success thresholds (a threshold-check sketch follows these steps)
- Consider fairness requirements

### 2. Design Evaluation Strategy

- Choose validation approach
- Plan for data splits
- Handle class imbalance
- Account for temporal aspects

### 3. Conduct Evaluation

- Calculate performance metrics
- Analyze error patterns
- Assess model fairness
- Test edge cases

### 4. Report and Monitor

- Document evaluation results
- Create monitoring dashboards
- Set up alerting thresholds
- Plan for retraining
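
Steps 1 and 4 both hinge on concrete success thresholds. Below is a minimal sketch, assuming a metrics dictionary like the ones produced by the evaluators in the Examples section; the threshold values and the `check_thresholds` helper are illustrative, not part of any library.

```python
# Minimal sketch: encode success thresholds (step 1) and check them for alerting (step 4).
# The threshold values and the check_thresholds helper are illustrative, not a fixed API.
SUCCESS_THRESHOLDS = {
    'accuracy': 0.85,   # assumed business requirement
    'f1': 0.80,
    'roc_auc': 0.90,
}

def check_thresholds(metrics: dict, thresholds: dict) -> list:
    """Return a list of alert messages for metrics that fall below their thresholds."""
    alerts = []
    for name, minimum in thresholds.items():
        value = metrics.get(name)
        if value is not None and value < minimum:
            alerts.append(f"{name} = {value:.4f} is below the required {minimum:.2f}")
    return alerts

# Usage: feed in metrics from an evaluation run and log or raise alerts as needed
alerts = check_thresholds({'accuracy': 0.83, 'f1': 0.81, 'roc_auc': 0.91}, SUCCESS_THRESHOLDS)
for message in alerts:
    print(f"ALERT: {message}")
```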
## Best Practices

- **Match Metrics to Goals**: Choose metrics aligned with business objectives
- **Use Multiple Metrics**: No single metric tells the whole story
- **Proper Validation**: Use appropriate cross-validation schemes
- **Test Distribution Shift**: Evaluate on out-of-distribution data
- **Check for Bias**: Assess fairness across demographic groups
- **Version Everything**: Track models, data, and metrics
- **Monitor Production**: Continuously track model performance (a drift-check sketch follows this list)
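
For the distribution-shift and production-monitoring practices above, one common drift check is the Population Stability Index (PSI). The sketch below is a minimal version, assuming NumPy arrays holding a single feature or model score; the bin count and the 0.2 alert level are conventional rules of thumb, not fixed requirements.

```python
import numpy as np

def population_stability_index(expected, actual, bins=10):
    """PSI between a reference sample (e.g. training data) and a production sample
    for one numeric feature or model score."""
    # Bin edges come from the reference distribution
    edges = np.histogram_bin_edges(expected, bins=bins)
    expected_pct = np.histogram(expected, bins=edges)[0] / len(expected)
    actual_pct = np.histogram(actual, bins=edges)[0] / len(actual)
    # Floor the proportions to avoid division by zero and log(0)
    expected_pct = np.clip(expected_pct, 1e-6, None)
    actual_pct = np.clip(actual_pct, 1e-6, None)
    return np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

# Rule of thumb (assumption, tune per use case): PSI > 0.2 suggests meaningful drift
psi = population_stability_index(np.random.normal(0, 1, 5000), np.random.normal(0.3, 1, 5000))
print(f"PSI: {psi:.3f}{' - investigate drift' if psi > 0.2 else ''}")
```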
## Examples

### Example 1: Classification Model Evaluation
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report, roc_curve, precision_recall_curve
)


class ClassificationEvaluator:
    """Comprehensive classification model evaluator."""

    def __init__(self, y_true, y_pred, y_prob=None, class_names=None):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_prob = y_prob
        self.class_names = class_names or ['Negative', 'Positive']

    def compute_metrics(self) -> dict:
        """Compute all classification metrics."""
        metrics = {
            'accuracy': accuracy_score(self.y_true, self.y_pred),
            'precision': precision_score(self.y_true, self.y_pred, average='weighted'),
            'recall': recall_score(self.y_true, self.y_pred, average='weighted'),
            'f1': f1_score(self.y_true, self.y_pred, average='weighted'),
        }
        if self.y_prob is not None:
            metrics['roc_auc'] = roc_auc_score(self.y_true, self.y_prob)
            metrics['average_precision'] = average_precision_score(self.y_true, self.y_prob)
        return metrics

    def confusion_matrix_analysis(self) -> dict:
        """Analyze the confusion matrix in detail (binary classification)."""
        cm = confusion_matrix(self.y_true, self.y_pred)
        tn, fp, fn, tp = cm.ravel()
        return {
            'confusion_matrix': cm,
            'true_negatives': tn,
            'false_positives': fp,
            'false_negatives': fn,
            'true_positives': tp,
            'specificity': tn / (tn + fp),
            'sensitivity': tp / (tp + fn),
            'false_positive_rate': fp / (fp + tn),
            'false_negative_rate': fn / (fn + tp),
        }

    def plot_roc_curve(self, save_path=None):
        """Plot ROC curve with AUC."""
        if self.y_prob is None:
            raise ValueError("Probabilities required for ROC curve")
        fpr, tpr, thresholds = roc_curve(self.y_true, self.y_prob)
        auc = roc_auc_score(self.y_true, self.y_prob)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC (AUC = {auc:.3f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend()
        plt.grid(True, alpha=0.3)
        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
        plt.show()

    def generate_report(self) -> str:
        """Generate comprehensive evaluation report."""
        metrics = self.compute_metrics()
        cm_analysis = self.confusion_matrix_analysis()
        # Format ROC AUC separately so the report still renders when no probabilities were provided
        roc_auc = f"{metrics['roc_auc']:.4f}" if 'roc_auc' in metrics else 'N/A'
        report = f"""
# Classification Model Evaluation Report

## Overall Metrics

| Metric | Value |
|--------|-------|
| Accuracy | {metrics['accuracy']:.4f} |
| Precision | {metrics['precision']:.4f} |
| Recall | {metrics['recall']:.4f} |
| F1 Score | {metrics['f1']:.4f} |
| ROC AUC | {roc_auc} |

## Confusion Matrix Analysis

| Metric | Value |
|--------|-------|
| True Positives | {cm_analysis['true_positives']} |
| True Negatives | {cm_analysis['true_negatives']} |
| False Positives | {cm_analysis['false_positives']} |
| False Negatives | {cm_analysis['false_negatives']} |
| Sensitivity | {cm_analysis['sensitivity']:.4f} |
| Specificity | {cm_analysis['specificity']:.4f} |

## Detailed Classification Report

{classification_report(self.y_true, self.y_pred, target_names=self.class_names)}
"""
        return report


# Usage
evaluator = ClassificationEvaluator(y_true, y_pred, y_prob)
print(evaluator.generate_report())
evaluator.plot_roc_curve('roc_curve.png')
```
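
The imports above already include `precision_recall_curve`, which the class itself does not use. As an optional companion, here is a small sketch for choosing an operating threshold that maximizes F1; `y_true` and `y_prob` are assumed to be the same arrays as in the usage snippet.

```python
# Optional companion sketch: pick the probability threshold that maximizes F1,
# using the precision_recall_curve import from the example above.
import numpy as np
from sklearn.metrics import precision_recall_curve

def best_f1_threshold(y_true, y_prob):
    """Return (threshold, f1) for the threshold with the highest F1 score."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    # precision/recall have one more entry than thresholds; drop the final point
    f1_scores = 2 * precision[:-1] * recall[:-1] / np.clip(precision[:-1] + recall[:-1], 1e-12, None)
    best = np.argmax(f1_scores)
    return thresholds[best], f1_scores[best]

threshold, f1 = best_f1_threshold(y_true, y_prob)  # same arrays as the usage above
print(f"Best threshold: {threshold:.3f} (F1 = {f1:.3f})")
```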
### Example 2: Regression Model Evaluation
```python
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)


class RegressionEvaluator:
    """Comprehensive regression model evaluator."""

    def __init__(self, y_true, y_pred):
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self.residuals = self.y_true - self.y_pred

    def compute_metrics(self) -> dict:
        """Compute all regression metrics."""
        mse = mean_squared_error(self.y_true, self.y_pred)
        return {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'mae': mean_absolute_error(self.y_true, self.y_pred),
            'mape': mean_absolute_percentage_error(self.y_true, self.y_pred) * 100,
            'r2': r2_score(self.y_true, self.y_pred),
            'explained_variance': explained_variance_score(self.y_true, self.y_pred),
        }

    def residual_analysis(self) -> dict:
        """Analyze residual patterns."""
        return {
            'mean_residual': np.mean(self.residuals),
            'std_residual': np.std(self.residuals),
            'max_overestimate': np.min(self.residuals),
            'max_underestimate': np.max(self.residuals),
            'residual_skewness': self._skewness(self.residuals),
        }

    def _skewness(self, data):
        """Calculate adjusted Fisher-Pearson sample skewness."""
        n = len(data)
        mean = np.mean(data)
        std = np.std(data, ddof=1)  # sample standard deviation
        return (n / ((n - 1) * (n - 2))) * np.sum(((data - mean) / std) ** 3)

    def plot_diagnostics(self, save_path=None):
        """Plot diagnostic plots for residual analysis."""
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # Actual vs Predicted
        ax1 = axes[0, 0]
        ax1.scatter(self.y_true, self.y_pred, alpha=0.5)
        ax1.plot([self.y_true.min(), self.y_true.max()],
                 [self.y_true.min(), self.y_true.max()], 'r--')
        ax1.set_xlabel('Actual')
        ax1.set_ylabel('Predicted')
        ax1.set_title('Actual vs Predicted')

        # Residuals vs Predicted
        ax2 = axes[0, 1]
        ax2.scatter(self.y_pred, self.residuals, alpha=0.5)
        ax2.axhline(y=0, color='r', linestyle='--')
        ax2.set_xlabel('Predicted')
        ax2.set_ylabel('Residuals')
        ax2.set_title('Residuals vs Predicted')

        # Residual histogram
        ax3 = axes[1, 0]
        ax3.hist(self.residuals, bins=30, edgecolor='black')
        ax3.set_xlabel('Residual')
        ax3.set_ylabel('Frequency')
        ax3.set_title('Residual Distribution')

        # Q-Q plot against a normal distribution
        ax4 = axes[1, 1]
        stats.probplot(self.residuals, dist="norm", plot=ax4)
        ax4.set_title('Q-Q Plot')

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
        plt.show()
```
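
Unlike Example 1, the regression evaluator above has no usage snippet; a minimal one follows, assuming `y_true` and `y_pred` hold test-set targets and predictions from a fitted regressor.

```python
# Usage (y_true / y_pred are assumed to be test-set targets and predictions)
evaluator = RegressionEvaluator(y_true, y_pred)
print(evaluator.compute_metrics())
print(evaluator.residual_analysis())
evaluator.plot_diagnostics('regression_diagnostics.png')
```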
### Example 3: Cross-Validation Strategies
```python
import numpy as np
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, TimeSeriesSplit,
    GroupKFold, cross_validate
)


def evaluate_with_cv(model, X, y, cv_strategy='stratified', n_splits=5, groups=None):
    """
    Evaluate model with appropriate cross-validation strategy.

    Args:
        model: Sklearn-compatible model
        X: Features
        y: Target
        cv_strategy: 'stratified', 'timeseries', 'group', or 'kfold'
        n_splits: Number of CV folds
        groups: Group labels for GroupKFold

    Returns:
        Dictionary with CV results
    """
    # Select CV strategy
    if cv_strategy == 'stratified':
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    elif cv_strategy == 'timeseries':
        cv = TimeSeriesSplit(n_splits=n_splits)
    elif cv_strategy == 'group':
        cv = GroupKFold(n_splits=n_splits)
    else:
        cv = n_splits  # plain k-fold

    # Define scoring metrics (roc_auc assumes a binary classifier with predict_proba)
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
        'roc_auc': 'roc_auc'
    }

    # Perform cross-validation
    cv_results = cross_validate(
        model, X, y,
        cv=cv,
        scoring=scoring,
        groups=groups,
        return_train_score=True,
        n_jobs=-1
    )

    # Summarize results, including the train/test gap as an overfitting signal
    summary = {}
    for metric in scoring.keys():
        test_scores = cv_results[f'test_{metric}']
        train_scores = cv_results[f'train_{metric}']
        summary[metric] = {
            'test_mean': np.mean(test_scores),
            'test_std': np.std(test_scores),
            'train_mean': np.mean(train_scores),
            'train_std': np.std(train_scores),
            'overfit_gap': np.mean(train_scores) - np.mean(test_scores)
        }
    return summary


# Usage example
results = evaluate_with_cv(model, X, y, cv_strategy='stratified', n_splits=5)
for metric, values in results.items():
    print(f"{metric}: {values['test_mean']:.4f} (+/- {values['test_std']:.4f})")
```
### Example 4: Fairness Evaluation
```python
import numpy as np


def evaluate_fairness(y_true, y_pred, sensitive_attr, favorable_label=1):
    """
    Evaluate model fairness across demographic groups.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        sensitive_attr: Protected attribute values
        favorable_label: The favorable outcome label

    Returns:
        Dictionary with fairness metrics
    """
    # Convert to arrays so boolean masking works regardless of input type
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    sensitive_attr = np.asarray(sensitive_attr)

    groups = np.unique(sensitive_attr)
    results = {'group_metrics': {}}

    for group in groups:
        mask = sensitive_attr == group
        group_true = y_true[mask]
        group_pred = y_pred[mask]

        # Calculate group-specific confusion counts
        tp = np.sum((group_true == favorable_label) & (group_pred == favorable_label))
        fp = np.sum((group_true != favorable_label) & (group_pred == favorable_label))
        fn = np.sum((group_true == favorable_label) & (group_pred != favorable_label))
        tn = np.sum((group_true != favorable_label) & (group_pred != favorable_label))

        results['group_metrics'][group] = {
            'selection_rate': np.mean(group_pred == favorable_label),
            'tpr': tp / (tp + fn) if (tp + fn) > 0 else 0,
            'fpr': fp / (fp + tn) if (fp + tn) > 0 else 0,
            'accuracy': np.mean(group_true == group_pred),
            'size': len(group_true)
        }

    # Calculate gaps between the best- and worst-treated groups
    selection_rates = [m['selection_rate'] for m in results['group_metrics'].values()]
    tprs = [m['tpr'] for m in results['group_metrics'].values()]
    fprs = [m['fpr'] for m in results['group_metrics'].values()]

    results['fairness_metrics'] = {
        'demographic_parity_diff': max(selection_rates) - min(selection_rates),
        'equalized_odds_tpr_diff': max(tprs) - min(tprs),
        'equalized_odds_fpr_diff': max(fprs) - min(fprs),
    }
    return results
```
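
An illustrative call with small hand-made arrays (the values are made up for demonstration); there is no universal cutoff for the parity gap, so interpret it relative to the base selection rates and the application's risk.

```python
# Illustrative usage with made-up labels and a made-up protected attribute
y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
gender = np.array(['F', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M'])

fairness = evaluate_fairness(y_true, y_pred, gender)
print(fairness['group_metrics'])
# A demographic parity difference near 0 means similar selection rates across groups
print(f"Demographic parity diff: {fairness['fairness_metrics']['demographic_parity_diff']:.3f}")
```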