| name | improvement-pipeline |
| description | Executable implementation of the Propose -> Test -> Compare -> Commit -> Monitor -> Rollback pipeline for recursive self-improvement. Provides concrete commands and workflows for each stage. |
| version | 1.0.0 |
| category | foundry |
| tags | pipeline, improvement, testing, versioning, rollback |
Improvement Pipeline (Executable Stages)
Purpose
Provide concrete, executable implementation for each stage of the improvement pipeline:
PROPOSE -> TEST -> COMPARE -> COMMIT -> MONITOR -> ROLLBACK
Each stage has:
- Clear inputs and outputs
- Executable commands
- Validation checks
- Failure handling
Stage 1: PROPOSE
Generate concrete improvement proposals with diffs.
Input
propose_input:
target: "{path to skill/prompt}"
audit_report: "{from prompt-auditor or skill-auditor}"
improvement_type: "clarity|completeness|precision|safety|technique"
Process
/**
 * Stage 1 (PROPOSE): build an improvement proposal for `target` from an audit report.
 *
 * Reads the target, keeps at most five audit issues whose priority is
 * 'critical' or 'high', asks `generateChange` for one change per issue
 * (sequentially, in audit order), then attaches a predicted improvement and a
 * risk assessment.
 *
 * @param {string} target - Path handed to `readFile` and echoed in the proposal.
 * @param {object} auditReport - Must expose `issues` (array) and `lowest_score_dimension`.
 * @returns {Promise<object>} Proposal with id, target, timestamp, changes,
 *   predicted_improvement and risk_assessment.
 */
async function generateProposal(target, auditReport) {
  const MAX_CHANGES_PER_PROPOSAL = 5;
  const ACTIONABLE_PRIORITIES = ['critical', 'high'];

  const id = `prop-${Date.now()}`;
  const timestamp = new Date().toISOString();

  // Current version of the file is the basis for every generated change.
  const source = await readFile(target);

  // Only the most urgent findings are turned into changes.
  const selected = auditReport.issues
    .filter(({ priority }) => ACTIONABLE_PRIORITIES.includes(priority))
    .slice(0, MAX_CHANGES_PER_PROPOSAL);

  const changes = [];
  for (const issue of selected) {
    const { before, after, rationale, technique } = await generateChange(source, issue);
    changes.push({
      section: issue.section,
      location: issue.location,
      before,
      after,
      rationale,
      technique_applied: technique,
    });
  }

  return {
    id,
    target,
    timestamp,
    changes,
    predicted_improvement: {
      primary_metric: auditReport.lowest_score_dimension,
      // Heuristic from the original design: roughly 3% per applied fix.
      expected_delta: `+${selected.length * 3}%`,
      confidence: 0.7,
    },
    risk_assessment: {
      regression_risk: selected.length > 3 ? 'medium' : 'low',
      affected_components: findAffectedComponents(target, changes),
      // Archives are written at commit time, so rollback is always a file restore.
      rollback_complexity: 'simple',
    },
  };
}
Output
proposal:
id: "prop-1734567890123"
target: ".claude/skills/skill-forge/SKILL.md"
timestamp: "2025-12-15T10:30:00Z"
changes:
- section: "Phase 3: Structural Architecture"
location: "Lines 145-160"
before: |
Design the skill's structure based on progressive disclosure.
after: |
Design the skill's structure based on progressive disclosure.
### Failure Handling (REQUIRED)
For each operation in the skill:
1. Identify possible failure modes
2. Define explicit error messages
3. Specify recovery actions
4. Include timeout handling
```yaml
error_handling:
timeout:
threshold: 30s
action: "Return partial results with warning"
invalid_input:
detection: "Validate against schema"
action: "Return clear error message with fix suggestion"
```
rationale: "Adds explicit failure handling missing from Phase 3"
technique_applied: "completeness_enhancement"
predicted_improvement:
primary_metric: "failure_coverage"
expected_delta: "+9%"
confidence: 0.7
risk_assessment:
regression_risk: "low"
affected_components: ["micro-skill-creator", "agent-creator"]
rollback_complexity: "simple"
Validation
proposal_validation:
required_fields:
- id: "Must be unique"
- target: "Must be valid file path"
- changes: "At least 1 change"
- predicted_improvement: "Must have primary_metric"
- risk_assessment: "Must have regression_risk"
change_validation:
- before: "Must exist in current file"
- after: "Must be different from before"
- rationale: "Must not be empty"
Stage 2: TEST
Run evaluation harness on proposed changes.
Input
test_input:
proposal_id: "prop-1734567890123"
candidate_content: "{content with changes applied}"
benchmark_suite: "prompt-generation-benchmark-v1 | skill-generation-benchmark-v1"
regression_suite: "prompt-forge-regression-v1 | skill-forge-regression-v1"
Process
/**
 * Stage 2 (TEST): evaluate candidate content against the benchmark and
 * regression suites selected for the proposal's target.
 *
 * Suites are run one at a time, benchmarks first, then regressions; human
 * gates are evaluated last.
 *
 * @param {object} proposal - Proposal whose `id` and `target` are read.
 * @param {string} candidateContent - Content with the proposed changes applied.
 * @returns {Promise<object>} Results keyed by suite id, plus triggered human gates.
 */
async function runTests(proposal, candidateContent) {
  const results = {
    proposal_id: proposal.id,
    timestamp: new Date().toISOString(),
    benchmarks: {},
    regressions: {},
    human_gates: [],
  };

  const benchmarkSuites = getBenchmarksForTarget(proposal.target);
  const regressionSuites = getRegressionsForTarget(proposal.target);

  // A benchmark passes when its score reaches the suite's own minimum.
  for (const suite of benchmarkSuites) {
    const { score, task_results: tasks } = await runBenchmark(suite, candidateContent);
    const status = score >= suite.minimum ? 'PASS' : 'FAIL';
    results.benchmarks[suite.id] = { status, score, minimum: suite.minimum, tasks };
  }

  // A regression suite passes only with zero failed tests.
  for (const suite of regressionSuites) {
    const { passed, failed, failed_tests } = await runRegressionSuite(suite, candidateContent);
    const status = failed === 0 ? 'PASS' : 'FAIL';
    results.regressions[suite.id] = { status, passed, failed, failed_tests };
  }

  results.human_gates = checkHumanGates(proposal);
  return results;
}
/**
 * Select the benchmark suite for a target path by substring match.
 *
 * Rules are checked in order and the first match wins, so a path containing
 * 'prompt-forge' gets the prompt benchmark even if it also ends in SKILL.md.
 *
 * @param {string} target - Path of the skill/prompt being improved.
 * @returns {Array<{id: string, minimum: number}>} One suite descriptor, or an
 *   empty array when no rule matches.
 */
function getBenchmarksForTarget(target) {
  const rules = [
    { markers: ['prompt-forge'], id: 'prompt-generation-benchmark-v1', minimum: 0.7 },
    { markers: ['skill-forge', 'SKILL.md'], id: 'skill-generation-benchmark-v1', minimum: 0.75 },
    { markers: ['expertise'], id: 'expertise-generation-benchmark-v1', minimum: 0.8 },
  ];

  const matched = rules.find(({ markers }) => markers.some((marker) => target.includes(marker)));
  if (!matched) {
    return [];
  }
  return [{ id: matched.id, minimum: matched.minimum }];
}
Output
test_results:
proposal_id: "prop-1734567890123"
timestamp: "2025-12-15T10:35:00Z"
benchmarks:
skill-generation-benchmark-v1:
status: "PASS"
score: 0.87
minimum: 0.75
tasks:
sg-001:
name: "Micro-Skill Generation"
scores:
functionality: 0.85
contract_compliance: 0.90
error_coverage: 0.86
sg-002:
name: "Complex Skill Generation"
scores:
functionality: 0.88
structure_compliance: 0.87
safety_coverage: 0.85
regressions:
skill-forge-regression-v1:
status: "PASS"
passed: 4
failed: 0
failed_tests: []
human_gates: [] # None triggered
Validation
test_validation:
benchmark_check:
- all_benchmarks_run: true
- all_scores_recorded: true
regression_check:
- all_tests_run: true
- failure_details_captured: true
gate_check:
- all_gates_evaluated: true
Stage 3: COMPARE
Compare baseline vs candidate, decide ACCEPT or REJECT.
Input
compare_input:
proposal_id: "prop-1734567890123"
baseline_scores: "{from previous eval}"
candidate_scores: "{from Stage 2}"
test_results: "{full test results}"
Process
/**
 * Stage 3 (COMPARE): compare baseline and candidate scores and decide a verdict.
 *
 * Checks run in this order and the first one that fails decides the verdict:
 *   1. any regression suite with status 'FAIL'      -> REJECT
 *   2. any benchmark with status 'FAIL'             -> REJECT
 *   3. no candidate scores, or a negative/NaN
 *      average delta                                -> REJECT
 *   4. any triggered human gate                     -> PENDING_HUMAN_REVIEW
 *   5. otherwise                                    -> ACCEPT
 *
 * @param {Object<string, number>|null|undefined} baseline - Scores of the current
 *   version; a missing object or metric is treated as 0.
 * @param {Object<string, number>} candidate - Scores of the proposed version.
 * @param {object} testResults - Stage 2 output (`proposal_id`, `regressions`,
 *   `benchmarks`, `human_gates`).
 * @returns {object} Comparison record with `delta`, `verdict` and `reason`;
 *   ACCEPT additionally carries `improvement_summary`.
 */
function compareAndDecide(baseline, candidate, testResults) {
  const comparison = {
    proposal_id: testResults.proposal_id,
    timestamp: new Date().toISOString(),
    baseline_scores: baseline,
    candidate_scores: candidate,
    delta: {},
    verdict: null,
    reason: null
  };

  const conclude = (verdict, reason) => {
    comparison.verdict = verdict;
    comparison.reason = reason;
    return comparison;
  };

  // 1. Calculate deltas. A first evaluation may have no baseline at all.
  const baselineScores = baseline || {};
  for (const [metric, candidateScore] of Object.entries(candidate)) {
    const baselineScore = baselineScores[metric] || 0;
    const change = candidateScore - baselineScore;
    comparison.delta[metric] = {
      baseline: baselineScore,
      candidate: candidateScore,
      change,
      // A relative change is undefined against a zero baseline; avoid emitting
      // "Infinity"/"NaN". Otherwise format as documented, e.g. "3.66%".
      percent_change: baselineScore === 0
        ? 'n/a'
        : `${((change / baselineScore) * 100).toFixed(2)}%`
    };
  }

  // 2. Check for regressions (hard fail)
  for (const result of Object.values(testResults.regressions)) {
    if (result.status === 'FAIL') {
      const failedTests = Array.isArray(result.failed_tests) ? result.failed_tests : [];
      return conclude('REJECT', `Regression test failed: ${failedTests.join(', ')}`);
    }
  }

  // 3. Check benchmarks meet minimum (hard fail)
  for (const [suite, result] of Object.entries(testResults.benchmarks)) {
    if (result.status === 'FAIL') {
      return conclude(
        'REJECT',
        `Benchmark ${suite} below minimum: ${result.score} < ${result.minimum}`
      );
    }
  }

  // 4. Check for improvement. With no metrics the average would be 0/0 = NaN,
  //    which slips past a plain `< 0` test, so reject that case explicitly.
  const deltas = Object.values(comparison.delta);
  if (deltas.length === 0) {
    return conclude('REJECT', 'No candidate scores to compare');
  }
  const avgDelta = deltas.reduce((sum, d) => sum + d.change, 0) / deltas.length;
  if (Number.isNaN(avgDelta) || avgDelta < 0) {
    return conclude('REJECT', `No improvement: average delta = ${avgDelta.toFixed(3)}`);
  }

  // 5. Check human gates
  if (testResults.human_gates.length > 0) {
    return conclude(
      'PENDING_HUMAN_REVIEW',
      `Human review required: ${testResults.human_gates.join(', ')}`
    );
  }

  // 6. All passed - ACCEPT
  comparison.improvement_summary = {
    average_delta: avgDelta,
    best_improvement: Object.entries(comparison.delta)
      .sort((a, b) => b[1].change - a[1].change)[0],
    regressions_passed: Object.keys(testResults.regressions).length,
    benchmarks_passed: Object.keys(testResults.benchmarks).length
  };
  return conclude(
    'ACCEPT',
    `All checks passed. Average improvement: +${(avgDelta * 100).toFixed(2)}%`
  );
}
Output
comparison_result:
proposal_id: "prop-1734567890123"
timestamp: "2025-12-15T10:40:00Z"
baseline_scores:
clarity: 0.82
completeness: 0.78
precision: 0.80
candidate_scores:
clarity: 0.85
completeness: 0.87
precision: 0.82
delta:
clarity:
baseline: 0.82
candidate: 0.85
change: 0.03
percent_change: "3.66%"
completeness:
baseline: 0.78
candidate: 0.87
change: 0.09
percent_change: "11.54%"
precision:
baseline: 0.80
candidate: 0.82
change: 0.02
percent_change: "2.50%"
verdict: "ACCEPT"
reason: "All checks passed. Average improvement: +4.67%"
improvement_summary:
average_delta: 0.0467
best_improvement: ["completeness", { change: 0.09 }]
regressions_passed: 1
benchmarks_passed: 1
Validation
comparison_validation:
required:
- verdict: "Must be ACCEPT|REJECT|PENDING_HUMAN_REVIEW"
- reason: "Must explain decision"
verdict_rules:
REJECT:
- "Any regression failure"
- "Any benchmark below minimum"
- "Negative improvement delta"
PENDING_HUMAN_REVIEW:
- "Human gate triggered"
ACCEPT:
- "All regressions pass"
- "All benchmarks meet minimum"
- "Positive improvement delta"
- "No human gates"
Stage 4: COMMIT
Apply changes and create version entry.
Input
commit_input:
proposal_id: "prop-1734567890123"
target: "{file path}"
new_content: "{content with changes applied}"
comparison_result: "{from Stage 3}"
Process
/**
 * Stage 4 (COMMIT): archive the current file, write the new content, bump the
 * version, append a changelog entry and persist a commit record.
 *
 * Steps run strictly in that order; each completed step is appended to
 * `commit.actions`. If any step throws, the error propagates to the caller
 * and no commit record is stored.
 *
 * @param {object} proposal - Stage 1 proposal (its `id` is recorded).
 * @param {string} target - Path of the file being updated.
 * @param {string} newContent - Full replacement content for `target`.
 * @param {object} comparison - Stage 3 comparison, used for the changelog entry.
 * @returns {Promise<object>} Commit record with `actions` and `status: 'SUCCESS'`.
 */
async function commitChanges(proposal, target, newContent, comparison) {
  const commit = {
    id: `commit-${Date.now()}`,
    proposal_id: proposal.id,
    timestamp: new Date().toISOString(),
    target,
    actions: []
  };

  // 1. Archive current version (must happen before the target is overwritten)
  const archivePath = getArchivePath(target);
  const currentVersion = await getCurrentVersion(target);
  const archiveFile = `${archivePath}/SKILL-v${currentVersion}.md`;
  await writeFile(archiveFile, await readFile(target));
  commit.actions.push({
    action: 'archive',
    path: archiveFile
  });

  // 2. Apply new content
  await writeFile(target, newContent);
  commit.actions.push({
    action: 'update',
    path: target
  });

  // 3. Increment version
  const newVersion = incrementVersion(currentVersion);
  await updateVersionInFile(target, newVersion);
  commit.actions.push({
    action: 'version_bump',
    from: currentVersion,
    to: newVersion
  });

  // 4. Update changelog
  const changelogEntry = formatChangelogEntry(proposal, comparison, newVersion);
  await appendToChangelog(target, changelogEntry);
  commit.actions.push({
    action: 'changelog_update',
    entry: changelogEntry
  });

  // All file-level steps completed; record the outcome the documented
  // commit_result carries so the stored and returned records both show it.
  commit.status = 'SUCCESS';

  // 5. Store commit record in memory
  await storeInMemory(`improvement/commits/${commit.id}`, {
    ...commit,
    proposal,
    comparison
  });

  return commit;
}
/**
 * Render the Markdown changelog entry for a committed proposal.
 *
 * The entry starts and ends with a newline and lists one bullet per proposal
 * change and one bullet per compared metric.
 *
 * @param {object} proposal - Provides `id` and `changes` (`section`, `rationale`).
 * @param {object} comparison - Provides `reason` and `delta` per metric.
 * @param {string} version - Version label shown in the heading.
 * @returns {string} Markdown text of the entry.
 */
function formatChangelogEntry(proposal, comparison, version) {
  const releaseDate = new Date().toISOString().split('T')[0];

  const changeBullets = proposal.changes
    .map(({ section, rationale }) => `- ${section}: ${rationale}`)
    .join('\n');

  const metricBullets = Object.entries(comparison.delta)
    .map(([metric, { baseline, candidate, percent_change }]) =>
      `- ${metric}: ${baseline} -> ${candidate} (${percent_change})`)
    .join('\n');

  // Leading and trailing empty items reproduce the surrounding blank lines.
  return [
    '',
    `## v${version} (${releaseDate})`,
    `**Proposal**: ${proposal.id}`,
    `**Improvement**: ${comparison.reason}`,
    '**Changes**:',
    changeBullets,
    '**Metrics**:',
    metricBullets,
    '',
  ].join('\n');
}
Output
commit_result:
id: "commit-1734567890456"
proposal_id: "prop-1734567890123"
timestamp: "2025-12-15T10:45:00Z"
target: ".claude/skills/skill-forge/SKILL.md"
actions:
- action: "archive"
path: ".claude/skills/skill-forge/.archive/SKILL-v1.0.0.md"
- action: "update"
path: ".claude/skills/skill-forge/SKILL.md"
- action: "version_bump"
from: "1.0.0"
to: "1.1.0"
- action: "changelog_update"
entry: "## v1.1.0..."
status: "SUCCESS"
Validation
commit_validation:
pre_commit:
- archive_exists: "Verify archive created"
- backup_verified: "Can restore from archive"
post_commit:
- file_updated: "Target file has new content"
- version_incremented: "Version number updated"
- changelog_appended: "Changelog has new entry"
- memory_stored: "Commit record in memory"
Stage 5: MONITOR
Track metrics after commit to detect delayed regressions.
Input
monitor_input:
commit_id: "commit-1734567890456"
target: "{file path}"
metrics_window: "7 days"
alert_thresholds:
regression: 0.03 # 3% regression triggers alert
Process
/**
 * Stage 5 (MONITOR): create and persist a monitoring record for a commit.
 *
 * The monitoring window was previously accepted but ignored (always 7 days).
 * It is now honored: `window` may be a positive number of days or a string
 * such as '7d', '14 days' or '1 day'. Anything else falls back to 7 days,
 * which keeps the previous behavior for existing callers.
 *
 * @param {object} commit - Commit record; `id` and `target` are read.
 * @param {string|number} [window='7d'] - Length of the monitoring window in days.
 * @returns {Promise<object>} The stored monitor record (status 'ACTIVE').
 */
async function setupMonitoring(commit, window = '7d') {
  const DEFAULT_WINDOW_DAYS = 7;

  let windowDays = DEFAULT_WINDOW_DAYS;
  if (typeof window === 'number') {
    if (Number.isFinite(window) && window > 0) {
      windowDays = window;
    }
  } else if (typeof window === 'string') {
    const parsed = /^\s*(\d+)\s*(?:d|days?)\s*$/i.exec(window);
    if (parsed && Number(parsed[1]) > 0) {
      windowDays = Number(parsed[1]);
    }
  }

  const monitor = {
    commit_id: commit.id,
    target: commit.target,
    start_time: new Date().toISOString(),
    end_time: addDays(new Date(), windowDays).toISOString(),
    // Metrics right after the commit become the reference for later checks.
    baseline_metrics: await getCurrentMetrics(commit.target),
    alerts: [],
    status: 'ACTIVE'
  };

  // Store monitoring config
  await storeInMemory(`improvement/monitors/${commit.id}`, monitor);
  return monitor;
}
/**
 * Stage 5 (MONITOR): compare current metrics against the monitor's baseline
 * and record regression alerts.
 *
 * A metric whose drop exceeds `regression` raises an alert; a drop exceeding
 * `critical` marks it CRITICAL, sets the monitor status to 'ALERT_CRITICAL'
 * and calls `notifyRollbackNeeded`. A metric missing from the current
 * metrics is treated as 0.
 *
 * @param {string} commitId - Id of the commit whose monitor is checked.
 * @param {object} [thresholds] - Optional alert thresholds (absolute score drop).
 * @param {number} [thresholds.regression=0.03] - Drop that triggers an alert.
 * @param {number} [thresholds.critical=0.1] - Drop that makes an alert CRITICAL.
 * @returns {Promise<object|null>} Updated monitor, or null when no monitor
 *   exists or it is not in status 'ACTIVE'.
 */
async function checkMonitor(commitId, { regression = 0.03, critical = 0.1 } = {}) {
  const monitor = await retrieveFromMemory(`improvement/monitors/${commitId}`);
  if (!monitor || monitor.status !== 'ACTIVE') return null;

  const currentMetrics = await getCurrentMetrics(monitor.target);
  const alerts = [];

  // Check for regressions
  for (const [metric, baseline] of Object.entries(monitor.baseline_metrics)) {
    const current = currentMetrics[metric] || 0;
    const delta = current - baseline;
    if (delta < -regression) {
      alerts.push({
        type: 'REGRESSION',
        metric,
        baseline,
        current,
        delta,
        severity: delta < -critical ? 'CRITICAL' : 'WARNING'
      });
    }
  }

  // Update monitor
  monitor.latest_check = new Date().toISOString();
  monitor.current_metrics = currentMetrics;
  monitor.alerts = alerts;

  if (alerts.some(a => a.severity === 'CRITICAL')) {
    monitor.status = 'ALERT_CRITICAL';
    // Trigger rollback consideration
    await notifyRollbackNeeded(monitor);
  }

  await storeInMemory(`improvement/monitors/${commitId}`, monitor);
  return monitor;
}
Output
monitor_status:
commit_id: "commit-1734567890456"
target: ".claude/skills/skill-forge/SKILL.md"
status: "ACTIVE"
baseline_metrics:
clarity: 0.85
completeness: 0.87
precision: 0.82
current_metrics:
clarity: 0.84
completeness: 0.88
precision: 0.82
alerts: []
days_remaining: 5
next_check: "2025-12-16T10:00:00Z"
Stage 6: ROLLBACK
Restore previous version if regressions detected.
Input
rollback_input:
commit_id: "commit-1734567890456"
reason: "regression_detected | manual_request"
evidence: "{alert details or user request}"
Process
/**
 * Stage 6 (ROLLBACK): restore the version that was archived by a commit.
 *
 * The version and archive file to restore are taken from the commit record
 * itself (`version_bump.from` and the `archive` action's `path`) instead of
 * being re-derived by decrementing the new version, which cannot be inverted
 * reliably (e.g. after a major bump). Older records without those fields fall
 * back to the previous derivation.
 *
 * @param {string} commitId - Id of the commit to roll back.
 * @param {string} reason - Why the rollback happens (e.g. 'regression_detected').
 * @param {*} evidence - Alert details or user request; JSON-serialized into the changelog.
 * @returns {Promise<object>} Rollback record with `status` 'SUCCESS' or 'FAILED'
 *   (`error` explains a failure).
 * @throws {Error} When no commit record exists for `commitId`.
 */
async function rollback(commitId, reason, evidence) {
  const commit = await retrieveFromMemory(`improvement/commits/${commitId}`);
  if (!commit) throw new Error(`Commit not found: ${commitId}`);

  // Named `record` so it does not shadow this function.
  const record = {
    id: `rollback-${Date.now()}`,
    commit_id: commitId,
    target: commit.target,
    timestamp: new Date().toISOString(),
    reason,
    evidence,
    actions: []
  };

  // 1. Find archived version from what the commit actually recorded
  const commitActions = Array.isArray(commit.actions) ? commit.actions : [];
  const versionBump = commitActions.find(a => a.action === 'version_bump');
  if (!versionBump) {
    record.status = 'FAILED';
    record.error = `No version_bump action recorded for commit: ${commitId}`;
    return record;
  }
  const previousVersion = versionBump.from !== undefined
    ? versionBump.from
    : decrementVersion(versionBump.to);
  const archiveAction = commitActions.find(a => a.action === 'archive');
  const archiveFile = archiveAction && archiveAction.path
    ? archiveAction.path
    : `${getArchivePath(commit.target)}/SKILL-v${previousVersion}.md`;

  // 2. Verify archive exists
  if (!await fileExists(archiveFile)) {
    record.status = 'FAILED';
    record.error = `Archive not found: ${archiveFile}`;
    return record;
  }

  // 3. Restore archived content
  const archivedContent = await readFile(archiveFile);
  await writeFile(commit.target, archivedContent);
  record.actions.push({
    action: 'restore',
    from: archiveFile,
    to: commit.target
  });

  // 4. Update changelog (entry is wrapped in blank lines, as in commit entries)
  const rollbackEntry = [
    '',
    `## ROLLBACK to v${previousVersion} (${new Date().toISOString().split('T')[0]})`,
    `**Rolled back from**: ${versionBump.to}`,
    `**Reason**: ${reason}`,
    `**Evidence**: ${JSON.stringify(evidence)}`,
    ''
  ].join('\n');
  await appendToChangelog(commit.target, rollbackEntry);
  record.actions.push({
    action: 'changelog_update',
    entry: rollbackEntry
  });

  // 5. Mark commit as rolled back
  commit.rolled_back = true;
  commit.rollback_id = record.id;
  await storeInMemory(`improvement/commits/${commitId}`, commit);

  // 6. Store rollback record
  await storeInMemory(`improvement/rollbacks/${record.id}`, record);

  // 7. Cancel monitoring
  const monitor = await retrieveFromMemory(`improvement/monitors/${commitId}`);
  if (monitor) {
    monitor.status = 'CANCELLED_ROLLBACK';
    await storeInMemory(`improvement/monitors/${commitId}`, monitor);
  }

  record.status = 'SUCCESS';
  record.restored_version = previousVersion;
  return record;
}
Output
rollback_result:
id: "rollback-1734567890789"
commit_id: "commit-1734567890456"
target: ".claude/skills/skill-forge/SKILL.md"
timestamp: "2025-12-15T15:00:00Z"
reason: "regression_detected"
evidence:
alert_type: "REGRESSION"
metric: "clarity"
baseline: 0.85
current: 0.75
delta: -0.10
actions:
- action: "restore"
from: ".claude/skills/skill-forge/.archive/SKILL-v1.0.0.md"
to: ".claude/skills/skill-forge/SKILL.md"
- action: "changelog_update"
entry: "## ROLLBACK to v1.0.0..."
status: "SUCCESS"
restored_version: "1.0.0"
Validation
rollback_validation:
pre_rollback:
- archive_exists: "Verify archived version available"
- target_accessible: "Can write to target file"
post_rollback:
- content_restored: "File matches archive"
- changelog_updated: "Rollback documented"
- commit_marked: "Commit flagged as rolled back"
- monitor_cancelled: "Monitoring stopped"
Pipeline Orchestration
Full Pipeline Execution
/**
 * Run the full improvement pipeline (PROPOSE -> TEST -> COMPARE -> COMMIT ->
 * MONITOR) for one target and persist a record of the run.
 *
 * Every run is stored under `improvement/pipelines/{id}`, including runs that
 * stop early (NO_PROPOSALS, REJECTED, PENDING). Previously those outcomes
 * returned before the record was written, so only ACCEPTED and ERROR runs
 * were persisted.
 *
 * @param {string} target - Path of the skill/prompt to improve.
 * @param {object} auditReport - Audit report passed to `generateProposal`.
 * @returns {Promise<object>} Pipeline record; `result` is one of NO_PROPOSALS,
 *   REJECTED, PENDING, ACCEPTED or ERROR.
 */
async function runImprovementPipeline(target, auditReport) {
  const pipeline = {
    id: `pipeline-${Date.now()}`,
    target,
    timestamp: new Date().toISOString(),
    stages: {}
  };

  try {
    await executePipelineStages(pipeline, target, auditReport);
  } catch (error) {
    pipeline.result = 'ERROR';
    pipeline.error = error.message;
  }

  // Store pipeline record (reached for every outcome)
  await storeInMemory(`improvement/pipelines/${pipeline.id}`, pipeline);
  return pipeline;
}

/**
 * Run the stages in order, filling `pipeline.stages`, `pipeline.result` and
 * `pipeline.reason`. Returns early at the first terminal decision; errors
 * propagate to the caller.
 */
async function executePipelineStages(pipeline, target, auditReport) {
  const { stages } = pipeline;

  // Stage 1: PROPOSE
  stages.propose = await generateProposal(target, auditReport);
  if (stages.propose.changes.length === 0) {
    pipeline.result = 'NO_PROPOSALS';
    return;
  }

  // Stage 2: TEST
  const candidateContent = applyChanges(
    await readFile(target),
    stages.propose.changes
  );
  stages.test = await runTests(stages.propose, candidateContent);

  // Stage 3: COMPARE
  const baseline = await getBaselineScores(target);
  const candidate = extractScores(stages.test);
  stages.compare = compareAndDecide(baseline, candidate, stages.test);

  // Decision point
  if (stages.compare.verdict === 'REJECT') {
    pipeline.result = 'REJECTED';
    pipeline.reason = stages.compare.reason;
    return;
  }
  if (stages.compare.verdict === 'PENDING_HUMAN_REVIEW') {
    pipeline.result = 'PENDING';
    pipeline.reason = stages.compare.reason;
    // Store for human review
    await storeInMemory(`improvement/pending/${pipeline.id}`, pipeline);
    return;
  }

  // Stage 4: COMMIT
  stages.commit = await commitChanges(
    stages.propose,
    target,
    candidateContent,
    stages.compare
  );

  // Stage 5: MONITOR
  stages.monitor = await setupMonitoring(stages.commit);

  pipeline.result = 'ACCEPTED';
  pipeline.reason = stages.compare.reason;
}
Memory Namespaces
| Namespace | Purpose | Retention |
|---|---|---|
| `improvement/proposals/{id}` | Pending proposals | Until resolved |
| `improvement/commits/{id}` | Committed changes | Permanent |
| `improvement/rollbacks/{id}` | Rollback events | Permanent |
| `improvement/monitors/{id}` | Active monitoring | 30 days |
| `improvement/pipelines/{id}` | Full pipeline runs | 90 days |
| `improvement/pending/{id}` | Awaiting human review | Until resolved |
**Status**: Production-Ready | **Version**: 1.0.0 | **Key Constraint**: Every stage has clear inputs, outputs, and validation