ist. This replaces subjective quality signals with measurable ones.
4. Consumption Monitoring: Skills reshape default policies rather than triggering explicit tool calls. The pipeline tracks behavioral shifts post-injection to detect negative transfer early.
Implementation
import { z } from 'zod';
// Experience-pool composition presets; one is chosen per domain.
// The tuple is declared once so the schema and the inferred union share a single source.
const EXPERIENCE_PROFILE_VALUES = ['success_heavy', 'balanced', 'failure_heavy'] as const;

/** Zod schema accepting exactly the three composition presets above. */
const ExperienceProfile = z.enum(EXPERIENCE_PROFILE_VALUES);

/** Union of the preset names, derived from the schema. */
type ExperienceProfile = z.infer<typeof ExperienceProfile>;
/** Terminal result recorded for a trajectory. */
type TrajectoryOutcome = 'success' | 'failure';

/** One recorded task attempt that can be placed in an experience pool. */
interface Trajectory {
  /** Identifier of this trajectory. */
  id: string;
  /** Name of the domain the trajectory belongs to. */
  domain: string;
  /** Whether the attempt succeeded or failed. */
  outcome: TrajectoryOutcome;
  /** Ordered step descriptions; the pipeline joins them with ' -> ' when summarizing. */
  steps: Array<string>;
  /** Free-form extra data; the shape is not constrained here. */
  metadata: { [key: string]: unknown };
}
/** The three rubric-aligned lists extracted for a skill. */
interface SkillDimensions {
  /** Failure modes identified in the source trajectories. */
  failureMechanisms: string[];
  /** Concrete remedies paired with those failure modes. */
  actionableRemedies: string[];
  /** Operations the skill says to avoid. */
  highRiskBlacklist: string[];
}

/** A versioned skill produced by the extraction stage. */
interface SkillArtifact {
  /** Version label; the pipeline emits `v1.0-<epoch milliseconds>`. */
  version: string;
  /** Structured rubric dimensions of the skill. */
  dimensions: SkillDimensions;
  /** Prompt text that is injected into the target model during evaluation. */
  rawPrompt: string;
  /** Name of the model that produced this artifact. */
  extractionModel: string;
  /** Creation time of the artifact. */
  createdAt: Date;
}
/** Scores reported by the consumption-evaluation stage. */
interface EvaluationMetrics {
  /** EE: how well the extractor helps multiple targets. */
  extractionEfficacy: number;
  /** TE: how much the target improves across extractors. */
  targetEvolvability: number;
  /** Number of regressions divided by the number of test tasks. */
  negativeTransferRate: number;
  /** Average score with the skill minus the baseline average score. */
  performanceDelta: number;
}
/** Aggregate result of running the target model over a list of tasks. */
interface TaskRunResult {
  avgScore: number;
  // Only the number of regressions is consumed by the pipeline.
  regressions: readonly unknown[];
}

/**
 * Three-stage skill pipeline: curate an experience pool, extract a skill from it
 * with a fixed rubric, then evaluate the skill against a baseline run.
 *
 * The LLM and evaluation calls in this class are stubs that return fixed values.
 */
class KnowledgeDistillationPipeline {
  /** Share of the pool that should be successes for each composition profile. */
  private static readonly SUCCESS_RATIOS: Record<ExperienceProfile, number> = {
    success_heavy: 0.8,
    balanced: 0.5,
    failure_heavy: 0.2
  };

  private readonly extractorModel: string;
  private readonly targetModel: string;
  private readonly validatedRubric: readonly string[];

  /**
   * @param extractor Name of the model used to extract skills.
   * @param target Name of the model that consumes the extracted skills.
   */
  constructor(extractor: string, target: string) {
    this.extractorModel = extractor;
    this.targetModel = target;
    // Empirically validated dimensions from research
    this.validatedRubric = [
      'Failure Mechanism Encoding: Identify specific failure modes and their triggers',
      'Actionable Specificity: Provide executable guidance tailored to concrete situations',
      'High-Risk Action Blacklist: Explicitly name operations to avoid in context'
    ];
  }

  /**
   * Stage 1: Curate experience pool with domain-specific success/failure ratios.
   *
   * Target counts are derived from the total number of input trajectories. If one
   * outcome has fewer trajectories than its target, the pool is simply smaller and
   * the realized ratio differs from the requested one.
   *
   * @param trajectories Candidate trajectories, in the order they should be preferred.
   * @param profile Composition profile that selects the success ratio.
   * @returns Sampled successes followed by sampled failures.
   * @throws Error if the profile is unknown, or if the sampled pool would contain
   *   no successes or no failures.
   */
  async generateExperiencePool(
    trajectories: Trajectory[],
    profile: ExperienceProfile
  ): Promise<Trajectory[]> {
    const successRatio = KnowledgeDistillationPipeline.SUCCESS_RATIOS[profile];
    if (successRatio === undefined) {
      // Reachable only from untyped callers; fail loudly instead of sampling with NaN.
      throw new Error(`Unknown experience profile: ${String(profile)}`);
    }

    const targetSuccessCount = Math.floor(trajectories.length * successRatio);
    const targetFailureCount = trajectories.length - targetSuccessCount;

    // Deterministic sampling for reproducibility: take the first N of each outcome.
    const sampledSuccesses = trajectories
      .filter(t => t.outcome === 'success')
      .slice(0, targetSuccessCount);
    const sampledFailures = trajectories
      .filter(t => t.outcome === 'failure')
      .slice(0, targetFailureCount);

    // Checked first so that empty input keeps reporting the same message as before.
    if (sampledSuccesses.length === 0) {
      throw new Error('All-failure pools consistently produce degraded skills. Ensure positive procedural signals exist.');
    }
    if (sampledFailures.length === 0) {
      throw new Error('All-success pools consistently produce skills that fail to generalize. Ensure failure trajectories exist.');
    }

    return [...sampledSuccesses, ...sampledFailures];
  }

  /**
   * Stage 2: Extract skill using validated rubric injection.
   *
   * @param experiencePool Trajectories to summarize into the extraction prompt.
   * @param domain Domain name, included in the prompt as context.
   * @returns A new artifact whose version suffix and `createdAt` share one timestamp.
   * @throws SyntaxError if the extractor response is not valid JSON.
   * @throws Error if the response lacks any of the expected fields.
   */
  async extractSkill(
    experiencePool: Trajectory[],
    domain: string
  ): Promise<SkillArtifact> {
    const rubricPrompt = this.validatedRubric.join('\n');
    const trajectorySummary = experiencePool
      .map(t => `[${t.outcome.toUpperCase()}] ${t.steps.join(' -> ')}`)
      .join('\n');

    // Built line by line so source indentation never leaks into the prompt text.
    const extractionPrompt = [
      '',
      'Extract procedural knowledge from the following trajectories.',
      `Domain: ${domain}`,
      'Adhere strictly to these validated dimensions:',
      rubricPrompt,
      'Trajectories:',
      trajectorySummary,
      'Output format: JSON with keys: failureMechanisms, actionableRemedies, highRiskBlacklist, rawPrompt',
      ''
    ].join('\n');

    // Simulated LLM extraction call
    const rawResponse = await this.callExtractor(extractionPrompt);
    const parsed = this.parseExtraction(rawResponse);

    // Single clock read keeps the version suffix and createdAt in agreement.
    const createdAt = new Date();
    return {
      version: `v1.0-${createdAt.getTime()}`,
      dimensions: {
        failureMechanisms: parsed.failureMechanisms,
        actionableRemedies: parsed.actionableRemedies,
        highRiskBlacklist: parsed.highRiskBlacklist
      },
      rawPrompt: parsed.rawPrompt,
      extractionModel: this.extractorModel,
      createdAt
    };
  }

  /**
   * Stage 3: Evaluate consumption and compute EE/TE metrics.
   *
   * @param skill Artifact whose `rawPrompt` is used for the skill-assisted run.
   * @param testTasks Tasks to run with and without the skill.
   * @returns Metrics for the run; `negativeTransferRate` is 0 when `testTasks` is empty.
   */
  async evaluateConsumption(
    skill: SkillArtifact,
    testTasks: string[]
  ): Promise<EvaluationMetrics> {
    const baselineResults = await this.runBaseline(testTasks);
    const skillResults = await this.runWithSkill(testTasks, skill.rawPrompt);

    const performanceDelta = skillResults.avgScore - baselineResults.avgScore;
    const negativeTransfers = skillResults.regressions.length;
    const totalTasks = testTasks.length;

    return {
      extractionEfficacy: this.calculateEE(skill),
      targetEvolvability: this.calculateTE(skill),
      // Avoid 0 / 0 = NaN when no tasks were supplied.
      negativeTransferRate: totalTasks === 0 ? 0 : negativeTransfers / totalTasks,
      performanceDelta
    };
  }

  /** Parses the extractor's JSON reply and checks that every expected field is present. */
  private parseExtraction(rawResponse: string): {
    failureMechanisms: string[];
    actionableRemedies: string[];
    highRiskBlacklist: string[];
    rawPrompt: string;
  } {
    const payload: unknown = JSON.parse(rawResponse);
    if (typeof payload !== 'object' || payload === null) {
      throw new Error('Extractor response must be a JSON object');
    }
    const fields = payload as Record<string, unknown>;

    const readStringArray = (key: string): string[] => {
      const value = fields[key];
      if (!Array.isArray(value) || !value.every(item => typeof item === 'string')) {
        throw new Error(`Extractor response field "${key}" must be an array of strings`);
      }
      return value as string[];
    };

    const rawPrompt = fields['rawPrompt'];
    if (typeof rawPrompt !== 'string') {
      throw new Error('Extractor response field "rawPrompt" must be a string');
    }

    return {
      failureMechanisms: readStringArray('failureMechanisms'),
      actionableRemedies: readStringArray('actionableRemedies'),
      highRiskBlacklist: readStringArray('highRiskBlacklist'),
      rawPrompt
    };
  }

  /** Stub extractor: ignores the prompt and returns a fixed JSON document. */
  private async callExtractor(prompt: string): Promise<string> {
    // Replace with actual LLM API call
    return JSON.stringify({
      failureMechanisms: ['Formula evaluation fails on dynamic strings'],
      actionableRemedies: ['Precompute static values and write directly to cells'],
      highRiskBlacklist: ['Nested volatile functions in conditional branches'],
      rawPrompt: `When handling spreadsheet formulas, precompute static values and write them directly to cells. Avoid nested volatile functions in conditional branches.`
    });
  }

  /** Stub baseline run: returns a fixed score and no regressions. */
  private async runBaseline(tasks: string[]): Promise<TaskRunResult> {
    return { avgScore: 0.72, regressions: [] };
  }

  /** Stub skill-assisted run: returns a fixed score and no regressions. */
  private async runWithSkill(tasks: string[], prompt: string): Promise<TaskRunResult> {
    return { avgScore: 0.81, regressions: [] };
  }

  private calculateEE(skill: SkillArtifact): number {
    // EE measures extractor consistency across multiple targets
    return 0.78; // Placeholder for actual cross-model evaluation
  }

  private calculateTE(skill: SkillArtifact): number {
    // TE measures target improvement across multiple extractors
    return 0.65; // Placeholder for actual cross-extractor evaluation
  }
}
Why This Architecture Works
The pipeline enforces empirical validation at every stage. Experience curation prevents the common mistake of feeding only successful trajectories, which strips away critical failure-mode signals. The validated rubric replaces subjective extraction prompts with dimensions proven to correlate with downstream utility. Finally, consumption evaluation computes EE and TE metrics, allowing teams to track whether improvements stem from better extraction or better target adaptability. Skills are versioned and treated as trainable artifacts, enabling rollback and A/B testing identical to model deployment workflows.
Pitfall Guide
1. The Readability Trap
Explanation: Teams optimize skills for markdown formatting, bullet hierarchy, and prose fluency. Statistical analysis finds no significant effect of format variations on performance (p > 0.34).
Fix: Strip formatting constraints from extraction prompts. Prioritize concrete failure mechanisms and executable remedies over structural aesthetics.
2. The Success-Only Bias
Explanation: Curating experience pools with only successful trajectories removes negative signals. All-success pools consistently produce skills that fail to generalize to edge cases.
Fix: Implement domain-specific success/failure ratios. Embodied planning and complex tool-use domains require failure-heavy pools (60-80% failures) to capture invalid action boundaries. Spreadsheet and code domains benefit from success-heavy pools (70-80% successes) to reinforce correct procedural patterns.
3. The Model Strength Fallacy
Explanation: Assuming a top-tier execution model will automatically produce high-quality skills. Research shows lightweight models often outperform flagship models as extractors due to different capability profiles.
Fix: Decouple extractor and target models. Run cross-matrix evaluations to identify optimal extractor/target pairings. Do not default to the same model for both roles.
4. The Plausibility Judge Mirage
Explanation: Using an LLM to evaluate skill quality based on textual coherence. Unguided judges achieve 46.4% accuracy and actively invert utility on high-gap pairs, preferring fluent but ineffective instructions.
Fix: Replace textual judges with empirical rubrics. Score skills against the three validated dimensions: Failure Mechanism Encoding, Actionable Specificity, and High-Risk Action Blacklist. Validate rubric alignment against actual downstream performance before deployment.
5. Static Skill Deployment
Explanation: Treating skills as immutable prompts after initial validation. Skills reshape target model policies rather than triggering explicit tool calls, causing behavioral drift over time or across task distributions.
Fix: Implement consumption monitoring. Track policy shifts, tool-call patterns, and regression rates post-deployment. Version skills like model weights and maintain rollback capabilities.
6. Domain-Agnostic Experience Ratios
Explanation: Applying a uniform success/failure split across all domains. Different task types require different procedural signals.
Fix: Profile each domain empirically. Run ablation studies with varying ratios (0%/25%/50%/75%/100% success) and measure resulting skill quality. Lock in domain-specific profiles before scaling extraction.
7. Rubric Bloat
Explanation: Adding unvalidated dimensions to extraction prompts based on intuition. Plausibility-focused rubrics actively degrade performance (-0.59pp average regression).
Fix: Restrict extraction prompts to empirically validated dimensions only. Run automated rubric discovery pipelines to merge candidate features, measure better-rates, and prune dimensions that lack statistical correlation with utility.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low-resource prototyping | Manual curation + format optimization | Fast iteration, acceptable for internal tools where 25% negative transfer is tolerable | Low (minimal compute, fast turnaround) |
| High-stakes enterprise deployment | Empirically validated rubric extraction + consumption monitoring | Prevents negative transfer, ensures measurable utility gains, supports rollback | Medium-High (dual-model calls, evaluation overhead) |
| Multi-domain agent suite | Domain-specific experience profiling + cross-matrix extractor selection | Different domains require different success/failure ratios and optimal extractors | High (ablation studies, matrix evaluations) |
| Rapid skill iteration | SkillOpt-style continuous optimization loop | Treats skills as trainable artifacts, enables automated gradient-like improvements | Medium (requires monitoring infrastructure, but reduces manual tuning) |
Configuration Template
// skill-pipeline.config.ts

/** Model assigned to each pipeline role. */
const pipelineModels = {
  extractor: 'gemini-3.1-flash-lite', // Often outperforms larger models as extractor
  target: 'gpt-5.4',
  judge: null, // Disable unguided LLM judges; use empirical rubrics instead
};

/** Per-domain pool composition: success ratio plus the profile label it falls under. */
const pipelineExperienceProfiles = {
  'spreadsheet-automation': { ratio: 0.8, type: 'success_heavy' as const },
  'software-engineering': { ratio: 0.7, type: 'success_heavy' as const },
  'embodied-planning': { ratio: 0.2, type: 'failure_heavy' as const },
  'web-search': { ratio: 0.5, type: 'balanced' as const },
  'tool-calling': { ratio: 0.6, type: 'success_heavy' as const },
};

/** The three rubric dimensions injected into extraction prompts. */
const pipelineValidatedRubric = [
  'Failure Mechanism Encoding: Identify specific failure modes and their triggers',
  'Actionable Specificity: Provide executable guidance tailored to concrete situations',
  'High-Risk Action Blacklist: Explicitly name operations to avoid in context',
];

/** Deployment gates and the metrics to record. */
const pipelineEvaluation = {
  minPerformanceDelta: 0.02, // 2% threshold for deployment
  maxNegativeTransferRate: 0.15, // Reject skills exceeding 15% regression
  trackMetrics: ['extractionEfficacy', 'targetEvolvability', 'policyShift'],
};

/** Artifact versioning and retention settings. */
const pipelineVersioning = {
  strategy: 'semantic',
  retention: 10, // Keep last 10 versions for rollback
  artifactStore: 's3://agent-skills-bucket',
};

/** Complete pipeline configuration, assembled from the sections above. */
export const PipelineConfig = {
  models: pipelineModels,
  experienceProfiles: pipelineExperienceProfiles,
  validatedRubric: pipelineValidatedRubric,
  evaluation: pipelineEvaluation,
  versioning: pipelineVersioning,
};
Quick Start Guide
- Initialize the pipeline: Import the `KnowledgeDistillationPipeline` class and configure extractor/target models using the cross-matrix recommendations. Do not assume the execution model is optimal for extraction.
- Profile your domain: Run a quick ablation study with 4-8 experience pools varying success ratios (0.2 to 0.8). Measure resulting skill quality to lock in your domain-specific profile.
- Inject the validated rubric: Replace all generic extraction prompts with the three empirically validated dimensions. Remove formatting constraints and plausibility-focused instructions.
- Deploy with monitoring: Version the extracted skill, deploy to a staging environment, and track consumption metrics. Roll back immediately if negative transfer exceeds 15% or performance delta falls below 2%.
- Iterate continuously: Treat skills as trainable artifacts. Feed new trajectories into the pool, re-extract with the validated rubric, and measure EE/TE shifts. Optimize like model weights, not static documentation.