f imports or helper definitions.
-- Input skeleton: target_lemma.lean
import Mathlib.Data.Real.Basic
import Mathlib.Algebra.Order.Field.Basic
-- EVOLVE-BLOCK begin
-- Agents may introduce auxiliary lemmas here
-- EVOLVE-BLOCK end
theorem convergence_bound_optimization (α : ℝ) (hα : 0 < α ∧ α < 1) :
∃ (schedule : ℕ → ℝ),
(∀ n, 0 < schedule n) ∧
(∀ ε > 0, ∃ N, ∀ n ≥ N, |schedule n - 0| < ε) := by
-- EVOLVE-BLOCK begin
sorry
-- EVOLVE-BLOCK end
Step 2: Implement the Orchestrator with Parallel Agent Pool
The orchestrator manages concurrent search trajectories. Each worker runs an independent LLM instance, interacts with the Lean compiler via standard I/O, and accumulates structured feedback. Parallelism is non-negotiable: proof search is highly non-deterministic, and running multiple trajectories simultaneously maximizes the probability of convergence within a fixed compute budget.
import { execSync } from 'child_process';
import { EventEmitter } from 'events';
interface CompilerFeedback {
isValid: boolean;
hasSorry: boolean;
failedTactic?: string;
goalState?: string;
errorMessage?: string;
}
interface AgentConfig {
id: string;
model: 'gemini-3.1-pro' | 'gemini-3.0-flash';
maxEpisodes: number;
temperature: number;
}
class ProofOrchestrator extends EventEmitter {
private pool: Map<string, AgentWorker> = new Map();
private compilerPath: string;
constructor(compilerPath: string) {
super();
this.compilerPath = compilerPath;
}
async spawnPool(configs: AgentConfig[], skeletonPath: string): Promise<string | null> {
const workers = configs.map(cfg => new AgentWorker(cfg, this.compilerPath));
workers.forEach(w => this.pool.set(w.id, w));
const results = await Promise.allSettled(
Array.from(this.pool.values()).map(w => w.run(skeletonPath))
);
const successfulProof = results.find(
r => r.status === 'fulfilled' && r.value !== null
);
return successfulProof?.status === 'fulfilled' ? successfulProof.value : null;
}
}
class AgentWorker {
readonly id: string;
private model: string;
private maxEpisodes: number;
private compilerPath: string;
private contextBuffer: string[] = [];
constructor(config: AgentConfig, compilerPath: string) {
this.id = config.id;
this.model = config.model;
this.maxEpisodes = config.maxEpisodes;
this.compilerPath = compilerPath;
}
async run(skeletonPath: string): Promise<string | null> {
let currentSketch = this.loadSkeleton(skeletonPath);
for (let episode = 0; episode < this.maxEpisodes; episode++) {
const prompt = this.buildPrompt(currentSketch, this.contextBuffer);
const generatedStep = await this.invokeLLM(prompt);
const feedback = await this.verifyWithCompiler(generatedStep);
if (feedback.isValid && !feedback.hasSorry) {
return generatedStep;
}
if (!feedback.isValid) {
this.contextBuffer.push(
`Episode ${episode}: tactic '${feedback.failedTactic}' rejected.\n` +
`Remaining goal: ${feedback.goalState}\n` +
`Compiler diagnostic: ${feedback.errorMessage}`
);
// Retain last 15 episodes to prevent context overflow
if (this.contextBuffer.length > 15) this.contextBuffer.shift();
}
currentSketch = generatedStep;
}
return null;
}
private async verifyWithCompiler(code: string): Promise<CompilerFeedback> {
try {
const output = execSync(`${this.compilerPath} --check`, {
input: code,
encoding: 'utf-8',
stdio: ['pipe', 'pipe', 'pipe']
});
return { isValid: true, hasSorry: false };
} catch (err: any) {
const stderr = err.stderr || '';
const matchSorry = stderr.includes('sorry');
const matchTactic = stderr.match(/tactic '([^']+)'/);
const matchGoal = stderr.match(/⊢ (.+)/);
return {
isValid: false,
hasSorry: matchSorry,
failedTactic: matchTactic?.[1],
goalState: matchGoal?.[1],
errorMessage: stderr.split('\n')[0]
};
}
}
private buildPrompt(sketch: string, history: string[]): string {
return `
You are refining a formal proof in Lean 4.
Current sketch:
${sketch}
Previous compiler feedback:
${history.join('\n---\n')}
Generate the next valid tactic sequence. Do not use 'sorry'.
Output only the Lean code block.
`;
}
private async invokeLLM(prompt: string): Promise<string> {
// Abstracted LLM call routing
const payload = { model: this.model, prompt, temperature: 0.7 };
const response = await fetch('https://api.gemini.internal/v1/generate', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
const data = await response.json();
return data.generated_code;
}
private loadSkeleton(path: string): string {
return `import Mathlib\n\n${require('fs').readFileSync(path, 'utf-8')}`;
}
}
Step 3: Apply Evolutionary Selection Without Gradients
Proof steps are discrete and non-differentiable. Gradient-based optimization fails here. Instead, the system tracks agent performance using Elo ratings and P-UCB (Probability Upper Confidence Bound) to allocate compute dynamically. Agents that consistently produce valid tactic sequences receive higher priority and longer context windows. This creates a fitness landscape where successful search patterns propagate without requiring backpropagation.
Architecture Rationale
- Parallel Pool over Sequential Search: Proof discovery is combinatorial. Running independent trajectories covers divergent reasoning paths. Early termination on first success minimizes wasted compute.
- Compiler as Ground Truth: Lean's type checker rejects invalid tactics deterministically. Structured error messages replace ambiguous natural-language critique.
- Model Routing: Heavy reasoning tasks route to
gemini-3.1-pro. Lightweight evaluation, rating, and context compression route to gemini-3.0-flash. This reduces latency and cost by ~60% compared to uniform model usage.
- Context Window Management: Compiler feedback accumulates rapidly. A sliding window with semantic compression prevents token overflow while preserving critical failure patterns.
Pitfall Guide
1. Treating Compiler Errors as Unstructured Text
Explanation: Parsing stderr as raw strings loses semantic structure. The verifier returns goal states, failed tactics, and type mismatches that require structured extraction.
Fix: Implement a regex/AST parser that maps compiler output to a CompilerFeedback interface. Route specific error types to targeted recovery strategies (e.g., type mismatch vs. tactic failure).
2. Unbounded Context Accumulation
Explanation: Feeding every compiler error into the LLM context window causes attention dilution and token overflow. The model begins ignoring recent, relevant feedback.
Fix: Maintain a rolling buffer of the last 10–15 episodes. Apply semantic compression: group similar failures, extract failure patterns, and discard redundant diagnostics.
3. Over-Constraining the Search Space
Explanation: Restricting EVOLVE-BLOCK regions too narrowly prevents agents from introducing necessary auxiliary lemmas or redefining helper functions.
Fix: Define block boundaries at the module level, not the tactic level. Allow agents to append definitions above the target theorem while protecting imports and existing verified lemmas.
4. Assuming Gradient-Based Optimization Applies
Explanation: Proof steps are discrete symbolic operations. Backpropagation cannot optimize tactic selection or lemma introduction.
Fix: Use evolutionary metrics like Elo ratings and P-UCB. Track success rates per agent, allocate compute proportionally, and prune low-performing trajectories early.
5. Ignoring Tactic Dependency Chains
Explanation: A failed tactic often invalidates subsequent steps. Rolling back only the last step leaves the proof in an inconsistent state.
Fix: Implement state checkpointing. When verification fails, revert to the last compiler-accepted state. Resume generation from that checkpoint rather than patching broken chains.
6. Single-Agent Bottlenecking
Explanation: Sequential exploration wastes time on dead-end reasoning paths. LLMs exhibit high variance; one trajectory's failure doesn't imply global impossibility.
Fix: Deploy a minimum of 8–12 parallel workers. Use early termination: halt the entire pool once a single worker returns a sorry-free proof.
7. Hardcoding Model Roles
Explanation: Assigning fixed models to fixed tasks ignores problem complexity. Simple proofs waste expensive model capacity; complex proofs starve lightweight models.
Fix: Implement dynamic routing. Route initial sketch generation and complex lemma discovery to gemini-3.1-pro. Route feedback parsing, rating, and context compression to gemini-3.0-flash. Adjust based on real-time success metrics.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|
| Open research conjecture | Parallel agentic pool + compiler verification | High combinatorial search space requires divergent trajectories | Moderate ($200–$500 per sweep) |
| Competition benchmark | Sequential single-agent with high temperature | Known solution space; parallelism adds unnecessary overhead | Low ($20–$50 per problem) |
| Internal code verification | Static analysis + lightweight LLM reviewer | Deterministic rules outperform generative search for syntax/type safety | Minimal (API credits) |
| Educational theorem proving | Guided human-in-the-loop with compiler hints | Learners benefit from structured feedback rather than autonomous discovery | Low (human time dominant) |
Configuration Template
orchestrator:
pool_size: 10
max_episodes: 60
early_termination: true
context_window_limit: 15
model_routing:
primary_reasoning:
model: gemini-3.1-pro
temperature: 0.7
max_tokens: 4096
evaluation:
model: gemini-3.0-flash
temperature: 0.1
max_tokens: 1024
evolutionary:
selection_method: p_ucb
elo_decay_rate: 0.95
compute_allocation: proportional_to_rating
compiler:
path: /usr/local/bin/lean
check_flag: --check
error_parsing: structured_ast
rollback_strategy: last_valid_checkpoint
monitoring:
cost_tracking: true
latency_threshold_ms: 5000
alert_on_context_overflow: true
Quick Start Guide
- Install Lean 4: Run
curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh and verify with lean --version.
- Scaffold the Project: Create a
target_theorem.lean file containing your theorem statement, EVOLVE-BLOCK markers, and a sorry placeholder.
- Launch the Orchestrator: Execute the TypeScript runtime with the configuration template. The pool will spawn, interact with the Lean compiler, and iterate until a valid proof emerges or the episode budget expires.
- Monitor & Extract: Watch structured logs for compiler feedback, agent ratings, and cost accumulation. Upon success, extract the
sorry-free proof and generate a natural-language strategy summary using the evaluation model.