livekit/rtc-node';
import { PipelineRouter } from './pipeline-router';
/**
 * Subscribes to remote audio tracks on a room and forwards every frame read
 * from each track's `AudioStream` to a `PipelineRouter`.
 *
 * Subscriptions are installed from the constructor, so creating an instance is
 * enough to start consuming audio.
 */
export class VoiceAgentWorker {
  private router: PipelineRouter;
  /** Streams that are currently being consumed, keyed by the track's `sid`. */
  private activeStreams: Map<string, AudioStream> = new Map();

  /**
   * @param room Room whose track-subscription events drive this worker.
   */
  constructor(private room: Room) {
    this.router = new PipelineRouter();
    this.setupTrackSubscriptions();
  }

  /**
   * Registers the track-subscription listener. For every audio track it opens
   * an `AudioStream`, records it in `activeStreams`, and starts consuming it.
   */
  private setupTrackSubscriptions(): void {
    // NOTE(review): the event constant and the 'audio' kind literal are kept
    // exactly as written; confirm both against the SDK version in use.
    this.room.on(Track.RemoteTrackSubscribed, (track: Track) => {
      if (track.kind !== 'audio') {
        return;
      }

      const stream = new AudioStream(track);
      this.activeStreams.set(track.sid, stream);

      // Consumption runs in the background. Without the catch, a failure in the
      // stream or the router would surface as an unhandled promise rejection.
      void this.consumeAudioStream(stream)
        .catch((error: unknown) => {
          console.error(`Audio stream for track ${track.sid} failed:`, error);
        })
        .finally(() => {
          // Drop the finished stream, but only if it has not already been
          // replaced by a newer stream registered under the same sid.
          if (this.activeStreams.get(track.sid) === stream) {
            this.activeStreams.delete(track.sid);
          }
        });
    });
  }

  /**
   * Reads frames from `stream` until it ends, pushing each one to the router
   * and waiting for that push to settle before reading the next frame.
   *
   * @param stream Audio stream to drain.
   */
  private async consumeAudioStream(stream: AudioStream): Promise<void> {
    for await (const frame of stream) {
      await this.router.pushAudioFrame(frame);
    }
  }
}
**Rationale:** The SFU handles media routing, echo cancellation, and packet loss concealment. The worker focuses exclusively on inference orchestration. This separation prevents media server crashes from cascading into model API failures.
### Step 2: Voice Activity Detection & Endpointing
VAD must operate with strict trailing silence thresholds. Default configurations often wait 500–800ms of silence before triggering inference, which alone violates the latency budget. Production systems should configure endpointing at 200ms for high-tempo interactions or 300ms for structured call flows.
```typescript
import { VADConfig, EndpointDetector } from '@deepgram/voice-sdk';
// VAD / endpointing options. The AudioEndpointManager constructor below accepts a
// VADConfig; presumably this object is what gets passed to it — confirm at the call site.
const vadConfig: VADConfig = {
// 250 sits between the 200ms and 300ms endpointing values recommended in the text above.
// NOTE(review): presumably the trailing-silence window in milliseconds — verify against the SDK.
silenceThresholdMs: 250,
useEndpointing: true,
echoCancellation: true,
noiseSuppression: true
};
/**
 * Thin wrapper around an `EndpointDetector` that decides, frame by frame,
 * whether an endpoint should be triggered.
 */
export class AudioEndpointManager {
  private detector: EndpointDetector;

  /**
   * @param config Options handed unchanged to the `EndpointDetector` constructor.
   */
  constructor(config: VADConfig) {
    this.detector = new EndpointDetector(config);
  }

  /**
   * Feeds one frame to the detector.
   *
   * @param frame Audio samples for the frame to analyze.
   * @returns The result of `triggerEndpoint()` when the detector reports no
   *   speech for this frame and `isSilent()` is true; `false` otherwise.
   */
  public processFrame(frame: Float32Array): boolean {
    const speechDetected = this.detector.analyze(frame);

    // Speech in this frame: no endpoint, and the silence state is not consulted.
    if (speechDetected) {
      return false;
    }

    if (!this.detector.isSilent()) {
      return false;
    }

    return this.detector.triggerEndpoint();
  }
}
```
**Rationale:** Endpointing on silence rather than fixed timers prevents premature cuts during natural pauses. Echo cancellation is mandatory when the agent speaks simultaneously with the user, preventing feedback loops that corrupt STT accuracy.
### Step 3: Streaming STT Pipeline
Batch transcription introduces 600–1,200ms of delay before inference begins. Streaming STT with interim result filtering ensures the pipeline fires only on finalized utterances.
import { DeepgramClient } from '@deepgram/sdk';
import { LLMTrigger } from './llm-trigger';
/**
 * Streams audio chunks to a Deepgram live-transcription connection and hands
 * every non-empty final transcript to an `LLMTrigger`.
 */
export class StreamingTranscriptionEngine {
  private dgClient: DeepgramClient;
  private trigger: LLMTrigger;

  /**
   * @param apiKey Key used to construct the `DeepgramClient`.
   * @param llmTrigger Receiver for finalized transcripts.
   */
  constructor(apiKey: string, llmTrigger: LLMTrigger) {
    this.dgClient = new DeepgramClient(apiKey);
    this.trigger = llmTrigger;
  }

  /**
   * Opens a live connection, registers the transcript handler, then forwards
   * every chunk from `audioStream` to the connection in order.
   *
   * @param audioStream Source of audio chunks; the returned promise resolves
   *   once it is exhausted.
   */
  public async startStream(audioStream: AsyncIterable<Buffer>): Promise<void> {
    const liveOptions = {
      model: 'nova-2',
      punctuate: true,
      smart_format: true,
      endpointing: 300,
      interim_results: false
    };
    const connection = this.dgClient.listen.live(liveOptions);

    connection.on('transcript', (data) => {
      if (!data.is_final) {
        return;
      }

      // Missing channel, alternative, or transcript text all mean "nothing to forward".
      const rawText = data.channel?.alternatives?.[0]?.transcript;
      if (!rawText) {
        return;
      }

      const utterance = rawText.trim();
      if (utterance.length === 0) {
        return;
      }

      this.trigger.execute(utterance);
    });

    for await (const audioChunk of audioStream) {
      connection.send(audioChunk);
    }
  }
}
**Rationale:** Disabling `interim_results` reduces WebSocket payload overhead. The `endpointing=300` parameter aligns VAD and STT silence detection, eliminating redundant round-trips. Firing only on `is_final: true` prevents partial utterances from triggering incomplete LLM generations.
### Step 4: LLM Streaming & Context Management
First-token latency dominates the inference budget. Model selection must balance speed and reasoning depth. GPT-4o-mini (~150ms) and Claude Haiku 4.5 (~120–180ms) consistently outperform larger models in streaming scenarios. Context windows should be capped near 2,000 tokens to maintain linear first-token scaling.
import { OpenAI } from 'openai';
import { TTSSynthesizer } from './tts-synthesizer';
/**
 * Requests a streamed chat completion for a user utterance and relays each
 * non-empty content delta to a `TTSSynthesizer`.
 */
export class LLMStreamOrchestrator {
  private client: OpenAI;
  private tts: TTSSynthesizer;

  /**
   * @param apiKey Key used to construct the `OpenAI` client.
   * @param tts Synthesizer that receives the generated text.
   */
  constructor(apiKey: string, tts: TTSSynthesizer) {
    this.client = new OpenAI({ apiKey });
    this.tts = tts;
  }

  /**
   * Sends `userInput` as a single user message and pushes the streamed reply
   * to the synthesizer piece by piece, awaiting each push before reading on.
   *
   * @param userInput Text of the user's utterance.
   */
  public async generateResponse(userInput: string): Promise<void> {
    const request = {
      model: 'gpt-4o-mini',
      messages: [{ role: 'user', content: userInput }],
      stream: true,
      max_tokens: 300
    };
    const completion = await this.client.chat.completions.create(request);

    for await (const part of completion) {
      const text = part.choices[0]?.delta?.content;
      // Chunks without a choice, a delta, or any content carry nothing to speak.
      if (!text) {
        continue;
      }
      await this.tts.pushToken(text);
    }
  }
}
**Rationale:** Streaming tokens directly to TTS recovers 100–200ms by overlapping generation and synthesis. System prompts must exclude markdown formatting, as TTS engines vocalize syntax characters. Keeping `max_tokens` constrained prevents runaway generation that blocks subsequent user input.
### Step 5: Streaming TTS & Audio Playback
TTS latency varies dramatically by tier and configuration. ElevenLabs Flash tier delivers first-audio chunks in 60–100ms, while standard tiers exceed 200ms. Aggressive streaming optimization must be enabled to prevent buffer buildup.
import { ElevenLabsClient } from 'elevenlabs';
/**
 * Turns text into streamed audio through an `ElevenLabsClient` and buffers the
 * resulting chunks in an async queue for a separate consumer to read.
 */
export class TTSSynthesizer {
  private client: ElevenLabsClient;
  private audioQueue: AsyncQueue<Uint8Array>;

  /**
   * @param apiKey Key used to construct the `ElevenLabsClient`.
   */
  constructor(apiKey: string) {
    this.client = new ElevenLabsClient({ apiKey });
    this.audioQueue = new AsyncQueue();
  }

  /**
   * Issues one streamed synthesis request for `token` and enqueues every audio
   * chunk it produces, in arrival order.
   *
   * @param token Text to synthesize.
   */
  public async pushToken(token: string): Promise<void> {
    const request = {
      text: token,
      model_id: 'eleven_flash_v2_5',
      voice_id: 'premade',
      output_format: 'pcm_16000',
      stream: true,
      optimize_streaming_latency: 4
    };
    const synthesized = await this.client.generate(request);

    for await (const audioChunk of synthesized) {
      this.audioQueue.enqueue(audioChunk);
    }
  }

  /**
   * Endless generator over queued audio: each step awaits `dequeue()` and
   * yields its result. The loop has no exit condition of its own.
   */
  public async *readAudioChunks(): AsyncGenerator<Uint8Array> {
    for (;;) {
      const nextChunk = await this.audioQueue.dequeue();
      yield nextChunk;
    }
  }
}
**Rationale:** `optimize_streaming_latency=4` forces the TTS engine to prioritize chunk delivery over audio quality smoothing. PCM 16kHz output matches telephony and SFU expectations, avoiding unnecessary transcoding. The async queue decouples TTS generation from RTP packetization, preventing backpressure from stalling the pipeline.
Pitfall Guide
1. VAD Trailing Silence Misconfiguration
Explanation: Default VAD implementations wait 500–800ms of silence before triggering inference. This pause sits entirely in the user experience before any API call fires, instantly violating latency budgets.
Fix: Configure trailing silence to 200–300ms. Align VAD endpointing with STT server-side silence detection to prevent duplicate waiting periods.
2. Batch STT Bottleneck
Explanation: Sending complete audio clips to STT APIs introduces 600–1,200ms of delay before the LLM receives input. This alone makes sub-500ms end-to-end latency mathematically impossible.
Fix: Implement streaming STT with WebSocket connections. Filter for is_final: true events and discard interim results to reduce payload overhead.
3. LLM Output Buffering
Explanation: Waiting for the complete LLM response before initiating TTS synthesis adds 200–400ms of idle time. The pipeline remains blocked while tokens are generated server-side.
Fix: Stream LLM tokens directly to TTS as they arrive. Overlap generation and synthesis to recover 100–200ms of total latency.
4. ICE Candidate Gathering Delay
Explanation: Naive WebRTC implementations wait for all ICE candidates before signaling. This adds 500–2,000ms to call setup, invisible in demos but critical in production.
Fix: Implement ICE Trickle to send candidates as they are discovered. Use STUN for reflexive discovery and TURN for symmetric NAT traversal. Leverage SFU-hosted signaling to eliminate custom WebSocket servers.
5. Codec Mismatch (Opus vs G.711)
Explanation: PSTN calls use G.711 at 8kHz narrowband, which degrades STT accuracy by 15–25% compared to 16kHz+ wideband Opus. Teams often ignore this when routing telephony traffic through AI agents.
Fix: Use WebRTC for browser/mobile clients to preserve wideband audio. For PSTN integration, apply spectral enhancement or switch to narrowband-optimized STT models. Never mix codec expectations in the same pipeline.
6. PII Leakage in Streaming Logs
Explanation: Raw audio and transcribed text passing through multiple services creates compliance exposure. Default logging configurations often capture full utterances, violating HIPAA/GDPR boundaries.
Fix: Implement real-time PII redaction at the STT output stage. Route inference through isolated worker processes with ephemeral memory. Disable provider-side logging for audio payloads and enforce data retention policies.
7. Ignoring p95 Latency Metrics
Explanation: Optimizing for average latency masks tail failures. A system with 300ms p50 but 1,200ms p95 will cause frequent user drop-offs during network congestion or model throttling.
Fix: Monitor p95 and p99 latency separately from p50. Implement circuit breakers that fallback to cached responses or graceful degradation when latency exceeds 800ms. Use adaptive bitrate and token limits during peak load.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Browser/Mobile Web App | WebRTC + Opus + SFU | Wideband audio improves STT accuracy; ICE Trickle reduces setup latency | Medium (SFU hosting + TURN bandwidth) |
| PSTN/Contact Center Integration | SIP + Twilio Media Streams | Bridges legacy telephony without running full SIP stacks; handles G.711 natively | Low-Medium (per-minute telephony rates) |
| High-Volume Healthcare Scheduling | Streaming STT + GPT-4o-mini + Flash TTS | Balances speed, accuracy, and compliance; meets p95 <800ms target | Medium (model API + streaming tiers) |
| Low-Budget MVP | Batch STT + Sync LLM + File TTS | Simplifies architecture for testing; acceptable for non-real-time demos | Low (standard API pricing) |
| Enterprise NAT/Firewall Environments | WebRTC + Dedicated TURN + ICE Trickle | Ensures connectivity behind symmetric NAT; prevents media path failures | High (TURN egress bandwidth) |
Configuration Template
# voice-pipeline-config.yaml
# Settings for each pipeline stage, grouped by section.
# NOTE(review): nesting was reconstructed from flattened text, where keys such as
# `provider`, `model`, `streaming` and `endpointing` collided at the top level.
# Confirm the section boundaries, in particular whether `circuit_breaker` and
# `pii_redaction` belong under `monitoring` or at the top level.
media:
  transport: webrtc
  codec: opus
  sample_rate: 48000
  echo_cancellation: true
  noise_suppression: true

vad:
  silence_threshold_ms: 250
  endpointing: true
  trailing_silence_ms: 300

stt:
  provider: deepgram
  model: nova-2
  streaming: true
  punctuate: true
  smart_format: true
  endpointing: 300
  interim_results: false

llm:
  provider: openai
  model: gpt-4o-mini
  stream: true
  max_tokens: 300
  context_limit: 2000
  strip_markdown: true

tts:
  provider: elevenlabs
  model: eleven_flash_v2_5
  streaming: true
  optimize_latency: 4
  output_format: pcm_16000
  voice_id: preset

monitoring:
  # Latency targets in milliseconds (the Quick Start guide cites p50 <400ms and p95 <800ms).
  latency_targets:
    p50: 400
    p95: 800
    p99: 1200
  circuit_breaker:
    threshold_ms: 800
    fallback: graceful_degradation
  pii_redaction:
    enabled: true
    patterns: [ssn, credit_card, email, phone]
Quick Start Guide
- Initialize SFU Room: Provision a LiveKit room with ICE Trickle enabled. Configure STUN (`stun.l.google.com:19302`) and TURN endpoints for your region.
- Deploy Agent Worker: Run the TypeScript worker subscribing to audio tracks. Configure VAD trailing silence to 250ms and align STT endpointing to 300ms.
- Connect Streaming APIs: Point STT to Deepgram Nova-2 with streaming enabled. Route `is_final` transcripts to GPT-4o-mini with `stream: true`. Pipe tokens to ElevenLabs Flash tier with `optimize_streaming_latency: 4`.
- Validate Latency Budget: Use browser DevTools or SFU metrics to verify p50 <400ms and p95 <800ms. Adjust VAD thresholds or model parameters if tail latency exceeds targets.
- Enforce Compliance: Enable real-time PII redaction at the STT output stage. Disable provider-side logging for audio payloads and configure ephemeral worker memory for session isolation.