In the world of LLM applications, context is both your most powerful asset and your most expensive resource. Every token you send to a model costs money, adds latency, and competes for attention within the model's finite context window. The first step toward spending that window deliberately is to treat it as an explicit budget, carved up per query before any context is assembled:
```typescript
interface TokenBudget {
  total: number;
  systemPrompt: number;
  examples: number;
  retrievedDocs: number;
  conversationHistory: number;
  userContext: number;
  buffer: number;
}

function allocateTokenBudget(maxTokens: number, queryType: string): TokenBudget {
  // Different query types get different allocations; the ratios below are illustrative defaults.
  const docHeavy = queryType === "retrieval" || queryType === "research";
  return {
    total: maxTokens,
    systemPrompt: Math.floor(maxTokens * 0.1),
    examples: Math.floor(maxTokens * (docHeavy ? 0.05 : 0.15)),
    retrievedDocs: Math.floor(maxTokens * (docHeavy ? 0.45 : 0.3)),
    conversationHistory: Math.floor(maxTokens * 0.2),
    userContext: Math.floor(maxTokens * (docHeavy ? 0.1 : 0.15)),
    buffer: Math.floor(maxTokens * 0.1),
  };
}
```
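For instance, a retrieval-heavy query against an 8K-token limit could be split like this (the limit and the query type are just example inputs, and the resulting numbers follow from the illustrative ratios above):

```typescript
// Carve up an 8K window for a retrieval-heavy query.
const budget = allocateTokenBudget(8000, "retrieval");
console.log(budget.retrievedDocs);        // 3600 tokens reserved for documents
console.log(budget.conversationHistory);  // 1600 tokens for recent turns
```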
```python
from dataclasses import dataclass
from datetime import datetime, timedelta

import numpy as np


@dataclass
class ContextCandidate:
    content: str
    embedding: np.ndarray
    created_at: datetime
    source_authority: float  # 0-1 scale
    user_interaction_count: int
```
```typescript
interface CacheEntry {
  queryEmbedding: number[];
  context: string;
  response: string;
  timestamp: Date;
  hitCount: number;
}

class SemanticContextCache {
  private cache: Map<string, CacheEntry> = new Map();
  private similarityThreshold = 0.95;
  private maxAge = 3600000; // 1 hour
}
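The lookup side of such a cache just compares the embedding of the incoming query against the stored query embeddings and returns a cached response when a fresh entry is similar enough. A minimal sketch, assuming embeddings are plain number arrays; the `cosineSimilarity` and `findCachedResponse` helpers are illustrative names, not part of the class above:

```typescript
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Return a cached response when a stored query is close enough and the entry has not expired.
function findCachedResponse(
  cache: Map<string, { queryEmbedding: number[]; response: string; timestamp: Date }>,
  queryEmbedding: number[],
  threshold = 0.95,
  maxAgeMs = 3600000
): string | null {
  const now = Date.now();
  for (const entry of cache.values()) {
    const fresh = now - entry.timestamp.getTime() < maxAgeMs;
    if (fresh && cosineSimilarity(queryEmbedding, entry.queryEmbedding) >= threshold) {
      return entry.response;
    }
  }
  return null;
}
```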
```typescript
interface TokenBudget {
  total: number;
  allocated: Map<string, number>;
  used: Map<string, number>;
}

interface ContextSource {
  id: string;
  priority: number;
  minTokens: number;
  maxTokens: number;
  content: string;
}
```
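One simple way to put these structures to work is a greedy allocator: sort sources by priority, guarantee each one its minimum, then distribute whatever remains up to each source's maximum. The sketch below assumes exactly that policy; the `allocateAcrossSources` name and the two-pass approach are illustrative rather than prescribed:

```typescript
function allocateAcrossSources(total: number, sources: ContextSource[]): TokenBudget {
  const budget: TokenBudget = { total, allocated: new Map(), used: new Map() };
  const byPriority = [...sources].sort((a, b) => b.priority - a.priority);

  // First pass: reserve each source's minimum so nothing is starved.
  let remaining = total;
  for (const src of byPriority) {
    const min = Math.min(src.minTokens, remaining);
    budget.allocated.set(src.id, min);
    remaining -= min;
  }

  // Second pass: top up high-priority sources toward their maximum.
  for (const src of byPriority) {
    if (remaining <= 0) break;
    const current = budget.allocated.get(src.id) ?? 0;
    const extra = Math.min(src.maxTokens - current, remaining);
    budget.allocated.set(src.id, current + extra);
    remaining -= extra;
  }
  return budget;
}
```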
```python
from dataclasses import dataclass
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer


@dataclass
class ContextChunk:
    id: str
    content: str
    source: str
    timestamp: float
    metadata: dict
```
```typescript
class ContextOptimizationPipeline {
  private cache: SemanticCache;
  private scorer: HybridRelevanceScorer;
  private compressor: TieredCompressor;
  private budgetManager: TokenBudgetManager;
  private metrics: MetricsCollector;

  async assembleContext(query: string, userId: string): Promise<OptimizedContext> {
    const startTime = Date.now();
    // Check semantic cache first: a near-duplicate query skips the rest of the pipeline.
    const cached = await this.cache.findSimilar(query, 0.92);
    if (cached) return cached;
    // Otherwise score, compress, and fit to budget (collaborator method names are illustrative).
    const candidates = await this.scorer.rank(query, userId);
    const compressed = await this.compressor.compress(candidates);
    const context = this.budgetManager.fit(compressed);
    this.metrics.recordAssembly(Date.now() - startTime);
    return context;
  }
}
```
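Once the pieces exist, callers only interact with a single entry point. A hypothetical wiring, with constructor details elided; the query string and user id are example values:

```typescript
// Hypothetical wiring; each collaborator would be built with your own embedder,
// scorer weights, compression tiers, and budget limits.
const pipeline = new ContextOptimizationPipeline();

const context = await pipeline.assembleContext(
  "How do I rotate my API keys?", // user query
  "user-4821"                     // user id, used for personalization signals
);
// `context` is ready to be placed into the prompt sent to the model.
```

Keeping the cache check at the top means the expensive stages, scoring and compression, only run on genuinely new queries.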