Every interaction with a large language model begins with a fundamental transformation: your text becomes numbers, those numbers flow through attention mechanisms, and meaning emerges from mathematical operations performed billions of times per second. Understanding these mechanics isn't just academic curiosity—it's the foundation of effective context engineering.
import tiktoken

def count_tokens(text: str, model: str = "gpt-4") -> dict:
    """Count tokens and estimate costs before making API calls."""
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    # GPT-4 pricing as of 2024
    input_cost_per_1k = 0.03
    return {
        "token_count": len(tokens),
from dataclasses import dataclass
from typing import List, Optional
from enum import Enum

class AttentionZone(Enum):
    PRIMARY_START = "primary_start"  # First 500 tokens
    PRIMARY_END = "primary_end"      # Last 300 tokens
    SECONDARY = "secondary"          # Positions 500-1000 and -600 to -300
    COMPRESSION = "compression"      # Middle zone

@dataclass
class ContextBlock:
import tiktoken
from typing import List, Dict, Optional
from dataclasses import dataclass

@dataclass
class ContextSegment:
    content: str
    priority: int  # 1 = critical, 2 = important, 3 = helpful
    position: str  # 'start', 'middle', 'end'

class TokenAwareContextBuilder:
    def __init__(self, model: str = 'gpt-4', max_tokens: int = 8000):
interface RetrievedChunk {
  content: string;
  relevanceScore: number;
  tokenCount: number;
  source: string;
}

interface ContextConfig {
  maxTokens: number;
  reserveForQuery: number;
  reserveForResponse: number;
  boundaryBoost: number; // Extra weight for boundary positions
from typing import List, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer

class SemanticContextCompressor:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.encoder = SentenceTransformer(model_name)

    def compress_context(
        self,
        documents: List[str],
        query: str,