AI systems fail differently than traditional software—they don't crash cleanly with stack traces, they degrade subtly, hallucinate confidently, or drift silently over time. A single AI incident can erode months of user trust in minutes, making incident response for AI products fundamentally different from conventional software operations.
from dataclasses import dataclass
from datetime import datetime, timedelta
import numpy as np
from collections import deque


@dataclass
class AIOutputMetrics:
    """A single per-request observation of an AI system's output quality.

    One record is captured for each model response so downstream monitors
    can build rolling baselines and detect drift or degradation.
    """

    # When the output was produced.
    timestamp: datetime
    # End-to-end response latency in milliseconds.
    latency_ms: float
    # Model-reported confidence for the response.
    # NOTE(review): range not shown in this excerpt — presumably 0.0-1.0; confirm.
    confidence_score: float
    # Size of the generated output; units (tokens vs. characters) set by the caller.
    output_length: int
    # Explicit user signal: -1 negative, 0 neutral, 1 positive.
    user_feedback: int
import { MetricsClient } from './metrics';
import { AlertManager } from './alerts';

interface AIHealthMetrics {
  errorRate: number;
  latencyP99: number;
  confidenceP10: number;
  fallbackRate: number;
}

class AIIncidentDetector {
  private baseline: AIHealthMetrics;
interface DegradedResponse {
  result: any;
  degraded: boolean;
  degradationReason?: string;
  userMessage?: string;
}

class GracefulAIService {
  async processRequest(input: string): Promise<DegradedResponse> {
    // Try primary AI provider
    try {
      const result = await this.primaryProvider.process(input);
interface IncidentTimeline {
  detected: Date;
  acknowledged: Date;
  mitigated: Date;
  resolved: Date;
  events: TimelineEvent[];
}

class PostMortemCollector {
  async generatePostMortemData(incidentId: string): Promise<PostMortemReport> {
    const incident = await this.incidentStore.get(incidentId);