Shipping an AI product once is relatively straightforward—keeping it reliable as you iterate is where teams struggle. Evaluation pipelines transform quality assurance from a manual, error-prone process into an automated safety net that catches regressions before your users do.
```python
import json
import time
from dataclasses import dataclass
from typing import List, Dict, Any

from openai import OpenAI


@dataclass
class EvalResult:
    test_id: str
    passed: bool
    score: float
    latency_ms: float
```
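Building on the imports and `EvalResult` above, here is a minimal sketch of how a single test case might be scored. The `run_case` helper, the exact-match grader, the test-case dictionary format, and the model name are illustrative assumptions rather than part of the original pipeline.

```python
client = OpenAI()  # reads OPENAI_API_KEY from the environment


def grade_output(output: str, expected: str) -> float:
    """Toy grader: exact match stands in for a real rubric or LLM judge."""
    return 1.0 if output.strip() == expected.strip() else 0.0


def run_case(case: Dict[str, Any]) -> EvalResult:
    """Call the model once, time it, and fold the outcome into an EvalResult."""
    start = time.perf_counter()
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model; substitute your own
        messages=[{"role": "user", "content": case["input"]}],
    )
    latency_ms = (time.perf_counter() - start) * 1000
    output = response.choices[0].message.content or ""
    score = grade_output(output, case["expected"])
    return EvalResult(
        test_id=case["id"],
        passed=score >= 0.5,
        score=score,
        latency_ms=latency_ms,
    )
```

Collecting a list of these results is what the quality gates and regression checks below operate on.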
```python
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional
import statistics


class GateSeverity(Enum):
    CRITICAL = "critical"  # Blocks deployment
    WARNING = "warning"    # Flags for review
    INFO = "info"          # Logs only


@dataclass
class QualityGate:
    # The original listing cuts off here; these fields are an assumed completion,
    # modeled on the quality_gates.yaml config shown later (threshold, level, max_regression).
    name: str
    threshold: float
    severity: GateSeverity
    max_regression: Optional[float] = None
```
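To make the gate concrete, here is a small sketch of how one might be applied to a batch of eval scores. `check_gate` and the example numbers are illustrative assumptions that build on the partly assumed `QualityGate` fields above: it aggregates scores with `statistics.mean` and reports the severity so the caller can decide whether to block, warn, or merely log.

```python
def check_gate(gate: QualityGate, scores: List[float]) -> tuple:
    """Illustrative only: compare the mean score against the gate's threshold."""
    observed = statistics.mean(scores)
    passed = observed >= gate.threshold
    verdict = "pass" if passed else f"fail ({gate.severity.value})"
    return passed, f"{gate.name}: mean={observed:.3f} vs threshold={gate.threshold:.2f} -> {verdict}"


# Example: a critical accuracy gate over one batch of eval scores
accuracy_gate = QualityGate(
    name="accuracy",
    threshold=0.85,
    severity=GateSeverity.CRITICAL,
    max_regression=0.05,
)
ok, summary = check_gate(accuracy_gate, [0.91, 0.88, 0.80, 0.93])
print(summary)  # accuracy: mean=0.880 vs threshold=0.85 -> pass
```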
```python
import numpy as np
from scipy import stats
from typing import Tuple, Optional


def detect_regression(
    baseline_scores: list[float],
    current_scores: list[float],
    alpha: float = 0.05,
    min_effect_size: float = 0.02
) -> Tuple[bool, dict]:
    """
    Detect if current scores represent a significant regression.
    """
    # The body below is an assumed reconstruction: the original listing cuts off
    # after the docstring. A one-sided Welch's t-test plus a minimum mean drop
    # is one plausible reading of the signature.
    mean_drop = float(np.mean(baseline_scores) - np.mean(current_scores))
    t_stat, p_value = stats.ttest_ind(
        baseline_scores, current_scores, equal_var=False, alternative="greater"
    )
    is_regression = bool(p_value < alpha and mean_drop >= min_effect_size)
    return is_regression, {"p_value": float(p_value), "mean_drop": mean_drop,
                           "t_statistic": float(t_stat)}
```
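As a usage sketch (the numbers are invented purely for illustration, and the function body above is itself a reconstruction), the detector would typically compare per-test scores from the last known-good build against the current candidate:

```python
baseline = [0.90, 0.88, 0.92, 0.85, 0.91, 0.89, 0.90, 0.87]
candidate = [0.84, 0.83, 0.86, 0.80, 0.85, 0.82, 0.84, 0.81]

regressed, details = detect_regression(baseline, candidate)
if regressed:
    print(f"Regression: mean dropped {details['mean_drop']:.3f} (p={details['p_value']:.4f})")
else:
    print("No statistically significant regression.")
```

Requiring both statistical significance and a minimum effect size keeps the check from firing on noise while still catching drops too small to notice by eye.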
```yaml
name: AI Evaluation Pipeline

on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'src/ai/**'
      - 'evals/**'
  push:
    branches: [main]
  schedule:
    - cron: '0 */6 * * *'  # Every 6 hours

# The jobs block below is an assumed minimal completion; the original excerpt ends at the triggers.
jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: python evals/run_evals.py  # assumed entrypoint; a non-zero exit fails the check
```
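For the workflow to actually gate merges, the eval entrypoint only has to exit non-zero when a critical gate fails; GitHub Actions then marks the check, and therefore the pull request, as failing. A rough sketch of that glue follows; every name in it (`main`, `load_gates`, `run_suite`, `passes`) is hypothetical and not taken from the original.

```python
import sys


def main() -> int:
    """Hypothetical CI entrypoint: run the suite, apply gates, set the exit code."""
    gates = load_gates("config/quality_gates.yaml")   # e.g. parse the config shown further down
    results = run_suite("evals/cases.jsonl")          # run every eval case, collect results
    failed = [g for g in gates if not passes(g, results)]
    for gate in failed:
        print(f"Gate failed: {gate.name} ({gate.severity.value})")
    # Only CRITICAL gates block the merge; WARNING and INFO gates just report.
    return 1 if any(g.severity is GateSeverity.CRITICAL for g in failed) else 0


if __name__ == "__main__":
    sys.exit(main())
```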
```python
import json
import asyncio
from dataclasses import dataclass
from typing import List, Dict, Any
import openai
import time


@dataclass
class EvalResult:
    test_id: str
    input: str
    expected: str
    # The remaining fields are an assumed completion; the original excerpt ends at `expected`.
    actual: str = ""
    score: float = 0.0
    latency_ms: float = 0.0
```
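Since this variant pulls in `asyncio`, the natural payoff is running the whole suite concurrently instead of one case at a time. The sketch below assumes the completed fields above, a simple exact-match score, a test-case dictionary format, and a model name; none of those specifics come from the original.

```python
client = openai.AsyncOpenAI()      # async client; reads OPENAI_API_KEY from the environment
semaphore = asyncio.Semaphore(8)   # cap concurrent requests to stay under rate limits


async def eval_case(case: Dict[str, Any]) -> EvalResult:
    async with semaphore:
        start = time.perf_counter()
        response = await client.chat.completions.create(
            model="gpt-4o-mini",  # assumed model
            messages=[{"role": "user", "content": case["input"]}],
        )
        latency_ms = (time.perf_counter() - start) * 1000
    actual = response.choices[0].message.content or ""
    return EvalResult(
        test_id=case["id"],
        input=case["input"],
        expected=case["expected"],
        actual=actual,
        score=1.0 if actual.strip() == case["expected"].strip() else 0.0,
        latency_ms=latency_ms,
    )


async def evaluate_all(cases: List[Dict[str, Any]]) -> List[EvalResult]:
    return await asyncio.gather(*(eval_case(c) for c in cases))


# Usage: results = asyncio.run(evaluate_all(json.load(open("evals/cases.json"))))
```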
```yaml
# config/quality_gates.yaml
gates:
  accuracy:
    threshold: 0.85
    level: error
    max_regression: 0.05
  helpfulness:
    threshold: 0.80
    level: error
    max_regression: 0.05
  latency_p95_ms:
    threshold: 3000
```
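One way to wire this config into the `QualityGate` objects from earlier is a small loader. The sketch below is an assumption: it uses PyYAML, maps the config's `error`/`warning` levels onto the `GateSeverity` enum (which uses `critical` rather than `error`), and relies on the partly assumed `QualityGate` fields.

```python
import yaml

# Assumed bridge between the config's levels and the GateSeverity enum.
LEVEL_TO_SEVERITY = {
    "error": GateSeverity.CRITICAL,
    "warning": GateSeverity.WARNING,
    "info": GateSeverity.INFO,
}


def load_gates(path: str = "config/quality_gates.yaml") -> List[QualityGate]:
    """Parse the YAML file above into QualityGate objects."""
    with open(path) as f:
        raw = yaml.safe_load(f)
    return [
        QualityGate(
            name=name,
            threshold=spec["threshold"],
            severity=LEVEL_TO_SEVERITY.get(spec.get("level", "info"), GateSeverity.INFO),
            max_regression=spec.get("max_regression"),
        )
        for name, spec in raw["gates"].items()
    ]
```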
```python
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import sqlite3


# Database connection
@st.cache_resource
def get_db():
    return sqlite3.connect('evals.db', check_same_thread=False)
```
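To give a feel for what sits on top of that connection, here is a hedged sketch of one dashboard view: a score-over-time chart for a selected metric. The `eval_runs` table and its `run_at`/`metric`/`score` columns are assumptions about how results get written to `evals.db`, not a schema from the original.

```python
st.title("AI Evaluation Dashboard")

conn = get_db()
window_start = datetime.utcnow() - timedelta(days=30)
df = pd.read_sql_query(
    "SELECT run_at, metric, score FROM eval_runs WHERE run_at >= ?",
    conn,
    params=(window_start.isoformat(),),
)

metric = st.selectbox("Metric", sorted(df["metric"].unique()))
fig = px.line(
    df[df["metric"] == metric],
    x="run_at",
    y="score",
    title=f"{metric} over the last 30 days",
)
st.plotly_chart(fig, use_container_width=True)
```

Run it with `streamlit run dashboard.py` (the filename is arbitrary); each rerun re-queries the database, so the chart picks up new eval runs as CI writes them.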