Traditional software testing assumes deterministic outputs—run the same input, get the same output, every time. AI products shatter this assumption completely, creating a testing paradox that paralyzes many teams.
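To see the paradox concretely, compare a conventional unit test with a property-based check on an AI response. The sketch below is illustrative only: `add_tax` and the `ai_client.summarize()` fixture are hypothetical stand-ins, not part of any specific library.

```python
# Deterministic code: the same input always produces the same output,
# so an exact-match assertion is meaningful.
def add_tax(price: float, rate: float = 0.08) -> float:
    return round(price * (1 + rate), 2)


def test_add_tax():
    assert add_tax(100.0) == 108.0


# An AI model can phrase its answer differently on every call, so the test
# has to assert properties of the output rather than its exact text.
def test_summary_properties(ai_client):  # ai_client: hypothetical fixture wrapping your model
    summary = ai_client.summarize("The quarterly report shows revenue grew 12%...")
    assert summary.strip()                    # non-empty
    assert len(summary) <= 500                # respects the length budget
    assert "revenue" in summary.lower()       # keeps the key fact
```

Exact-match assertions break the moment the model rephrases an answer; property checks like these survive rewording while still catching real failures.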
```python
import pytest
import asyncio
from your_ai_client import AIClient

client = AIClient()


class TestAISmokeTests:
    """Fast smoke tests - run on every commit"""

    @pytest.mark.asyncio  # requires the pytest-asyncio plugin
    @pytest.mark.timeout(5)
    async def test_basic_completion_works(self):
        """AI responds to basic prompt"""
        # Minimal sketch of the assertion: assumes the client exposes an async
        # complete() that returns text. Adjust the call to your client's real API.
        response = await client.complete("Say hello in one short sentence.")
        assert isinstance(response, str)
        assert response.strip()
```
```python
import pytest
import time
from typing import Any
import json


class AISmokeTests:
    """Run in <30 seconds on every commit"""

    def __init__(self, model_client):
        self.client = model_client
        self.timeout = 10  # seconds per test

    def check_basic_response(self) -> dict[str, Any]:
        """Single smoke check; assumes a synchronous client.complete() that returns text."""
        start = time.time()
        response = self.client.complete("Reply with the word OK.")
        elapsed = time.time() - start
        return {
            "passed": bool(response) and elapsed < self.timeout,
            "latency_seconds": round(elapsed, 2),
        }
```
```python
from dataclasses import dataclass
from enum import Enum
from typing import List, Dict
import statistics


class RegressionSeverity(Enum):
    CRITICAL = "critical"  # Block deployment
    MAJOR = "major"        # Investigate first
    MINOR = "minor"        # Monitor closely
    NOISE = "noise"        # Ignore


@dataclass
class RegressionReport:
    # Illustrative assumption: a report pairing baseline and current scores
    # for one metric; rename the fields to fit your evaluation pipeline.
    metric_name: str
    baseline_scores: List[float]
    current_scores: List[float]
    severity: RegressionSeverity

    def mean_delta(self) -> float:
        """Mean change vs. baseline (negative means a regression)."""
        return statistics.mean(self.current_scores) - statistics.mean(self.baseline_scores)
```
```python
import pytest
import asyncio
from typing import Literal
import time


class AISmokeTester:
    def __init__(self, client, timeout: float = 10.0):
        self.client = client
        self.timeout = timeout
        self.results = []

    async def test_endpoint(
        self, name: str, prompt: str
    ) -> Literal["pass", "fail", "timeout"]:
        # The full signature and body are an assumed sketch; adapt the call to your client's API.
        try:
            response = await asyncio.wait_for(
                self.client.complete(prompt), timeout=self.timeout
            )
            status = "pass" if response else "fail"
        except asyncio.TimeoutError:
            status = "timeout"
        self.results.append({"name": name, "status": status})
        return status
```
```python
import json
from dataclasses import dataclass
from enum import Enum
import hashlib


class TestCategory(Enum):
    REGRESSION = "regression"
    EDGE_CASE = "edge_case"
    ADVERSARIAL = "adversarial"


@dataclass
class GoldenExample:
    # Fields below are an assumed sketch of a golden-dataset entry; rename to fit your schema.
    prompt: str
    expected_behavior: str
    category: TestCategory

    @property
    def example_id(self) -> str:
        """Stable ID so an example keeps its identity as the dataset evolves."""
        return hashlib.sha256(self.prompt.encode("utf-8")).hexdigest()[:12]
```
```python
from pydantic import BaseModel
from typing import Literal


class JudgmentCriteria(BaseModel):
    name: str
    description: str
    score: Literal[1, 2, 3, 4, 5]
    reasoning: str


class JudgmentResult(BaseModel):
    criteria_scores: list[JudgmentCriteria]
    overall_score: float
```
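As a usage sketch, assuming the judge model has been instructed to return JSON in this shape, the raw output can be validated directly into these models with pydantic v2's `model_validate_json`:

```python
# Assumes the judge's raw response is JSON matching JudgmentResult (pydantic v2).
raw_judge_output = """
{
  "criteria_scores": [
    {"name": "accuracy", "description": "Response is factually correct",
     "score": 4, "reasoning": "One minor date error."}
  ],
  "overall_score": 4.0
}
"""

result = JudgmentResult.model_validate_json(raw_judge_output)
print(result.overall_score)             # 4.0
print(result.criteria_scores[0].score)  # 4
```

Constraining `score` to `Literal[1, 2, 3, 4, 5]` means an out-of-range or fractional per-criterion score fails validation loudly instead of silently skewing averages.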