Building AI agents is only half the battle; understanding whether they actually work is where production systems succeed or fail. Unlike traditional software, where unit tests provide binary pass/fail results, agent evaluation requires measuring nuanced behaviors across task completion, reasoning quality, tool usage efficiency, and cost effectiveness.
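Before walking through the individual metrics, here is a minimal, illustrative sketch of how those four dimensions might be held together in a single result object. The class name, field names, and weights are assumptions for orientation only, not a prescribed schema.

```python
from dataclasses import dataclass


@dataclass
class EvaluationScore:
    """Illustrative container for the four dimensions above (all scores in 0-1)."""
    task_completion: float
    reasoning_quality: float
    tool_efficiency: float
    cost_effectiveness: float

    def overall(self) -> float:
        # Placeholder weights; tune per application.
        weights = (0.4, 0.3, 0.2, 0.1)
        dims = (self.task_completion, self.reasoning_quality,
                self.tool_efficiency, self.cost_effectiveness)
        return sum(w * d for w, d in zip(weights, dims))


# Example: EvaluationScore(0.9, 0.8, 0.7, 0.6).overall() -> 0.8
```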
```python
from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum
import json


class CompletionStatus(Enum):
    FULL = "full"
    PARTIAL = "partial"
    FAILED = "failed"
    ESCALATED = "escalated"


@dataclass
class TaskCompletionRecord:
    # The original listing breaks off after "@dataclass"; the fields below are
    # an assumed reconstruction of a per-task completion record.
    task_id: str
    status: CompletionStatus
    expected_outcome: Optional[str] = None
    actual_outcome: Optional[str] = None
```
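As a usage sketch, reusing `CompletionStatus` and the `List` import from the block above, individual statuses can be rolled up into the percentage that the dashboard at the end of this section reports as `TaskCompletionRate`. The half credit given to partial completions is an assumption, not something prescribed here.

```python
def completion_rate(statuses: List[CompletionStatus]) -> float:
    """Percentage of tasks completed, counting partial completions at half credit."""
    if not statuses:
        return 0.0
    credit = {
        CompletionStatus.FULL: 1.0,
        CompletionStatus.PARTIAL: 0.5,   # assumed half credit
        CompletionStatus.FAILED: 0.0,
        CompletionStatus.ESCALATED: 0.0,
    }
    return 100.0 * sum(credit[s] for s in statuses) / len(statuses)
```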
```python
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from datetime import datetime
import json
import boto3
from enum import Enum


class ActionType(Enum):
    TOOL_CALL = "tool_call"
    REASONING = "reasoning"
    OBSERVATION = "observation"
    DECISION = "decision"
```
```python
import asyncio
import json
import time
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from datetime import datetime
import boto3
from enum import Enum


class TaskDifficulty(Enum):
    SIMPLE = 1
    MODERATE = 2
    COMPLEX = 3   # completes the truncated listing; mirrors the "complex"
    EXPERT = 4    # and "expert" TaskCategory values in the dashboard below
```
```python
import anthropic
import hashlib
import json
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import boto3
from functools import lru_cache


@dataclass
class JudgmentResult:
    dimension: str
    score: float  # 0-1
```
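The listing above ends before the judge itself appears, so the snippet below is only a minimal sketch of how a `JudgmentResult` might be produced with the Anthropic Python SDK, reusing `anthropic` and `JudgmentResult` from the block above. The prompt, model id, and score parsing are illustrative placeholders, not the implementation discussed here.

```python
def judge_dimension(client: anthropic.Anthropic, dimension: str,
                    task: str, agent_output: str) -> JudgmentResult:
    """Ask an LLM judge to rate one dimension of an agent output on a 0-1 scale."""
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",  # illustrative model id
        max_tokens=200,
        messages=[{
            "role": "user",
            "content": (
                f"Rate the following agent output on '{dimension}' from 0.0 to 1.0.\n"
                f"Task: {task}\nOutput: {agent_output}\n"
                "Reply with only a number."
            ),
        }],
    )
    return JudgmentResult(dimension=dimension,
                          score=float(response.content[0].text.strip()))
```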
123456789101112{ "AWSTemplateFormatVersion": "2010-09-09", "Description": "Agent Evaluation Metrics Dashboard", "Resources": { "AgentEvaluationDashboard": { "Type": "AWS::CloudWatch::Dashboard", "Properties": { "DashboardName": "AgentEvaluation-Production", "DashboardBody": { "Fn::Sub": "{\"widgets\":[{\"type\":\"metric\",\"x\":0,\"y\":0,\"width\":8,\"height\":6,\"properties\":{\"title\":\"Task Completion Rate\",\"metrics\":[[\"AgentMetrics\",\"TaskCompletionRate\",\"Environment\",\"production\",{\"stat\":\"Average\",\"period\":300}],[\".\",\"TaskCompletionRate\",\".\",\".\",{\"stat\":\"p50\"}],[\".\",\".\",\".\",\".\",{\"stat\":\"p90\"}]],\"region\":\"${AWS::Region}\",\"yAxis\":{\"left\":{\"min\":0,\"max\":100}}}},{\"type\":\"metric\",\"x\":8,\"y\":0,\"width\":8,\"height\":6,\"properties\":{\"title\":\"Cost per Task (USD)\",\"metrics\":[[\"AgentMetrics\",\"CostPerTask\",\"TaskCategory\",\"simple\"],[\".\",\".\",\".\",\"moderate\"],[\".\",\".\",\".\",\"complex\"],[\".\",\".\",\".\",\"expert\"]],\"region\":\"${AWS::Region}\",\"stat\":\"Average\",\"period\":3600}},{\"type\":\"metric\",\"x\":16,\"y\":0,\"width\":8,\"height\":6,\"properties\":{\"title\":\"Trajectory Efficiency\",\"metrics\":[[\"AgentMetrics\",\"TrajectoryEfficiency\",\"Environment\",\"production\"]],\"region\":\"${AWS::Region}\",\"stat\":\"Average\",\"period\":300,\"annotations\":{\"horizontal\":[{\"value\":0.7,\"label\":\"Target\"}]}}},{\"type\":\"metric\",\"x\":0,\"y\":6,\"width\":12,\"height\":6,\"properties\":{\"title\":\"Tool Usage Accuracy by Tool\",\"metrics\":[[\"AgentMetrics\",\"ToolAccuracy\",\"ToolName\",\"database_query\"],[\".\",\".\",\".\",\"api_call\"],[\".\",\".\",\".\",\"file_operation\"],[\".\",\".\",\".\",\"calculation\"]],\"region\":\"${AWS::Region}\",\"stat\":\"Average\",\"period\":3600}},{\"type\":\"metric\",\"x\":12,\"y\":6,\"width\":12,\"height\":6,\"properties\":{\"title\":\"Failure Categories\",\"metrics\":[[\"AgentMetrics\",\"FailureCount\",\"Category\",\"tool_error\"],[\".\",\".\",\".\",\"timeout\"],[\".\",\".\",\".\",\"invalid_output\"],[\".\",\".\",\".\",\"safety_violation\"]],\"region\":\"${AWS::Region}\",\"stat\":\"Sum\",\"period\":3600}}]}" } }