Introduction
Testing AI applications differs fundamentally from testing deterministic software. Model outputs are probabilistic, edge cases are infinite, and a passing unit test does not guarantee correct behavior in production. A comprehensive AI testing strategy combines traditional software testing with AI-specific evaluation methodologies to catch regressions, hallucinations, and performance degradations before they reach users.
Evaluation Datasets
Curate high-quality evaluation datasets that reflect real-world usage:
from dataclasses import dataclass
from typing import List, Callable
import json
@dataclass
class TestCase:
id: str
input: str
expected_output: str
domain: str
difficulty: str # "easy", "medium", "hard"
tags: List[str]
# For non-deterministic evaluation
criteria: List[Callable[[str], bool]]
class EvalDataset:
def __init__(self, name: str):
self.name = name
self.test_cases: List[TestCase] = []
def add_golden_set(self, path: str):
"""Load curated golden test cases."""
with open(path) as f:
data = json.load(f)
for item in data:
self.test_cases.append(TestCase(
id=item["id"],
input=item["input"],
expected_output=item["expected_output"],
domain=item.get("domain", "general"),
difficulty=item.get("difficulty", "medium"),
tags=item.get("tags", []),
criteria=[
lambda output, expected=item["expected_output"]:
expected.lower() in output.lower(),
],
))
def add_adversarial(self, path: str):
"""Load adversarial test cases (edge cases, jailbreaks)."""
with open(path) as f:
data = json.load(f)
for item in data:
self.test_cases.append(TestCase(
id=f"adv_{item['id']}",
input=item["input"],
expected_output="",
domain="adversarial",
difficulty="hard",
tags=["adversarial"],
criteria=[
lambda output:
"I cannot" in output or "I'm unable" in output,
],
))
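For example, a suite can combine both loaders; the dataset name and JSON paths below are illustrative placeholders, not part of the class API:

# Illustrative usage; the dataset name and file paths are hypothetical.
suite = EvalDataset("customer_support_v1")
suite.add_golden_set("evals/golden_cases.json")
suite.add_adversarial("evals/adversarial_cases.json")
print(f"Loaded {len(suite.test_cases)} test cases")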
Regression Testing
Automated regression testing catches model behavior changes:
from datetime import datetime, timezone

class ModelRegressionTest:
def __init__(self, eval_dataset: EvalDataset):
self.dataset = eval_dataset
self.results_history: List[dict] = []
async def run_regression_suite(
self,
model_name: str,
previous_results: dict = None,
) -> dict:
results = {
"model": model_name,
"timestamp": datetime.utcnow().isoformat(),
"total": len(self.dataset.test_cases),
"passed": 0,
"failed": 0,
"failures": [],
"score": 0.0,
}
for test_case in self.dataset.test_cases:
try:
output = await self._invoke_model(model_name, test_case.input)
# Evaluate against all criteria
passed = all(
criterion(output) for criterion in test_case.criteria
)
if passed:
results["passed"] += 1
else:
results["failed"] += 1
results["failures"].append({
"id": test_case.id,
"input": test_case.input,
"expected": test_case.expected_output,
"actual": output,
"domain": test_case.domain,
})
except Exception as e:
results["failed"] += 1
results["failures"].append({
"id": test_case.id,
"error": str(e),
})
results["score"] = results["passed"] / results["total"]
# Compare with previous run
        if previous_results:
            score_delta = results["score"] - previous_results["score"]
            results["previous_score"] = previous_results["score"]
            results["score_delta"] = score_delta
            results["regression"] = score_delta < -0.02  # more than a 2-point drop
return results
def fail_pipeline_if_regression(self, results: dict):
"""Fail CI if model regressed beyond threshold."""
if results.get("regression", False):
raise Exception(
f"Model regression detected! "
f"Score dropped from "
f"{results.get('previous_score', 1.0):.2%} to "
f"{results['score']:.2%}"
)
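One way to wire this into CI is a pytest check that loads the previous run's results and fails the build on regression. The sketch below assumes pytest with pytest-asyncio, a hypothetical previous_results.json artifact saved by the last successful run, and a placeholder model name:

# test_model_regression.py -- minimal sketch; paths and model name are placeholders.
import json
import pytest

@pytest.mark.asyncio
async def test_no_model_regression():
    dataset = EvalDataset("golden_v1")
    dataset.add_golden_set("evals/golden_cases.json")

    runner = ModelRegressionTest(dataset)
    with open("previous_results.json") as f:  # saved by the last green run
        previous = json.load(f)

    results = await runner.run_regression_suite("my-model-v2", previous)
    runner.fail_pipeline_if_regression(results)  # raises on a >2 point score drop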
A/B Evaluation
Compare model versions side by side with structured evaluation:
import anthropic

class ABEvaluation:
    def __init__(self, judge_model: str = "claude-opus-4-20250514"):
        self.judge = judge_model
        self.client = anthropic.AsyncAnthropic()
async def evaluate_pair(
self,
prompt: str,
output_a: str,
output_b: str,
criteria: List[str],
) -> dict:
"""Use an independent judge model to compare outputs."""
evaluation_prompt = f"""
Compare these two AI responses to the same prompt.
Prompt: "{prompt}"
Response A: "{output_a}"
Response B: "{output_b}"
Evaluate on these criteria: {', '.join(criteria)}
For each criterion, state which response is better (A, B, or tie)
and provide a brief justification.
"""
        response = await self.client.messages.create(
model=self.judge,
max_tokens=1024,
messages=[{"role": "user", "content": evaluation_prompt}],
)
return self._parse_evaluation(response.content[0].text)
async def batch_evaluate(
self,
prompts: List[str],
model_a: str,
model_b: str,
criteria: List[str],
) -> dict:
results = {"model_a_wins": 0, "model_b_wins": 0, "ties": 0}
for prompt in prompts:
output_a = await self._invoke(model_a, prompt)
output_b = await self._invoke(model_b, prompt)
evaluation = await self.evaluate_pair(
prompt, output_a, output_b, criteria
)
            # Aggregate across criteria; map the judge's verdict onto result keys
            winner = evaluation["winner"]  # "model_a", "model_b", or "tie"
            if winner == "tie":
                results["ties"] += 1
            else:
                results[f"{winner}_wins"] += 1
results["total"] = len(prompts)
results["a_win_rate"] = results["model_a_wins"] / results["total"]
results["b_win_rate"] = results["model_b_wins"] / results["total"]
return results
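The _parse_evaluation and _invoke helpers are left unspecified above. One possible parsing helper is sketched below; it assumes the judge answers one line per criterion in the form "criterion: A|B|tie - justification", which the prompt does not strictly enforce, so a production version should ask the judge for a JSON verdict instead:

import re  # add at module top

    # Sketch of the parsing helper assumed by evaluate_pair(); the per-line
    # "criterion: A|B|tie" format is an assumption about the judge's output.
    def _parse_evaluation(self, judge_text: str) -> dict:
        votes = {"model_a": 0, "model_b": 0, "tie": 0}
        for match in re.finditer(r":\s*(A|B|Tie|tie)\b", judge_text):
            verdict = match.group(1).lower()
            votes[{"a": "model_a", "b": "model_b", "tie": "tie"}[verdict]] += 1
        winner = max(votes, key=votes.get)  # ties resolve to the first key listed
        return {"winner": winner, "votes": votes}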
Hallucination Detection
Automated hallucination checks extract factual claims and verify them against a trusted knowledge base:
class HallucinationDetector:
    def __init__(self, knowledge_base):  # any object exposing an async search(query) method
self.kb = knowledge_base
async def check_factual_claims(self, output: str) -> List[dict]:
"""Extract factual claims and verify them against a knowledge base."""
# 1. Extract atomic claims
claims = await self._extract_claims(output)
verified_claims = []
for claim in claims:
# 2. Search knowledge base
evidence = await self.kb.search(claim["text"])
# 3. Verify claim against evidence
verification = await self._verify_claim(
claim["text"],
evidence,
claim["context"],
)
verified_claims.append({
"claim": claim["text"],
"confidence": verification["confidence"],
"supported": verification["supported"],
"evidence": evidence[:3],
"context": claim["context"],
})
return verified_claims
async def _verify_claim(
self,
claim: str,
evidence: List[str],
context: str,
) -> dict:
prompt = f"""
Claim: "{claim}"
Context: "{context}"
Evidence: {' '.join(evidence[:3])}
Is this claim supported by the evidence?
Respond with JSON:
{{"supported": bool, "confidence": 0.0-1.0, "reasoning": "..."}}
"""
response = await self._llm_call(prompt)
return json.loads(response)
def compute_hallucination_rate(
self, verified_claims: List[dict]
) -> float:
unsupported = sum(
1 for c in verified_claims if not c["supported"]
)
total = len(verified_claims)
return unsupported / total if total > 0 else 0.0
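Illustrative usage from inside an async handler; vector_kb stands in for any object with an async search(query) method, and both the 10% threshold and flag_for_human_review are placeholders:

detector = HallucinationDetector(vector_kb)
claims = await detector.check_factual_claims(model_output)
rate = detector.compute_hallucination_rate(claims)
if rate > 0.10:  # example threshold; tune per application
    flag_for_human_review(model_output, claims)  # placeholder escalation hook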
Prompt Testing
Version-control prompts with structured testing:
class PromptRegistry:
def __init__(self):
self.prompts = {}
def register(
self,
name: str,
template: str,
version: str,
tests: List[Callable] = None,
):
self.prompts[name] = {
"template": template,
"version": version,
"tests": tests or [],
"performance": [],
}
async def test_prompt(
self, name: str, test_cases: List[dict]
) -> dict:
prompt = self.prompts[name]
results = []
for case in test_cases:
# Render template with test inputs
rendered = prompt["template"].format(**case["inputs"])
output = await self._invoke(rendered)
# Run tests
test_results = [
test(output) for test in prompt["tests"]
]
results.append({
"case": case["name"],
"output": output,
"tests_passed": all(test_results),
"test_details": test_results,
})
return {
"prompt": name,
"version": prompt["version"],
"pass_rate": sum(r["tests_passed"] for r in results) / len(results),
"results": results,
}
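A registration sketch, run from an async test harness; the template, version string, and checks are illustrative:

registry = PromptRegistry()
registry.register(
    name="summarize_ticket",
    template="Summarize this support ticket in at most {max_words} words:\n{ticket}",
    version="1.2.0",
    tests=[
        lambda out: len(out.split()) <= 60,         # respects the length budget
        lambda out: "as an ai" not in out.lower(),  # no boilerplate disclaimers
    ],
)
report = await registry.test_prompt("summarize_ticket", test_cases=[
    {"name": "refund_request", "inputs": {"max_words": 50, "ticket": "..."}},
])
print(f"{report['prompt']} v{report['version']}: {report['pass_rate']:.0%} passing")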
Performance Testing
Benchmark latency, throughput, and cost across model versions:
class AIPerformanceTest:
async def benchmark(
self,
model: str,
concurrency: int = 10,
requests: int = 100,
) -> dict:
import time
import asyncio
semaphore = asyncio.Semaphore(concurrency)
latencies = []
async def single_request():
async with semaphore:
start = time.monotonic()
await self._invoke_model(model, "Test prompt")
latencies.append(time.monotonic() - start)
        wall_start = time.monotonic()
        tasks = [single_request() for _ in range(requests)]
        await asyncio.gather(*tasks)
        wall_time = time.monotonic() - wall_start
        latencies.sort()
return {
"model": model,
"p50": latencies[len(latencies) // 2],
"p95": latencies[int(len(latencies) * 0.95)],
"p99": latencies[int(len(latencies) * 0.99)],
"avg": sum(latencies) / len(latencies),
"throughput": requests / sum(latencies),
"total_cost": self._calculate_cost(model, requests),
}
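A comparison run across two model versions might look like the following; the model names are placeholders, and _invoke_model and _calculate_cost are assumed to be implemented elsewhere on the class:

import asyncio

async def compare_models():
    perf = AIPerformanceTest()
    for model in ("candidate-model", "current-production-model"):
        report = await perf.benchmark(model, concurrency=10, requests=100)
        print(
            f"{model}: p95={report['p95']:.2f}s, "
            f"throughput={report['throughput']:.1f} req/s, "
            f"cost=${report['total_cost']:.2f}"
        )

asyncio.run(compare_models())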
A mature AI testing pipeline runs golden set regression on every PR, A/B evaluations before model upgrades, hallucination detection on every production response, and performance benchmarks weekly. No single metric captures model quality; combine automated tests with human evaluation for production releases.