Testing Strategies for AI Applications


Introduction





Testing AI applications differs fundamentally from testing deterministic software. Model outputs are probabilistic, edge cases are infinite, and a passing unit test does not guarantee correct behavior in production. A comprehensive AI testing strategy combines traditional software testing with AI-specific evaluation methodologies to catch regressions, hallucinations, and performance degradations before they reach users.





Evaluation Datasets





Curate high-quality evaluation datasets that reflect real-world usage:






from dataclasses import dataclass


from typing import List, Callable


import json




@dataclass


class TestCase:


id: str


input: str


expected_output: str


domain: str


difficulty: str # "easy", "medium", "hard"


tags: List[str]


# For non-deterministic evaluation


criteria: List[Callable[[str], bool]]




class EvalDataset:


def __init__(self, name: str):


self.name = name


self.test_cases: List[TestCase] = []




def add_golden_set(self, path: str):


"""Load curated golden test cases."""


with open(path) as f:


data = json.load(f)


for item in data:


self.test_cases.append(TestCase(


id=item["id"],


input=item["input"],


expected_output=item["expected_output"],


domain=item.get("domain", "general"),


difficulty=item.get("difficulty", "medium"),


tags=item.get("tags", []),


criteria=[


lambda output, expected=item["expected_output"]:


expected.lower() in output.lower(),


],


))




def add_adversarial(self, path: str):


"""Load adversarial test cases (edge cases, jailbreaks)."""


with open(path) as f:


data = json.load(f)


for item in data:


self.test_cases.append(TestCase(


id=f"adv_{item['id']}",


input=item["input"],


expected_output="",


domain="adversarial",


difficulty="hard",


tags=["adversarial"],


criteria=[


lambda output:


"I cannot" in output or "I'm unable" in output,


],


))







Regression Testing





Automated regression testing catches model behavior changes:






class ModelRegressionTest:


def __init__(self, eval_dataset: EvalDataset):


self.dataset = eval_dataset


self.results_history: List[dict] = []




async def run_regression_suite(


self,


model_name: str,


previous_results: dict = None,


) -> dict:


results = {


"model": model_name,


"timestamp": datetime.utcnow().isoformat(),


"total": len(self.dataset.test_cases),


"passed": 0,


"failed": 0,


"failures": [],


"score": 0.0,


}




for test_case in self.dataset.test_cases:


try:


output = await self._invoke_model(model_name, test_case.input)




# Evaluate against all criteria


passed = all(


criterion(output) for criterion in test_case.criteria


)




if passed:


results["passed"] += 1


else:


results["failed"] += 1


results["failures"].append({


"id": test_case.id,


"input": test_case.input,


"expected": test_case.expected_output,


"actual": output,


"domain": test_case.domain,


})


except Exception as e:


results["failed"] += 1


results["failures"].append({


"id": test_case.id,


"error": str(e),


})




results["score"] = results["passed"] / results["total"]




# Compare with previous run


if previous_results:


score_delta = results["score"] - previous_results["score"]


results["regression"] = score_delta < -0.02


results["score_delta"] = score_delta




return results




def fail_pipeline_if_regression(self, results: dict):


"""Fail CI if model regressed beyond threshold."""


if results.get("regression", False):


raise Exception(


f"Model regression detected! "


f"Score dropped from "


f"{results.get('previous_score', 1.0):.2%} to "


f"{results['score']:.2%}"


)







A/B Evaluation





Compare model versions side by side with structured evaluation:






class ABEvaluation:


def __init__(self, judge_model: str = "claude-opus-4-20260512"):


self.judge = judge_model




async def evaluate_pair(


self,


prompt: str,


output_a: str,


output_b: str,


criteria: List[str],


) -> dict:


"""Use an independent judge model to compare outputs."""


evaluation_prompt = f"""


Compare these two AI responses to the same prompt.




Prompt: "{prompt}"




Response A: "{output_a}"


Response B: "{output_b}"




Evaluate on these criteria: {', '.join(criteria)}




For each criterion, state which response is better (A, B, or tie)


and provide a brief justification.


"""




response = client.messages.create(


model=self.judge,


max_tokens=1024,


messages=[{"role": "user", "content": evaluation_prompt}],


)




return self._parse_evaluation(response.content[0].text)




async def batch_evaluate(


self,


prompts: List[str],


model_a: str,


model_b: str,


criteria: List[str],


) -> dict:


results = {"model_a_wins": 0, "model_b_wins": 0, "ties": 0}




for prompt in prompts:


output_a = await self._invoke(model_a, prompt)


output_b = await self._invoke(model_b, prompt)




evaluation = await self.evaluate_pair(


prompt, output_a, output_b, criteria


)




# Aggregate across criteria


winner = evaluation["winner"]


results[f"{winner}_wins"] += 1




results["total"] = len(prompts)


results["a_win_rate"] = results["model_a_wins"] / results["total"]


results["b_win_rate"] = results["model_b_wins"] / results["total"]




return results







Hallucination Detection





Automated hallucination checks verify factual accuracy:






class HallucinationDetector:


def __init__(self, knowledge_base: Callable):


self.kb = knowledge_base




async def check_factual_claims(self, output: str) -> List[dict]:


"""Extract factual claims and verify them against a knowledge base."""


# 1. Extract atomic claims


claims = await self._extract_claims(output)




verified_claims = []


for claim in claims:


# 2. Search knowledge base


evidence = await self.kb.search(claim["text"])




# 3. Verify claim against evidence


verification = await self._verify_claim(


claim["text"],


evidence,


claim["context"],


)




verified_claims.append({


"claim": claim["text"],


"confidence": verification["confidence"],


"supported": verification["supported"],


"evidence": evidence[:3],


"context": claim["context"],


})




return verified_claims




async def _verify_claim(


self,


claim: str,


evidence: List[str],


context: str,


) -> dict:


prompt = f"""


Claim: "{claim}"


Context: "{context}"


Evidence: {' '.join(evidence[:3])}




Is this claim supported by the evidence?


Respond with JSON:


{{"supported": bool, "confidence": 0.0-1.0, "reasoning": "..."}}


"""


response = await self._llm_call(prompt)


return json.loads(response)




def compute_hallucination_rate(


self, verified_claims: List[dict]


) -> float:


unsupported = sum(


1 for c in verified_claims if not c["supported"]


)


total = len(verified_claims)


return unsupported / total if total > 0 else 0.0







Prompt Testing





Version-control prompts with structured testing:






class PromptRegistry:


def __init__(self):


self.prompts = {}




def register(


self,


name: str,


template: str,


version: str,


tests: List[Callable] = None,


):


self.prompts[name] = {


"template": template,


"version": version,


"tests": tests or [],


"performance": [],


}




async def test_prompt(


self, name: str, test_cases: List[dict]


) -> dict:


prompt = self.prompts[name]


results = []




for case in test_cases:


# Render template with test inputs


rendered = prompt["template"].format(**case["inputs"])


output = await self._invoke(rendered)




# Run tests


test_results = [


test(output) for test in prompt["tests"]


]




results.append({


"case": case["name"],


"output": output,


"tests_passed": all(test_results),


"test_details": test_results,


})




return {


"prompt": name,


"version": prompt["version"],


"pass_rate": sum(r["tests_passed"] for r in results) / len(results),


"results": results,


}







Performance Testing





Benchmark latency, throughput, and cost across model versions:






class AIPerformanceTest:


async def benchmark(


self,


model: str,


concurrency: int = 10,


requests: int = 100,


) -> dict:


import time


import asyncio




semaphore = asyncio.Semaphore(concurrency)


latencies = []




async def single_request():


async with semaphore:


start = time.monotonic()


await self._invoke_model(model, "Test prompt")


latencies.append(time.monotonic() - start)




tasks = [single_request() for _ in range(requests)]


await asyncio.gather(*tasks)




latencies.sort()


return {


"model": model,


"p50": latencies[len(latencies) // 2],


"p95": latencies[int(len(latencies) * 0.95)],


"p99": latencies[int(len(latencies) * 0.99)],


"avg": sum(latencies) / len(latencies),


"throughput": requests / sum(latencies),


"total_cost": self._calculate_cost(model, requests),


}







A mature AI testing pipeline runs golden set regression on every PR, A/B evaluations before model upgrades, hallucination detection on every production response, and performance benchmarks weekly. No single metric captures model quality; combine automated tests with human evaluation for production releases.