LLM Version Management: Model Registry, A/B Testing, Rollback


Introduction





LLMs change frequently: new model releases, fine-tuned versions, updated system prompts, and modified retrieval pipelines all constitute "versions" of your AI system. Unlike traditional software where you can pin a dependency version, LLM behavior shifts with each API update. This article covers the tools and practices for managing LLM versions in production.
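Because a "version" is more than a model name, it helps to reduce everything that affects behavior to a single fingerprint. The sketch below is illustrative (the function name and fields are assumptions, not an established schema): it hashes the model identifier, sampling parameters, and system prompt into one version string.

import hashlib
import json

def version_fingerprint(model: str, parameters: dict, system_prompt: str) -> str:
    # Anything that changes behavior belongs in the fingerprint: the provider
    # model ID, sampling parameters, and the system prompt.
    payload = json.dumps(
        {"model": model, "parameters": parameters, "system_prompt": system_prompt},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode()).hexdigest()[:12]

# Two deployments with the same fingerprint should behave the same way;
# any prompt or parameter edit yields a new version.
print(version_fingerprint("claude-sonnet-4", {"temperature": 0.7}, "You are a helpful assistant."))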





Model Registry





A model registry tracks metadata for every deployed model version:






from datetime import datetime
from enum import Enum


class ModelStatus(Enum):
    STAGING = "staging"
    CANARY = "canary"
    PRODUCTION = "production"
    ROLLED_BACK = "rolled_back"
    DEPRECATED = "deprecated"


class ModelRegistry:
    def __init__(self, storage_backend):
        # storage_backend must expose save(key, value), load(key),
        # and load_all(pattern).
        self.storage = storage_backend

    def register_model(self, model_id: str, metadata: dict) -> dict:
        entry = {
            "model_id": model_id,
            "provider": metadata.get("provider"),
            "version": metadata.get("version"),
            "description": metadata.get("description"),
            "parameters": metadata.get("parameters", {}),
            "system_prompt_hash": metadata.get("system_prompt_hash"),
            "registered_at": datetime.now().isoformat(),
            "status": ModelStatus.STAGING.value,
            "evaluation_scores": {},
            "deployment_history": [],
        }
        self.storage.save(f"models/{model_id}", entry)
        return entry

    def promote(self, model_id: str, target_status: ModelStatus):
        entry = self.storage.load(f"models/{model_id}")
        entry["status"] = target_status.value
        entry["deployment_history"].append({
            "action": f"promoted_to_{target_status.value}",
            "timestamp": datetime.now().isoformat(),
        })
        self.storage.save(f"models/{model_id}", entry)

    def get_active_model(self) -> dict | None:
        """Get the current production model, or None if nothing is live."""
        all_models = self.storage.load_all("models/*")
        for model in sorted(all_models, key=lambda m: m["registered_at"], reverse=True):
            if model["status"] == ModelStatus.PRODUCTION.value:
                return model
        return None


# Usage -- redis_client is assumed to be wrapped so it exposes
# save/load/load_all (a minimal wrapper is sketched below)
registry = ModelRegistry(redis_client)
registry.register_model("claude-sonnet-v4-1", {
    "provider": "anthropic",
    "version": "claude-sonnet-4-20260512",
    "parameters": {"temperature": 0.7, "max_tokens": 4096},
})
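ModelRegistry assumes a storage backend exposing save, load, and load_all; a raw redis-py client does not provide those methods directly, so the usage above presumes a thin wrapper. One minimal sketch of such a wrapper (the class name and key handling are assumptions):

import json

class RedisStorageBackend:
    """Thin wrapper giving a redis-py client the interface ModelRegistry expects."""

    def __init__(self, redis_client):
        self.redis = redis_client

    def save(self, key: str, value: dict):
        self.redis.set(key, json.dumps(value))

    def load(self, key: str) -> dict:
        return json.loads(self.redis.get(key))

    def load_all(self, pattern: str) -> list[dict]:
        # KEYS is fine for a small registry; prefer SCAN at larger scale.
        return [json.loads(self.redis.get(k)) for k in self.redis.keys(pattern)]

With this wrapper, the usage above becomes registry = ModelRegistry(RedisStorageBackend(redis_client)).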







A/B Testing Framework





Compare model versions on live traffic. The framework below assigns users deterministically to a variant and tracks errors and latency per variant; a statistical-significance check on the results is sketched after it:






import hashlib
from datetime import datetime


class ModelABTest:
    def __init__(self, registry: ModelRegistry):
        self.registry = registry
        self.experiments = {}

    def start_experiment(self, name: str, model_a: str, model_b: str, traffic_split: float = 0.5):
        self.experiments[name] = {
            "model_a": model_a,
            "model_b": model_b,
            "traffic_split": traffic_split,
            "started_at": datetime.now().isoformat(),
            "results": {
                "a": {"calls": 0, "errors": 0, "latency_ms": []},
                "b": {"calls": 0, "errors": 0, "latency_ms": []},
            },
        }

    def select_model(self, experiment: str, user_id: str) -> tuple[str, str]:
        """Return (model_id, variant) for a user."""
        exp = self.experiments[experiment]
        # Deterministic assignment based on a hash of experiment + user_id,
        # so the same user always sees the same variant.
        hash_val = int(hashlib.md5(f"{experiment}:{user_id}".encode()).hexdigest(), 16)
        if (hash_val % 1000) / 1000 < exp["traffic_split"]:
            return exp["model_a"], "a"
        return exp["model_b"], "b"

    def record_result(self, experiment: str, variant: str, latency_ms: float, error: bool = False):
        exp = self.experiments[experiment]
        exp["results"][variant]["calls"] += 1
        exp["results"][variant]["latency_ms"].append(latency_ms)
        if error:
            exp["results"][variant]["errors"] += 1

    def get_winner(self, experiment: str) -> str | None:
        exp = self.experiments[experiment]
        results = exp["results"]
        if results["a"]["calls"] < 100 or results["b"]["calls"] < 100:
            return None  # Not enough data

        error_rate_a = results["a"]["errors"] / results["a"]["calls"]
        error_rate_b = results["b"]["errors"] / results["b"]["calls"]

        # Simple decision: lower error rate wins
        if error_rate_a < error_rate_b:
            return exp["model_a"]
        return exp["model_b"]
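The get_winner rule above simply picks the lower error rate once each variant has 100 calls. To actually claim statistical significance, one common choice is a two-proportion z-test on the error rates; the helper below is an illustrative sketch using only the standard library, with a conventional 95% threshold.

import math

def significant_difference(errors_a: int, calls_a: int,
                           errors_b: int, calls_b: int,
                           z_critical: float = 1.96) -> bool:
    """Two-proportion z-test: is the error-rate gap unlikely to be noise?"""
    p_a, p_b = errors_a / calls_a, errors_b / calls_b
    p_pool = (errors_a + errors_b) / (calls_a + calls_b)
    se = math.sqrt(p_pool * (1 - p_pool) * (1 / calls_a + 1 / calls_b))
    if se == 0:
        return False
    return abs(p_a - p_b) / se > z_critical

# Example: declare a winner only when the difference clears the threshold.
print(significant_difference(errors_a=12, calls_a=500, errors_b=30, calls_b=500))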







Gradual Rollout





Deploy new models incrementally with automatic rollback:






import asyncio


class GradualRollout:
    def __init__(self, registry: ModelRegistry, evaluation_fn):
        self.registry = registry
        self.evaluate = evaluation_fn

    async def deploy(self, model_id: str, stages: list[dict] | None = None) -> bool:
        if stages is None:
            stages = [
                {"name": "canary", "traffic": 0.01, "duration_min": 30, "eval_threshold": 0.9},
                {"name": "small", "traffic": 0.10, "duration_min": 120, "eval_threshold": 0.95},
                {"name": "medium", "traffic": 0.25, "duration_min": 360, "eval_threshold": 0.95},
                {"name": "large", "traffic": 0.50, "duration_min": 720, "eval_threshold": 0.95},
                {"name": "full", "traffic": 1.00, "duration_min": 0, "eval_threshold": 0.0},
            ]

        for stage in stages:
            print(f"Deploying to stage: {stage['name']} ({stage['traffic'] * 100}% traffic)")
            self.registry.promote(model_id, ModelStatus.CANARY)
            await self._route_traffic(model_id, stage["traffic"])

            # Wait for the evaluation period
            await asyncio.sleep(stage["duration_min"] * 60)

            # Evaluate performance at this traffic level
            scores = await self.evaluate(model_id)
            if scores.get("overall", 0) < stage["eval_threshold"]:
                print(f"Stage {stage['name']} failed evaluation. Rolling back.")
                await self.rollback(model_id)
                return False

        self.registry.promote(model_id, ModelStatus.PRODUCTION)
        return True

    async def rollback(self, model_id: str):
        # The new model is still in CANARY at this point, so get_active_model()
        # returns the previous production model (assumed to exist).
        previous_model = self.registry.get_active_model()
        self.registry.promote(previous_model["model_id"], ModelStatus.PRODUCTION)
        self.registry.promote(model_id, ModelStatus.ROLLED_BACK)
        print(f"Rolled back to {previous_model['model_id']}")

    async def _route_traffic(self, model_id: str, fraction: float):
        # Placeholder: hook this into your traffic router or feature-flag
        # system so `fraction` of requests are served by `model_id`.
        ...







Evaluation Gate





Automated evaluation runs before any model promotion:






import difflib


class EvaluationGate:
    def __init__(self, test_suite: list[dict]):
        self.test_suite = test_suite  # [{input, expected_output, threshold}]

    async def evaluate_model(self, model_fn) -> dict:
        results = {"passed": 0, "failed": 0, "details": []}

        for test in self.test_suite:
            output = await model_fn(test["input"])
            score = self._score_output(output, test)
            if score >= test.get("threshold", 0.8):
                results["passed"] += 1
            else:
                results["failed"] += 1
                results["details"].append({"test": test["input"], "score": score})

        results["pass_rate"] = results["passed"] / len(self.test_suite)
        return results

    def _score_output(self, output: str, test: dict) -> float:
        if "expected_output" in test:
            return self._similarity(output, test["expected_output"])
        return 0.0

    def _similarity(self, output: str, expected: str) -> float:
        # Simple character-level similarity as a stand-in; swap in an
        # embedding-based or LLM-judge scorer for production use.
        return difflib.SequenceMatcher(None, output, expected).ratio()
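To tie the pieces together, the gate's pass rate can serve as the evaluation_fn that GradualRollout expects. The wiring below is a sketch: call_model and the test case are placeholders, not part of the article's code.

async def call_model(prompt: str) -> str:
    # Placeholder: replace with a real call to the candidate model.
    return "The cat sat on the mat."

gate = EvaluationGate([
    {"input": "Repeat: the cat sat on the mat.",
     "expected_output": "The cat sat on the mat.", "threshold": 0.8},
])

async def evaluation_fn(model_id: str) -> dict:
    results = await gate.evaluate_model(call_model)
    return {"overall": results["pass_rate"]}

# rollout = GradualRollout(registry, evaluation_fn)
# asyncio.run(rollout.deploy("claude-sonnet-v4-1"))  # sleeps through each stage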







Conclusion





Treat LLM versions as managed artifacts, not API parameters. Use a model registry to track every version with its metadata, system prompt, and evaluation scores. Implement A/B testing to compare versions on live traffic. Deploy new models gradually through canary, small, medium, large, and full rollout stages, with automated evaluation gates at each stage. Maintain the ability to roll back instantly when metrics degrade. This discipline makes LLM version management as reliable as traditional software deployment.