LLM Caching: Semantic Cache, Exact Match, TTL, Invalidation Strategies
Introduction
LLM API calls are expensive in both money and latency. Caching previously generated responses can reduce costs by 20-80% depending on the application. Unlike traditional HTTP caching, where exact URL matching suffices, LLM caching must handle semantically equivalent but textually different queries. This article covers caching strategies from simple exact match to sophisticated semantic caching.
Exact Match Cache
The simplest cache assumes that identical inputs produce identical outputs, which holds for deterministic settings such as temperature 0:
import hashlib
import json
import time

class ExactMatchCache:
    def __init__(self, ttl_seconds: int = 3600):
        self.cache = {}
        self.ttl = ttl_seconds

    def _make_key(self, messages: list[dict], model: str, params: dict) -> str:
        # Canonicalize the request so key order and whitespace
        # differences don't produce distinct keys.
        canonical = json.dumps({
            "messages": messages,
            "model": model,
            "temperature": params.get("temperature", 0),
            "max_tokens": params.get("max_tokens"),
        }, sort_keys=True)
        return hashlib.sha256(canonical.encode()).hexdigest()

    def get(self, messages: list[dict], model: str, params: dict) -> str | None:
        key = self._make_key(messages, model, params)
        if key in self.cache:
            entry = self.cache[key]
            if time.time() - entry["timestamp"] < self.ttl:
                return entry["response"]
            del self.cache[key]  # expired: drop the stale entry
        return None

    def set(self, messages: list[dict], model: str, params: dict, response: str):
        key = self._make_key(messages, model, params)
        self.cache[key] = {"response": response, "timestamp": time.time()}
Exact match works well when identical questions recur: FAQs, repeated classification tasks, or template-based prompts where only parameters change.
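For example, a minimal usage sketch; call_llm and the model name here are hypothetical placeholders for your actual API client:

cache = ExactMatchCache(ttl_seconds=3600)

messages = [{"role": "user", "content": "Classify the sentiment: 'Great product!'"}]
params = {"temperature": 0, "max_tokens": 10}

response = cache.get(messages, "my-model", params)
if response is None:
    response = call_llm(messages, "my-model", params)  # hypothetical API client
    cache.set(messages, "my-model", params, response)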
Semantic Cache
Semantic caching returns cached responses for semantically equivalent questions:
import time

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SemanticCache:
    def __init__(self, embedding_fn, similarity_threshold: float = 0.92):
        self.embedding = embedding_fn
        self.threshold = similarity_threshold
        self.cache_entries: list[dict] = []

    def get(self, query: str) -> str | None:
        query_emb = np.asarray(self.embedding(query)).reshape(1, -1)
        # Linear scan: fine for small caches. For large caches, swap in an
        # approximate nearest-neighbor index or a vector database.
        for entry in self.cache_entries:
            similarity = cosine_similarity(query_emb, [entry["embedding"]])[0][0]
            if similarity >= self.threshold:
                entry["access_count"] += 1
                entry["last_accessed"] = time.time()
                return entry["response"]
        return None

    def set(self, query: str, response: str):
        entry = {
            "query": query,
            "embedding": self.embedding(query),
            "response": response,
            "created_at": time.time(),
            "access_count": 1,
            "last_accessed": time.time(),
        }
        self.cache_entries.append(entry)

    def evict_lru(self, max_entries: int = 10000):
        # Keep only the most recently accessed entries.
        if len(self.cache_entries) > max_entries:
            self.cache_entries.sort(key=lambda e: e["last_accessed"])
            self.cache_entries = self.cache_entries[-max_entries:]
The similarity threshold controls the precision-recall tradeoff. A threshold of 0.95 is safe but rarely matches. A threshold of 0.85 catches more queries but risks returning irrelevant cached responses. Test on your specific query distribution.
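One way to run that test: a minimal sketch that sweeps candidate thresholds over hand-labeled query pairs. Here my_embed_fn and labeled_pairs are assumptions you supply, not library objects, and np is the numpy import from above:

def evaluate_threshold(embed, pairs, threshold):
    """pairs: list of (query_a, query_b, should_match) hand-labeled tuples."""
    tp = fp = fn = 0
    for a, b, should_match in pairs:
        ea, eb = np.asarray(embed(a)), np.asarray(embed(b))
        sim = float(np.dot(ea, eb) / (np.linalg.norm(ea) * np.linalg.norm(eb)))
        predicted = sim >= threshold
        if predicted and should_match:
            tp += 1
        elif predicted and not should_match:
            fp += 1
        elif not predicted and should_match:
            fn += 1
    precision = tp / (tp + fp) if tp + fp else 1.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return precision, recall

for t in (0.85, 0.90, 0.92, 0.95):
    p, r = evaluate_threshold(my_embed_fn, labeled_pairs, t)
    print(f"threshold={t}: precision={p:.2f}, recall={r:.2f}")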
Two-Level Cache
Combine both strategies for maximum coverage:
class TwoLevelLLMCache:
    def __init__(self, embedding_fn):
        self.exact = ExactMatchCache(ttl_seconds=7200)
        self.semantic = SemanticCache(embedding_fn, similarity_threshold=0.92)

    @staticmethod
    def _get_last_user_message(messages: list[dict]) -> str | None:
        # Walk the conversation backwards to find the newest user turn.
        for msg in reversed(messages):
            if msg.get("role") == "user":
                return msg.get("content")
        return None

    def get(self, messages: list[dict], model: str, params: dict) -> str | None:
        # Try exact match first (fast, no embedding computation).
        exact_result = self.exact.get(messages, model, params)
        if exact_result is not None:
            return exact_result
        # Fall back to a semantic match on the last user message.
        last_user_msg = self._get_last_user_message(messages)
        if last_user_msg:
            return self.semantic.get(last_user_msg)
        return None

    def set(self, messages: list[dict], model: str, params: dict, response: str):
        self.exact.set(messages, model, params, response)
        last_user_msg = self._get_last_user_message(messages)
        if last_user_msg:
            self.semantic.set(last_user_msg, response)
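A quick usage sketch; toy_embed is an illustration-only placeholder, and in practice you would call a real embedding model:

def toy_embed(text: str) -> list[float]:
    # Toy embedding for illustration only; substitute a real model.
    return [float(text.lower().count(c)) for c in "abcdefghij"]

cache = TwoLevelLLMCache(embedding_fn=toy_embed)
msgs = [{"role": "user", "content": "What is your refund policy?"}]
cache.set(msgs, "my-model", {"temperature": 0}, "Refunds within 30 days.")

# Identical request: served by the exact-match level.
assert cache.get(msgs, "my-model", {"temperature": 0}) is not None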
TTL and Invalidation
Different cache entries need different expiration policies:
class TTLManager:
    def __init__(self):
        # Different TTLs for different content types.
        self.ttl_config = {
            "classification": 86400 * 30,  # 30 days (stable task)
            "extraction": 86400 * 7,       # 7 days
            "summarization": 3600,         # 1 hour (source content changes)
            "generation": 300,             # 5 minutes (creative output; rarely worth caching)
            "factual_qa": 86400,           # 1 day (answers may become stale)
        }

    def get_ttl(self, task_type: str) -> int:
        return self.ttl_config.get(task_type, 3600)

    def invalidate_by_prefix(self, prefix: str):
        """Invalidate cache entries matching a prefix pattern.

        Used when source documents are updated. Because exact-match keys
        are hashes, pattern matching requires storing a plain-text tag
        (e.g., a document ID) alongside each entry.
        """
        pass

    def invalidate_all(self):
        """Emergency invalidation: flush the entire cache."""
        pass
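Acting on these TTLs requires per-entry expiry in the cache layer. One hedged sketch extends the earlier ExactMatchCache so that set accepts an optional per-entry TTL; this subclass is an assumption, not part of the earlier code:

class PerEntryTTLCache(ExactMatchCache):
    def set(self, messages, model, params, response, ttl: int | None = None):
        key = self._make_key(messages, model, params)
        self.cache[key] = {
            "response": response,
            "timestamp": time.time(),
            "ttl": ttl if ttl is not None else self.ttl,  # per-entry override
        }

    def get(self, messages, model, params):
        key = self._make_key(messages, model, params)
        entry = self.cache.get(key)
        if entry is None:
            return None
        if time.time() - entry["timestamp"] < entry["ttl"]:
            return entry["response"]
        del self.cache[key]  # expired
        return None

A caller would then pass ttl_manager.get_ttl(task_type) into set so that content type drives expiry.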
Cache-Aware Application Design
Structure your application to maximize cache hits:
class CacheAwareLLMClient:
    def __init__(self, cache: TwoLevelLLMCache, llm_fn, ttl_manager: TTLManager):
        self.cache = cache
        self.llm = llm_fn
        self.ttls = ttl_manager

    async def generate(self, task_type: str, messages: list[dict], model: str, params: dict) -> str:
        # Check the cache first.
        cached = self.cache.get(messages, model, params)
        if cached is not None:
            return cached
        # Generate a fresh response.
        response = await self.llm(messages, model, params)
        # Cache it; with a per-entry-TTL cache, pass
        # self.ttls.get_ttl(task_type) here so the task type drives expiry.
        self.cache.set(messages, model, params, response)
        return response
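Usage, reusing toy_embed from above and stubbing the model call:

import asyncio

async def stub_llm(messages, model, params) -> str:
    return "stubbed response"  # stand-in for a real API call

async def main():
    client = CacheAwareLLMClient(TwoLevelLLMCache(toy_embed), stub_llm, TTLManager())
    msgs = [{"role": "user", "content": "What is your refund policy?"}]
    first = await client.generate("factual_qa", msgs, "my-model", {"temperature": 0})
    second = await client.generate("factual_qa", msgs, "my-model", {"temperature": 0})
    assert first == second  # the second call is served from the cache

asyncio.run(main())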
Cache Hit Rate Optimization
class CacheAnalytics:
    """Monitor cache effectiveness and tune the semantic threshold."""

    def __init__(self, semantic_cache: SemanticCache):
        self.semantic = semantic_cache
        self.metrics = {"exact_hits": 0, "semantic_hits": 0, "misses": 0}

    def record(self, outcome: str):
        # outcome is one of "exact_hits", "semantic_hits", "misses".
        self.metrics[outcome] += 1

    def hit_rate(self) -> float:
        total = sum(self.metrics.values())
        hits = self.metrics["exact_hits"] + self.metrics["semantic_hits"]
        return hits / total if total else 0.0

    def tune_threshold(self, user_feedback_score: float):
        # If users rate semantically matched responses poorly, raise the
        # threshold to be more conservative about serving them.
        if self.metrics["semantic_hits"] > 0 and user_feedback_score < 0.8:
            self.semantic.threshold = min(self.semantic.threshold + 0.02, 0.99)
Conclusion
LLM caching significantly reduces costs and latency. Use exact-match caching for deterministic tasks with identical inputs. Add semantic caching with embedding similarity for paraphrased questions. Implement two-level caching with exact match first (fast) then semantic match (broader). Configure TTLs based on content stability: long TTLs for classification tasks, short TTLs for generation. Monitor cache hit rates and user satisfaction to tune similarity thresholds. A well-tuned cache typically achieves 30-60% hit rates in production.