Multi-Modal RAG: Images, Tables, Documents — Chunking and Retrieval


Introduction





Real-world documents contain more than text: images, charts, tables, and diagrams carry critical information that text-only RAG systems cannot access. Multi-modal RAG extends retrieval to include visual content, enabling questions like "What does the Q3 revenue chart show?" or "What values are in the configuration table?" This article covers the architectures and techniques for building multi-modal RAG.





Strategies for Multi-Modal RAG





There are three main approaches to handling non-text content:






# Strategy 1: Convert everything to text (simplest)


# Strategy 2: Embed images alongside text (moderate)


# Strategy 3: Multi-modal retrieval with specialized models (most powerful)







Strategy 1: Text Conversion





Convert images and tables to text using vision models or OCR:






from openai import OpenAI
import base64

client = OpenAI()


def describe_image(image_path: str) -> str:
    """Return a detailed textual description of the image at *image_path*.

    The image is base64-encoded and sent to a vision-capable chat model,
    which is prompted to transcribe all text, data points, and visual
    elements so the description is useful for text-based retrieval.
    """
    with open(image_path, "rb") as handle:
        encoded = base64.b64encode(handle.read()).decode("utf-8")

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in detail, including all text, data points, and visual elements.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{encoded}"},
                    },
                ],
            }
        ],
        max_tokens=1024,
    )
    return completion.choices[0].message.content




def convert_table_to_text(table_data: list[list[str]]) -> str:
    """Convert a parsed table to searchable text.

    Each data row is rendered as a comma-separated list of
    "header: cell" pairs, one row per line, so the table's contents
    can be embedded and retrieved like ordinary prose.

    Args:
        table_data: Rows of cells; the first row holds the headers.

    Returns:
        One line per data row ("h1: c1, h2: c2, ..."), or an empty
        string when there is no table or no data rows.
    """
    # Guard: the original indexed table_data[0] unconditionally and
    # raised IndexError on an empty table.
    if not table_data:
        return ""

    headers, *rows = table_data
    # zip() pairs each cell with its header and tolerates ragged rows:
    # extra cells without a matching header are dropped instead of
    # raising IndexError as headers[i] did.
    return "\n".join(
        ", ".join(f"{header}: {cell}" for header, cell in zip(headers, row))
        for row in rows
    )







Strategy 2: Multi-Vector Retriever





Store searchable text summaries in a vector store, each linked to its original raw element (text, image, or table) held in a separate document store:






from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.document import Document

# The vector store holds searchable text summaries; the doc store keeps
# the raw elements (text, images, tables) keyed by a shared doc_id.
vectorstore = Chroma(
    collection_name="multi_modal_docs",
    embedding_function=OpenAIEmbeddings(),
)
store = InMemoryStore()

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key="doc_id",
)

# Indexing recipe for each document element (text, image, table):
#   1. Generate a text summary.
#   2. Store the summary in the vector store.
#   3. Store the original element in the doc store.
#   4. Link the two with a shared doc_id.
doc_id = "doc_001_image_03"
summary = "Revenue chart showing Q1-Q4 2025: Q1=$1.2M, Q2=$1.5M, Q3=$1.8M, Q4=$2.1M"
original = Document(
    page_content="[IMAGE: revenue_chart_2025.png]",
    metadata={"type": "image", "path": "revenue_chart_2025.png", "doc_id": doc_id},
)

retriever.vectorstore.add_documents(
    [Document(page_content=summary, metadata={"doc_id": doc_id})]
)
retriever.docstore.mset([(doc_id, original)])







Strategy 3: Multi-Modal Embeddings





Use embedding models that handle both text and images in a shared space:






from sentence_transformers import SentenceTransformer
import torch
from PIL import Image


class MultiModalEmbedder:
    """Embed text and images into CLIP's shared vector space.

    Because both modalities land in the same space, a text query can be
    scored directly against image embeddings (and vice versa).
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        # CLIP checkpoint that encodes both text and images.
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text: str) -> list[float]:
        """Return the embedding of *text* as a plain list of floats."""
        return self.model.encode(text).tolist()

    def embed_image(self, image_path: str) -> list[float]:
        """Return the embedding of the image stored at *image_path*."""
        # Context manager closes the underlying file handle promptly
        # (Image.open is lazy and the original never closed it);
        # load() forces the pixel data into memory before the file closes.
        with Image.open(image_path) as image:
            image.load()
            return self.model.encode(image).tolist()

    def search_by_text(
        self, query: str, image_embeddings: list, top_k: int = 5
    ) -> tuple[list[int], list[float]]:
        """Rank *image_embeddings* by cosine similarity to *query*.

        Args:
            query: Natural-language search string.
            image_embeddings: List of image embedding vectors.
            top_k: Maximum number of results; clamped to the number of
                available embeddings (the original raised when top_k
                exceeded it).

        Returns:
            A ``(indices, scores)`` pair, best matches first.
        """
        query_emb = self.embed_text(query)
        scores = torch.cosine_similarity(
            torch.tensor(query_emb).unsqueeze(0),
            torch.tensor(image_embeddings),
        )
        # Clamp so topk() never requests more items than exist.
        k = min(top_k, scores.numel())
        top_indices = scores.topk(k).indices.tolist()
        return top_indices, scores[top_indices].tolist()







Chunking Strategies for Multi-Modal Data





Each content type needs a different chunking approach:






class MultiModalChunker:
    """Split documents into typed chunks (text, image, table) for indexing."""

    def chunk_pdf(self, pdf_path: str) -> list[dict]:
        """Extract and chunk text, images, and tables from a PDF.

        Args:
            pdf_path: Path of the PDF file to process.

        Returns:
            A list of chunk dicts. Text chunks carry the block's full
            text; image chunks carry the raw image bytes plus a
            placeholder content string. Both record the page number and
            bounding box for provenance.
        """
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        chunks = []

        for page_num, page in enumerate(doc):
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if block["type"] == 0:  # Text block
                    # Join every span of every line. The original read only
                    # block["lines"][0]["spans"][0]["text"], silently
                    # dropping all but the first fragment of the block.
                    text = "\n".join(
                        "".join(span["text"] for span in line["spans"])
                        for line in block["lines"]
                    )
                    chunks.append({
                        "type": "text",
                        "content": text,
                        "page": page_num,
                        "bbox": block["bbox"],
                    })
                elif block["type"] == 1:  # Image block
                    chunks.append({
                        "type": "image",
                        "content": f"[IMAGE: page_{page_num}_block_{block['number']}]",
                        "page": page_num,
                        "image_data": block["image"],
                        "bbox": block["bbox"],
                    })

        return chunks

    def chunk_table(self, table_df) -> dict:
        """Convert a table (pandas DataFrame expected) to a searchable format.

        Returns a dict with a short natural-language summary (good for
        embedding), a text rendering of the full table (good for handing
        to the LLM), and structural metadata.
        """
        summary = (
            f"Table with {len(table_df)} rows and "
            f"{len(table_df.columns)} columns: {', '.join(table_df.columns)}"
        )
        try:
            text_representation = table_df.to_markdown()
        except ImportError:
            # to_markdown() needs the optional `tabulate` dependency;
            # fall back to pandas' built-in plain-text rendering.
            text_representation = table_df.to_string()
        return {
            "type": "table",
            "summary": summary,
            "content": text_representation,
            "metadata": {"columns": list(table_df.columns), "rows": len(table_df)},
        }







Retrieval and Fusion





Query across all content types and fuse the results:






def multi_modal_retrieve(query: str, text_index, image_index, table_index, top_k: int = 3):
    """Search the text, image, and table indexes and fuse the hits.

    Each modality contributes up to *top_k* candidates. Results are
    tagged with their type, given a fixed type-level prior score (text
    is preferred slightly over images and tables), sorted by that
    score, and truncated to ``2 * top_k`` fused results.
    """
    fused = []

    # Text hits carry the highest prior.
    for doc in text_index.similarity_search(query, k=top_k):
        fused.append({"content": doc.page_content, "type": "text", "score": 1.0})

    # Image hits are retrieved via their text descriptions.
    for img in search_images(query, image_index, top_k):
        fused.append({
            "content": img["description"],
            "type": "image",
            "path": img["path"],
            "score": 0.9,
        })

    # Table hits: the summary is searchable, the raw data rides along.
    for tbl in search_tables(query, table_index, top_k):
        fused.append({
            "content": tbl["summary"],
            "type": "table",
            "data": tbl["content"],
            "score": 0.9,
        })

    fused.sort(key=lambda item: item["score"], reverse=True)
    return fused[: top_k * 2]







Conclusion





Multi-modal RAG extends retrieval to images, tables, and other visual content. The simplest approach converts non-text content to text descriptions using vision models. More sophisticated approaches use multi-vector retrievers or shared embedding spaces like CLIP. Choose your strategy based on the complexity of your visual content and the precision required for retrieval. Always evaluate retrieval quality separately for each modality.