Real-world documents contain more than just text: images, tables, charts, diagrams, and complex layouts carry critical information. Multimodal RAG extends traditional text-based retrieval to handle these diverse content types, enabling AI systems to reason over complete documents as humans do.
Traditional text-only RAG systems face significant limitations when dealing with real-world documents:
| Document Type | Content | Text-Only Problem |
|---|---|---|
| Financial Report | Revenue chart | Cannot extract trends from visualization |
| Technical Docs | Architecture diagram | Component relationships lost |
| Research Paper | Complex tables | Table structure corrupted |
| Product Catalog | Product images | Visual attributes invisible |
| Medical Records | X-rays, scans | Diagnostic images ignored |
┌─────────────────────────────────────────────────────────────────────────┐
│ Multimodal RAG Pipeline │
├─────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ ┌──────────────┐ ┌─────────────────────────────┐ │
│ │ Document │ │ Multimodal │ │ Unified Vector Store │ │
│ │ Ingestion │───▶│ Processing │───▶│ Text + Image + Table Embs │ │
│ └─────────────┘ └──────────────┘ └─────────────────────────────┘ │
│ │ │ │ │
│ │ ┌──────┴──────┐ │ │
│ │ │ │ │ │
│ │ ┌─────▼─────┐ ┌─────▼─────┐ │ │
│ │ │ Vision │ │ Text │ │ │
│ │ │ Encoder │ │ Encoder │ │ │
│ │ └───────────┘ └───────────┘ │ │
│ │ │ │
│ ┌─────▼─────────────────────────────────────────────▼─────────────┐ │
│ │ Query Processing │ │
│ │ Text Query → Text Embedding ──┐ │ │
│ │ Image Query → Vision Embedding ├── Cross-Modal Retrieval │ │
│ │ Mixed Query → Unified Embedding┘ │ │
│ └───────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌───────────────────────────────────────────────────────────────────┐ │
│ │ Multimodal LLM (GPT-4V, Claude 3, Gemini) │ │
│ │ Generates Response from All Modalities │ │
│ └───────────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────────────┘

Convert everything to text, then use standard text RAG:
from openai import OpenAI
import base64

# Module-level OpenAI client shared by the helper functions below.
client = OpenAI()
def extract_image_description(image_path: str) -> str:
    """Use a vision model to produce a retrieval-friendly description of an image.

    Args:
        image_path: Path to the image file on disk. The data URL below
            hard-codes ``image/png`` — assumes callers pass PNG files (as
            process_pdf_with_images does); TODO confirm for other formats.

    Returns:
        The model-generated textual description of the image.
    """
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": """Describe this image in detail for document retrieval purposes.
Include:
- All visible text
- Data in charts/tables (with numbers)
- Diagram structure and relationships
- Key visual elements"""
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_data}"}
                }
            ]
        }],
        max_tokens=1000
    )
    return response.choices[0].message.content
def process_pdf_with_images(pdf_path: str) -> list[dict]:
"""Process PDF extracting both text and image descriptions."""
import fitz # PyMuPDF
doc = fitz.open(pdf_path)
chunks = []
for page_num, page in enumerate(doc):
# Extract text
text = page.get_text()
if text.strip():
chunks.append({
"type": "text",
"content": text,
"page": page_num,
"source": pdf_path
})
# Extract and describe images
for img_idx, img in enumerate(page.get_images()):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
# Save temporarily and describe
img_path = f"/tmp/page{page_num}_img{img_idx}.png"
pix.save(img_path)
description = extract_image_description(img_path)
chunks.append({
"type": "image",
"content": description,
"page": page_num,
"image_path": img_path,
"source": pdf_path
})
return chunksUse models that embed text and images into the same vector space:
from sentence_transformers import SentenceTransformer
from PIL import Image
import numpy as np

# CLIP embeds text and images in shared space
# NOTE(review): this module-level model appears illustrative only — the
# MultimodalRetriever class below loads its own instance; confirm before removing.
model = SentenceTransformer('clip-ViT-B-32')
class MultimodalRetriever:
    """Index text and images in a shared CLIP embedding space and search across both."""

    def __init__(self, model_name: str = 'clip-ViT-B-32'):
        self.model = SentenceTransformer(model_name)
        self.embeddings = []  # parallel list of embedding vectors
        self.documents = []   # parallel list of document dicts

    def add_text(self, text: str, metadata: dict = None):
        """Embed and index a text document."""
        embedding = self.model.encode(text)
        self.embeddings.append(embedding)
        self.documents.append({
            "type": "text",
            "content": text,
            "metadata": metadata or {}
        })

    def add_image(self, image_path: str, metadata: dict = None):
        """Embed and index an image file."""
        image = Image.open(image_path)
        embedding = self.model.encode(image)
        self.embeddings.append(embedding)
        self.documents.append({
            "type": "image",
            "path": image_path,
            "metadata": metadata or {}
        })

    def search(self, query, k: int = 5) -> list[dict]:
        """Search with a text (str) or image (PIL.Image) query.

        Returns:
            Up to k documents, each with an added "score" key
            (dot-product similarity, highest first).
        """
        if not self.embeddings:
            return []  # empty index previously crashed in np.dot
        # CLIP's encode() accepts both strings and PIL images, so the
        # original isinstance branch (both arms identical) was removed.
        query_embedding = self.model.encode(query)
        similarities = np.dot(np.array(self.embeddings), query_embedding)
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [
            {**self.documents[i], "score": float(similarities[i])}
            for i in top_indices
        ]
# Usage
retriever = MultimodalRetriever()
# Index mixed content
retriever.add_text("Machine learning uses neural networks for pattern recognition")
retriever.add_image("architecture_diagram.png", {"page": 5})
retriever.add_text("The system processes 1000 requests per second")
retriever.add_image("performance_chart.png", {"page": 12})
# Search with text - finds both text and relevant images
results = retriever.search("system architecture overview", k=5)
# Search with image - finds similar images and related text
query_image = Image.open("query_diagram.png")
results = retriever.search(query_image, k=5)ColPali takes a revolutionary approach: instead of extracting text from documents, it embeds the entire document page as an image, preserving layout, tables, and visual structure.
Traditional Pipeline:
PDF → OCR → Text Extraction → Chunking → Text Embedding → Vector DB
Problems:
- OCR errors corrupt text
- Table structure lost
- Layout semantics destroyed
- Expensive multi-step pipeline
ColPali Pipeline:
PDF → Render Page as Image → Vision Encoder → Vector DB
Benefits:
- Zero OCR errors
- Layout preserved perfectly
- Tables and charts native
- Single-step embedding

# ColPali for document retrieval
from colpali_engine import ColPali, ColPaliProcessor
from pdf2image import convert_from_path
import torch
class ColPaliRetriever:
    """Retrieve document pages by embedding each page image with ColPali.

    Pages are stored as multi-vector (patch) embeddings and scored against
    text queries with ColBERT-style late interaction (MaxSim).
    """

    def __init__(self, model_name: str = "vidore/colpali-v1.2", device: str = "cuda"):
        # `device` generalizes the previously hard-coded "cuda" target;
        # the default preserves the original behavior.
        self.device = device
        self.model = ColPali.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map=device
        )
        self.processor = ColPaliProcessor.from_pretrained(model_name)
        self.index = []

    def index_pdf(self, pdf_path: str):
        """Index a PDF by embedding each rendered page as an image."""
        pages = convert_from_path(pdf_path, dpi=144)
        for page_num, page_image in enumerate(pages):
            inputs = self.processor(images=[page_image], return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            # The model emits one embedding per image patch (for late
            # interaction), not a single pooled vector.
            with torch.no_grad():
                embeddings = self.model(**inputs)
            self.index.append({
                "embeddings": embeddings.cpu(),
                "page": page_num,
                "source": pdf_path,
                "image": page_image
            })

    def search(self, query: str, k: int = 5) -> list[dict]:
        """Score a text query against all indexed pages; returns top-k with "score"."""
        query_inputs = self.processor(text=[query], return_tensors="pt")
        query_inputs = {k: v.to(self.device) for k, v in query_inputs.items()}
        with torch.no_grad():
            query_embeddings = self.model(**query_inputs)
        # Late interaction scoring (MaxSim) against every indexed page.
        scores = [
            self._maxsim(query_embeddings, doc["embeddings"].to(self.device))
            for doc in self.index
        ]
        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
        return [
            {**self.index[i], "score": scores[i]}
            for i in top_indices
        ]

    def _maxsim(self, query_embs, doc_embs) -> float:
        """ColBERT MaxSim: sum over query tokens of the max patch similarity."""
        # [query_tokens, dim] @ [dim, doc_patches] -> [query_tokens, doc_patches]
        similarities = torch.matmul(query_embs, doc_embs.T)
        # For each query token, keep its best-matching document patch...
        max_sims = similarities.max(dim=-1).values
        # ...and sum over query tokens.
        return max_sims.sum().item()
# Usage
retriever = ColPaliRetriever()
# Index documents (preserves tables, charts, layout)
retriever.index_pdf("financial_report.pdf")
retriever.index_pdf("technical_documentation.pdf")
# Search - works on visual content!
results = retriever.search("Q3 revenue growth chart", k=3)
results = retriever.search("system architecture diagram", k=3)
results = retriever.search("comparison table of features", k=3)| Aspect | OCR + Text Embedding | ColPali |
|---|---|---|
| Table Handling | Structure often corrupted | Perfect preservation |
| Charts/Diagrams | Information lost | Fully searchable |
| Layout | Lost in text extraction | Preserved as visual |
| Processing Speed | Slow (OCR + chunking) | Fast (single pass) |
| Accuracy | OCR errors propagate | No OCR needed |
| Storage | Text chunks | Patch embeddings (larger) |
Tables are notoriously difficult for text-based RAG. Here are specialized approaches:
import pandas as pd
from unstructured.partition.pdf import partition_pdf
def extract_tables_from_pdf(pdf_path: str) -> list[dict]:
    """Extract tables from a PDF with their structure preserved.

    Returns:
        One dict per detected table with keys "html", "text",
        "markdown", and "page".
    """
    elements = partition_pdf(
        pdf_path,
        strategy="hi_res",  # vision-model layout analysis improves table detection
        infer_table_structure=True
    )
    tables = []
    for element in elements:
        if element.category == "Table":
            # Keep several representations so callers can pick the best
            # one for embedding or display.
            tables.append({
                "html": element.metadata.text_as_html,
                "text": element.text,
                # NOTE(review): html_table_to_markdown is not defined in this
                # file — confirm it is provided elsewhere.
                "markdown": html_table_to_markdown(element.metadata.text_as_html),
                "page": element.metadata.page_number
            })
    return tables
def create_table_embedding(table: dict, strategy: str = "combined") -> str:
"""Create embeddable representation of a table."""
if strategy == "description":
# Use LLM to describe the table
return describe_table_with_llm(table["html"])
elif strategy == "markdown":
# Embed markdown representation
return table["markdown"]
elif strategy == "combined":
# Combine description with data
description = describe_table_with_llm(table["html"])
return f"{description}\n\nTable Data:\n{table['markdown']}"
elif strategy == "row_wise":
# Create separate embeddings for each row
df = pd.read_html(table["html"])[0]
rows = []
headers = df.columns.tolist()
for _, row in df.iterrows():
row_text = " | ".join([f"{h}: {v}" for h, v in zip(headers, row)])
rows.append(row_text)
return rows # Return list for multiple embeddingsdef table_qa_rag(query: str, tables: list[dict], llm) -> str:
"""RAG specifically for table-based questions."""
# First, find relevant tables
relevant_tables = retrieve_relevant_tables(query, tables, k=3)
# Format tables for LLM
table_context = ""
for i, table in enumerate(relevant_tables):
table_context += f"\nTable {i+1}:\n{table['markdown']}\n"
# Specialized prompt for table reasoning
prompt = f"""You are analyzing data tables to answer a question.
Tables:
{table_context}
Question: {query}
Instructions:
- Refer to specific cells when citing data
- If calculations are needed, show your work
- If the answer isn't in the tables, say so
Answer:"""
return llm.invoke(prompt).contentModern VLMs (GPT-4V, Claude 3, Gemini) can directly process images in the generation step:
import base64
from openai import OpenAI
class VisionRAG:
    """RAG system that passes retrieved images directly to a vision LLM.

    NOTE(review): TextRetriever and ImageRetriever are not defined in this
    file — confirm they are provided elsewhere.
    """

    def __init__(self):
        self.client = OpenAI()
        self.text_retriever = TextRetriever()
        self.image_retriever = ImageRetriever()

    def query(self, user_query: str) -> dict:
        """Answer a query using retrieved text plus retrieved images.

        Returns:
            Dict with "answer", "text_sources", and "image_sources".
        """
        # Retrieve context from both modalities.
        text_results = self.text_retriever.search(user_query, k=5)
        image_results = self.image_retriever.search(user_query, k=3)

        # Build the multimodal prompt: question, text context, then images.
        content = [{"type": "text", "text": f"Answer this question: {user_query}\n\nContext:"}]
        text_context = "\n\n".join([r["text"] for r in text_results])
        content.append({"type": "text", "text": text_context})

        content.append({"type": "text", "text": "\n\nRelevant images:"})
        for img_result in image_results:
            with open(img_result["path"], "rb") as f:
                img_data = base64.b64encode(f.read()).decode()
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_data}"}
            })
        content.append({
            "type": "text",
            "text": "\n\nBased on the text and images above, provide a comprehensive answer."
        })

        # Generate with a vision-capable model.
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=1000
        )
        return {
            "answer": response.choices[0].message.content,
            "text_sources": text_results,
            "image_sources": image_results
        }
# Usage
rag = VisionRAG()
result = rag.query("What does the system architecture look like and how does data flow?")
# The VLM can now SEE the architecture diagram and describe it!| Model | Modalities | Best For | Notes |
|---|---|---|---|
| CLIP (OpenAI) | Text + Image | General images | Widely used, good baseline |
| SigLIP | Text + Image | Better accuracy | Improved CLIP training |
| ColPali | Text + Document Images | PDFs, documents | Late interaction, no OCR |
| Jina CLIP v2 | Text + Image | Multilingual | 89 languages supported |
| Nomic Embed Vision | Text + Image | Screenshots | Good for UI/UX content |
| ImageBind (Meta) | 6 modalities | Audio, video, etc. | Text, image, audio, depth, thermal, IMU |
from dataclasses import dataclass
from enum import Enum
from typing import Union
import asyncio
class ContentType(Enum):
TEXT = "text"
IMAGE = "image"
TABLE = "table"
CHART = "chart"
DIAGRAM = "diagram"
@dataclass
class MultimodalChunk:
id: str
content_type: ContentType
text_content: str | None
image_path: str | None
embedding: list[float]
metadata: dict
class ProductionMultimodalRAG:
    """Production-ready multimodal RAG system.

    Wires together pluggable embedders, a vector store, and a
    vision-language model client.

    NOTE(review): several private helpers referenced below
    (_process_image, _process_text, _create_table_chunk, _create_image_chunk,
    _create_text_chunk, _generate_text_only, _create_image_content) are not
    defined in this excerpt — confirm they exist elsewhere.
    """

    def __init__(
        self,
        text_embedder,
        vision_embedder,  # CLIP or ColPali
        vector_store,
        vlm_client,  # GPT-4V, Claude 3, etc.
    ):
        self.text_embedder = text_embedder
        self.vision_embedder = vision_embedder
        self.vector_store = vector_store
        self.vlm = vlm_client

    async def ingest_document(self, doc_path: str) -> list[str]:
        """Ingest a document of any supported type; returns the stored chunk ids."""
        # Route by file extension.
        if doc_path.endswith('.pdf'):
            chunks = await self._process_pdf(doc_path)
        elif doc_path.endswith(('.png', '.jpg', '.jpeg')):
            chunks = await self._process_image(doc_path)
        else:
            chunks = await self._process_text(doc_path)

        # Persist every chunk in the vector store.
        chunk_ids = []
        for chunk in chunks:
            self.vector_store.upsert(
                id=chunk.id,
                embedding=chunk.embedding,
                metadata={
                    "content_type": chunk.content_type.value,
                    "text": chunk.text_content,
                    "image_path": chunk.image_path,
                    **chunk.metadata
                }
            )
            chunk_ids.append(chunk.id)
        return chunk_ids

    async def _process_pdf(self, pdf_path: str) -> "list[MultimodalChunk]":
        """Split a PDF into text, table, and image chunks."""
        chunks = []
        # Use unstructured for comprehensive extraction.
        from unstructured.partition.pdf import partition_pdf
        elements = partition_pdf(
            pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True
        )
        for i, element in enumerate(elements):
            if element.category == "Table":
                # Tables get both textual and visual representations.
                chunk = await self._create_table_chunk(element, i, pdf_path)
            elif element.category == "Image":
                chunk = await self._create_image_chunk(element, i, pdf_path)
            else:
                chunk = await self._create_text_chunk(element, i, pdf_path)
            chunks.append(chunk)
        return chunks

    async def query(
        self,
        user_query: str,
        include_images: bool = True,
        k: int = 10
    ) -> dict:
        """Query across all modalities; returns {"answer", "sources"}."""
        # A text embedding of the query drives retrieval for every modality.
        query_embedding = self.text_embedder.encode(user_query)
        results = self.vector_store.search(
            embedding=query_embedding,
            k=k,
            include_metadata=True
        )

        # Bucket hits by modality so generation can treat them differently.
        text_results = []
        image_results = []
        table_results = []
        for r in results:
            content_type = r["metadata"]["content_type"]
            if content_type == "text":
                text_results.append(r)
            elif content_type in ["image", "chart", "diagram"]:
                image_results.append(r)
            elif content_type == "table":
                table_results.append(r)

        # Use the VLM path only when there is visual context to show it.
        if include_images and (image_results or table_results):
            response = await self._generate_with_vision(
                user_query,
                text_results,
                image_results,
                table_results
            )
        else:
            response = await self._generate_text_only(
                user_query,
                text_results + table_results
            )
        return {
            "answer": response,
            "sources": {
                "text": text_results,
                "images": image_results,
                "tables": table_results
            }
        }

    async def _generate_with_vision(
        self,
        query: str,
        text_results: list,
        image_results: list,
        table_results: list
    ) -> str:
        """Generate a response with a vision-language model over mixed context."""
        content = []
        content.append({
            "type": "text",
            "text": f"Answer this question using the provided context (text and images): {query}"
        })
        # Cap context sizes to keep the prompt within model limits.
        if text_results:
            text_context = "\n\n".join([r["metadata"]["text"] for r in text_results[:5]])
            content.append({"type": "text", "text": f"\n\nText Context:\n{text_context}"})
        # Tables are attached as images for better structural understanding.
        for table in table_results[:2]:
            if table["metadata"].get("image_path"):
                content.append(self._create_image_content(table["metadata"]["image_path"]))
        for img in image_results[:3]:
            if img["metadata"].get("image_path"):
                content.append(self._create_image_content(img["metadata"]["image_path"]))

        response = self.vlm.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=1500
        )
        return response.choices[0].message.content
"""Chunk keeping related content together."""
chunks = []
current_chunk = {"text": "", "images": [], "tables": []}
for element in elements:
if element.category == "Title":
# Start new chunk on titles
if current_chunk["text"]:
chunks.append(current_chunk)
current_chunk = {"text": element.text, "images": [], "tables": []}
elif element.category == "Image":
# Keep images with surrounding text
current_chunk["images"].append(element)
elif element.category == "Table":
# Tables are self-contained
chunks.append({
"text": element.text,
"images": [],
"tables": [element]
})
else:
current_chunk["text"] += " " + element.text
return chunksdef hybrid_multimodal_search(query: str, text_index, visual_index, alpha: float = 0.7):
"""Combine text and visual retrieval."""
# Text-based search
text_results = text_index.search(query, k=20)
# Visual search (if query suggests visual content)
visual_keywords = ["chart", "diagram", "image", "figure", "table", "graph"]
if any(kw in query.lower() for kw in visual_keywords):
visual_results = visual_index.search(query, k=10)
alpha = 0.5 # Weight visual more
else:
visual_results = visual_index.search(query, k=5)
# Fuse results
return reciprocal_rank_fusion(
[text_results, visual_results],
weights=[alpha, 1 - alpha]
)Multimodal RAG represents the future of knowledge systems, enabling AI to truly understand and reason over complete documents as humans do. As vision-language models continue to improve, the gap between text-only and multimodal RAG will only grow wider.