Document processing and chunking are critical stages in the RAG ingestion pipeline. How you split documents directly impacts retrieval quality. This lesson covers chunking strategies from basic to advanced, including the cutting-edge late chunking technique.
Chunking is not just about fitting text into embedding model limits. It fundamentally affects retrieval quality and the coherence of the context that is ultimately passed to the LLM.
Before chunking, documents typically require preprocessing:
import re
import unicodedata
import os
from datetime import datetime


def clean_text(text: str) -> str:
    """Normalize raw document text before chunking.

    Steps: strip page-number boilerplate, NFKC-normalize unicode,
    drop special characters (keeping common punctuation), then
    collapse whitespace runs.
    """
    # Remove boilerplate (headers, footers, page numbers) BEFORE collapsing
    # whitespace: \s+ in the pattern also matches footers split across lines,
    # and the final collapse cleans up any doubled spaces left behind.
    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text)
    # Normalize unicode first so compatibility characters (ligatures,
    # full-width forms) are folded before the character filter runs
    text = unicodedata.normalize('NFKC', text)
    # Remove special characters (keep punctuation)
    text = re.sub(r'[^\w\s.,!?;:\-\'"]', '', text)
    # Collapse excessive whitespace last so earlier removals leave no gaps
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def extract_metadata(document, source_path: str) -> dict:
    """Build a metadata dict for one source document.

    `extract_title` / `extract_author` are project-specific helpers
    defined elsewhere.
    """
    return {
        "source": source_path,
        "filename": os.path.basename(source_path),
        "file_type": source_path.split('.')[-1],
        "created_at": datetime.now().isoformat(),
        "title": extract_title(document),  # Custom extraction
        "author": extract_author(document),
        "word_count": len(document.split()),
    }

# The simplest approach: split text by character/token count with optional overlap.
Naive chunking divides text into fixed-size segments without regard for semantic boundaries. Each chunk is embedded independently.
from langchain.text_splitter import CharacterTextSplitter

text = """
Machine learning is a subset of artificial intelligence that enables
systems to learn from data. Deep learning, a subset of ML, uses neural
networks with multiple layers. These networks can automatically learn
representations from raw data.
"""

# Fixed-size splitting on a single separator. chunk_overlap repeats the
# tail of each chunk at the head of the next to soften boundary cuts.
splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=100,
    chunk_overlap=20
)
chunks = splitter.split_text(text)

# Resulting chunks:
# Chunk 1: "Machine learning is a subset of artificial intelligence that enables systems to learn from data."
# Chunk 2: "from data. Deep learning, a subset of ML, uses neural networks with multiple layers."
# Chunk 3: "multiple layers. These networks can automatically learn representations from raw data."

# Tries multiple separators hierarchically to find natural break points.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Walks the separator list in order, preferring the coarsest natural
# boundary (paragraphs) and falling back to finer ones until chunks fit.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", ", ", " ", ""],
    length_function=len
)
# Tries to split on paragraphs first, then sentences, then words
chunks = splitter.split_text(document_text)

# Best Practice: This is the most commonly used splitter and works well for general text.
Respects document structure based on format:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Split on Markdown headings; each heading level is captured into metadata.
headers_to_split = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
]
splitter = MarkdownHeaderTextSplitter(headers_to_split)
chunks = splitter.split_text(markdown_text)
# Each chunk includes header hierarchy as metadata
# {"h1": "Introduction", "h2": "Overview", "content": "..."}

from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language
)

# Python code splitter
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=500,
    chunk_overlap=50
)
# Respects function/class boundaries
chunks = python_splitter.split_text(python_code)

# Uses embeddings to identify semantic boundaries between sentences.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Semantic chunker identifies topic shifts: adjacent sentences are embedded
# and a chunk boundary is placed where similarity drops sharply.
splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",  # or "standard_deviation", "interquartile"
    breakpoint_threshold_amount=95  # Top 5% similarity drops indicate boundaries
)
chunks = splitter.split_text(document_text)

# Late chunking is a novel technique that addresses the fundamental problem
# with naive chunking: loss of contextual information.
When you chunk first and embed second, each chunk loses awareness of the broader document context:
Document: "Berlin is the capital of Germany. It has a population of 3.6 million."
Naive Chunking:
Chunk 1: "Berlin is the capital of Germany."
Chunk 2: "It has a population of 3.6 million."
Problem: Chunk 2's embedding doesn't know "It" refers to "Berlin"!Late chunking reverses the order: embed the entire document first using a long-context embedding model, then chunk the resulting token embeddings.
# Conceptual implementation of late chunking
from transformers import AutoModel, AutoTokenizer
import torch

_LATE_MODEL_NAME = "jinaai/jina-embeddings-v2-base-en"
# Loaded lazily and cached so repeated calls don't re-initialize the model.
_late_model_cache: dict = {}


def _load_late_model():
    """Return the (model, tokenizer) pair, loading them only on first use."""
    if not _late_model_cache:
        # Long-context model: jina-embeddings-v2-base-en supports 8192 tokens
        _late_model_cache["model"] = AutoModel.from_pretrained(_LATE_MODEL_NAME)
        _late_model_cache["tokenizer"] = AutoTokenizer.from_pretrained(_LATE_MODEL_NAME)
    return _late_model_cache["model"], _late_model_cache["tokenizer"]


def late_chunking(text: str, chunk_size: int = 256):
    """Embed the entire document first, then chunk the token embeddings.

    Returns a list of mean-pooled chunk embeddings, one per window of
    `chunk_size` tokens. Because pooling happens AFTER the full-document
    forward pass, each chunk vector reflects document-wide context.
    NOTE: inputs longer than 8192 tokens are silently truncated.
    """
    model, tokenizer = _load_late_model()

    # Step 1: Tokenize and embed the ENTIRE document
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=8192)
    with torch.no_grad():
        # Get token-level embeddings (not pooled)
        outputs = model(**tokens, output_hidden_states=True)
        token_embeddings = outputs.last_hidden_state[0]  # [seq_len, hidden_dim]

    # Step 2: Chunk the token embeddings and mean-pool each window
    chunk_embeddings = [
        window.mean(dim=0) for window in token_embeddings.split(chunk_size)
    ]
    return chunk_embeddings

# Late interaction is a related but distinct concept used in re-ranking and retrieval.
Instead of a single embedding per query/document, late interaction maintains per-token embeddings and computes similarity at query time using MaxSim:
# ColBERT scoring with MaxSim
def colbert_score(query_embeddings, doc_embeddings):
    """Late-interaction relevance score between a query and a document.

    query_embeddings: [num_query_tokens, dim]
    doc_embeddings: [num_doc_tokens, dim]

    For each query token, find the max similarity to any doc token,
    then sum these per-token maxima.
    """
    # Compute all pairwise (dot-product) similarities
    similarities = query_embeddings @ doc_embeddings.T  # [query_tokens, doc_tokens]
    # MaxSim: for each query token, take its best-matching doc token
    max_sims = similarities.max(dim=1).values  # [query_tokens]
    # Aggregate across query tokens
    return max_sims.sum()
# Example
# Query: "What is ML?" -> 4 token embeddings
# Doc: "Machine learning is a type of AI..." -> 8 token embeddings
# Score = MaxSim aggregated across all query tokensColPali extends late interaction to multimodal retrieval, enabling similarity computation between text queries and document images (PDFs, screenshots).
| Method | Context Aware | Compute Cost | Best For |
|---|---|---|---|
| Naive/Fixed | ❌ | Low | Simple documents, prototyping |
| Recursive | ❌ | Low | General purpose, production baseline |
| Document-Aware | Partial | Low | Structured docs (Markdown, code) |
| Semantic | ❌ | Medium | Topic-diverse documents |
| Late Chunking | ✅ | High | High-stakes retrieval, complex docs |
| Late Interaction | ✅ | High | Re-ranking, precision-critical |
Chunk size is one of the most important hyperparameters in RAG. Here's how to determine the optimal size:
# Sweep different chunk sizes and evaluate retrieval quality for each
chunk_sizes = [256, 512, 1024, 2048]
overlaps = [0, 64, 128, 256]

results = []
for size in chunk_sizes:
    for overlap in overlaps:
        # An overlap >= the chunk size is degenerate (chunks would never
        # advance), so skip those configurations.
        if overlap >= size:
            continue
        # Build index with this configuration
        chunks = split_documents(documents, chunk_size=size, overlap=overlap)
        index = build_index(chunks)
        # Evaluate on test set
        metrics = evaluate_retrieval(index, test_queries)
        results.append({
            "chunk_size": size,
            "overlap": overlap,
            "recall@5": metrics["recall@5"],
            "mrr": metrics["mrr"]
        })

# Find best configuration (ties broken by first-seen order)
best = max(results, key=lambda x: x["recall@5"])
from llama_index import SentenceWindowNodeParser

# Sentence-window parsing: embed one sentence at a time, but carry the
# surrounding sentences in metadata so the LLM receives wider context.
parser = SentenceWindowNodeParser.from_defaults(
    # NOTE(review): naive splitter — drops the ". " delimiter and ignores
    # "?"/"!" endings; fine for a demo, replace for production use.
    sentence_splitter=lambda txt: txt.split(". "),
    window_size=3,  # Include 3 sentences before/after
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)
nodes = parser.get_nodes_from_documents(documents)

# Each resulting node carries:
# - node.text: The specific sentence (embedded)
# - node.metadata["window"]: Surrounding context (returned to LLM)

# Anthropic's contextual retrieval technique prepends document-level context
# to each chunk:
import anthropic


def add_context_to_chunk(chunk: str, document: str) -> str:
    """Use Claude to generate contextual prefix for each chunk."""
    client = anthropic.Anthropic()
    prompt = f"""<document>
{document}
</document>
Here is the chunk we want to situate within the whole document:
<chunk>
{chunk}
</chunk>
Please give a short succinct context to situate this chunk within
the overall document for the purposes of improving search retrieval
of the chunk. Answer only with the succinct context and nothing else."""
    response = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=100,
        messages=[{"role": "user", "content": prompt}]
    )
    context = response.content[0].text
    # Prepend the generated context so the chunk is embedded (and matched)
    # with document-level information attached.
    return f"{context}\n\n{chunk}"

# Example output:
# Original: "The system uses 256-bit AES encryption."
# Contextual: "This section from the Security Architecture Guide describes
# the encryption standards. The system uses 256-bit AES encryption."

from dataclasses import dataclass
from dataclasses import dataclass
from typing import List
import hashlib


@dataclass
class ProcessedChunk:
    # Deterministic chunk id: md5 of "source:index" (see process_document).
    id: str
    # Chunk text, possibly prefixed with LLM-generated context.
    text: str
    embedding: List[float]
    metadata: dict


def process_document(
    document: str,
    source: str,
    chunk_size: int = 512,
    overlap: int = 50,
    add_context: bool = True
) -> List[ProcessedChunk]:
    """Run the full ingestion pipeline for one document.

    Clean -> extract metadata -> split -> (optionally) contextualize ->
    embed -> package into ProcessedChunk records.

    Relies on module-level helpers (`clean_text`, `extract_metadata`,
    `add_context_to_chunk`, `RecursiveCharacterTextSplitter`) and a
    configured `embed_model` — assumed defined elsewhere; verify at
    integration time.
    """
    # Step 1: Clean and preprocess
    clean_doc = clean_text(document)
    # Step 2: Extract metadata (from the raw document, pre-cleaning)
    metadata = extract_metadata(document, source)
    # Step 3: Split into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap
    )
    chunks = splitter.split_text(clean_doc)
    # Step 4: Add contextual prefixes (optional; one LLM call per chunk)
    if add_context:
        chunks = [add_context_to_chunk(chunk, clean_doc) for chunk in chunks]
    # Step 5: Generate embeddings
    embeddings = embed_model.encode(chunks)
    # Step 6: Create processed chunks with deterministic IDs and metadata
    processed = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # md5 here is a cheap stable fingerprint, not a security measure.
        chunk_id = hashlib.md5(f"{source}:{i}".encode()).hexdigest()
        processed.append(ProcessedChunk(
            id=chunk_id,
            text=chunk,
            embedding=embedding.tolist(),
            metadata={
                **metadata,
                "chunk_index": i,
                "total_chunks": len(chunks)
            }
        ))
    return processed

# In the next lesson, we'll explore embedding generation and storage in
# detail, including model selection and optimization strategies.