Evaluating RAG systems requires measuring both retrieval quality and generation quality. This lesson covers comprehensive evaluation frameworks, metrics, and practical implementation of RAG evaluation pipelines.
RAG evaluation should cover both the retrieval and generation stages independently, as well as end-to-end performance:
┌────────────────────────────────────────────────────────────────┐
│ RAG Evaluation │
├────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ Retrieval │ │ Generation │ │
│ │ Metrics │ │ Metrics │ │
│ ├─────────────────┤ ├─────────────────┤ │
│ │ • Recall@K │ │ • Faithfulness │ │
│ │ • Precision@K │ │ • Relevance │ │
│ │ • MRR │ │ • Completeness │ │
│ │ • NDCG │ │ • BLEU/ROUGE │ │
│ │ • Context │ │ • Groundedness │ │
│ │ Relevance │ │ │ │
│ └─────────────────┘ └─────────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ End-to-End Metrics │ │
│ ├──────────────────────────────────────────────────┤ │
│ │ • Answer Correctness • User Satisfaction │ │
│ │ • Latency • Cost per Query │ │
│ └──────────────────────────────────────────────────┘ │
└────────────────────────────────────────────────────────────────┘

Measures what fraction of relevant documents are retrieved in the top K results.
def recall_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
    """Recall@K: fraction of the relevant documents found in the top-K results.

    Recall@K = |top-K retrieved ∩ relevant| / |relevant|

    Args:
        retrieved_ids: Ranked document ids, best first.
        relevant_ids: Ground-truth relevant document ids.
        k: Cutoff rank.

    Returns:
        Score in [0, 1]; 0.0 when there are no labeled relevant documents.
    """
    if not relevant_ids:
        return 0.0
    hits = relevant_ids.intersection(retrieved_ids[:k])
    return len(hits) / len(relevant_ids)

# Example:
# Retrieved: [doc1, doc3, doc5, doc7, doc9] (k=5)
# Relevant:  {doc1, doc2, doc3, doc4}
# Recall@5 = |{doc1, doc3}| / |{doc1, doc2, doc3, doc4}| = 2/4 = 0.5
# Recall@5 = |{doc1, doc3}| / |{doc1, doc2, doc3, doc4}| = 2/4 = 0.5

Measures what fraction of the top K retrieved documents are relevant.
def precision_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
    """Precision@K: fraction of the top-K retrieved documents that are relevant.

    Precision@K = |top-K retrieved ∩ relevant| / K

    Args:
        retrieved_ids: Ranked document ids, best first (duplicates count once).
        relevant_ids: Ground-truth relevant document ids.
        k: Cutoff rank.

    Returns:
        Score in [0, 1]; 0.0 when k <= 0 (previously k == 0 raised
        ZeroDivisionError).
    """
    if k <= 0:
        return 0.0
    retrieved_top_k = set(retrieved_ids[:k])
    found_relevant = len(retrieved_top_k & relevant_ids)
    return found_relevant / k

# Example:
# Retrieved: [doc1, doc3, doc5, doc7, doc9] (k=5)
# Relevant:  {doc1, doc2, doc3, doc4}
# Precision@5 = 2/5 = 0.4
# Precision@5 = 2/5 = 0.4

Average of reciprocal ranks of the first relevant result across queries.
def reciprocal_rank(retrieved_ids: list[str], relevant_ids: set[str]) -> float:
    """Reciprocal rank for one query: 1 / rank of the first relevant document.

    Returns 0.0 when no retrieved document is relevant. MRR is the mean of
    this value across a query set.
    """
    for rank, doc_id in enumerate(retrieved_ids, 1):
        if doc_id in relevant_ids:
            return 1.0 / rank
    return 0.0

def mean_reciprocal_rank(results: list[dict]) -> float:
    """Mean Reciprocal Rank across multiple queries.

    Args:
        results: One dict per query with keys "retrieved" (ranked id list)
            and "relevant" (iterable of relevant ids).

    Returns:
        Mean per-query reciprocal rank; 0.0 for an empty result list
        (previously raised ZeroDivisionError).
    """
    if not results:
        return 0.0
    rr_scores = [
        reciprocal_rank(r["retrieved"], set(r["relevant"]))
        for r in results
    ]
    return sum(rr_scores) / len(rr_scores)
# Example:
# Query 1: First relevant at rank 1 → RR = 1.0
# Query 2: First relevant at rank 3 → RR = 0.33
# Query 3: First relevant at rank 2 → RR = 0.5
# MRR = (1.0 + 0.33 + 0.5) / 3 = 0.61

Measures ranking quality with graded relevance and position discount.
import numpy as np
def dcg_at_k(relevances: list[float], k: int) -> float:
    """Discounted Cumulative Gain at cutoff k.

    DCG@K = sum over positions i (1-based) of (2^rel_i - 1) / log2(i + 1):
    highly relevant documents early in the ranking contribute the most.
    """
    top = relevances[:k]
    return sum(
        (2.0**rel - 1.0) / np.log2(rank + 1)
        for rank, rel in enumerate(top, start=1)
    )

def ndcg_at_k(
    retrieved_ids: list[str],
    id_to_relevance: dict[str, float],
    k: int
) -> float:
    """Normalized DCG at cutoff k.

    NDCG@K = DCG@K / IDCG@K, where IDCG is the DCG of the ideal (descending
    relevance) ordering, giving a score in [0, 1].

    Args:
        retrieved_ids: Ranked document ids, best first.
        id_to_relevance: Graded relevance per document id; ids missing from
            this map count as relevance 0.
        k: Cutoff rank.
    """
    # Graded relevance of the actual ranking.
    actual = [id_to_relevance.get(doc_id, 0) for doc_id in retrieved_ids[:k]]
    # Ideal ranking: all known grades, best first.
    ideal = sorted(id_to_relevance.values(), reverse=True)[:k]
    idcg = dcg_at_k(ideal, k)
    if idcg == 0:
        return 0.0
    return dcg_at_k(actual, k) / idcg
# Example with graded relevance:
# doc1: 3 (highly relevant), doc2: 2 (relevant), doc3: 1 (somewhat), doc4: 0 (not)
# Retrieved order: [doc3, doc1, doc4, doc2]
# Relevances: [1, 3, 0, 2]
# DCG = (2^1-1)/log2(2) + (2^3-1)/log2(3) + (2^0-1)/log2(4) + (2^2-1)/log2(5)
# = 1/1 + 7/1.58 + 0 + 3/2.32 = 1 + 4.43 + 0 + 1.29 = 6.72
# IDCG (ideal: [3, 2, 1, 0]): 7/1 + 3/1.58 + 1/2 + 0 = 7 + 1.90 + 0.5 = 9.40
# NDCG = 6.72 / 9.40 = 0.71

LLM-based evaluation of whether retrieved context is relevant to the query.
def context_relevance(query: str, contexts: list[str], llm) -> float:
    """Score how relevant the retrieved contexts are to the query (LLM judge).

    Each context is rated 0-10 by the LLM; scores are averaged and
    normalized to [0, 1].

    Args:
        query: The user question.
        contexts: Retrieved passages (only the first 500 chars of each are
            shown to the judge).
        llm: Chat model with an ``invoke(prompt)`` method returning an
            object with a ``.content`` string.

    Returns:
        Mean normalized relevance in [0, 1]; 0.0 when ``contexts`` is empty.
        An unparseable judge reply falls back to a neutral 0.5.
    """
    scores = []
    for context in contexts:
        prompt = f"""Rate the relevance of this context to the query on a scale of 0-10.
Query: {query}
Context: {context[:500]}
Provide only a number from 0-10:"""
        response = llm.invoke(prompt)
        try:
            score = float(response.content.strip()) / 10
        except (ValueError, TypeError):
            # Was a bare `except:`; only parse failures should fall back to
            # neutral rather than masking real bugs.
            score = 0.5
        # Clamp in case the judge ignores the 0-10 instruction.
        scores.append(min(1.0, max(0.0, score)))
    return sum(scores) / len(scores) if scores else 0.0
return sum(scores) / len(scores) if scores else 0.0Measures whether the generated answer is grounded in the retrieved context (not hallucinated).
def faithfulness(answer: str, contexts: list[str], llm) -> float:
    """Fraction of the answer's factual claims supported by the contexts.

    Pipeline: (1) an LLM extracts claims from the answer, (2) each claim is
    verified against the concatenated contexts, (3) the ratio of supported
    claims is returned. An answer with no extractable claims scores 1.0.
    """
    # Step 1: extract claims from the answer.
    extract_prompt = f"""Extract factual claims from this answer as a numbered list.
Each claim should be a single, verifiable statement.
Answer: {answer}
Claims:"""
    claims_response = llm.invoke(extract_prompt)
    claims = [
        line.strip()
        for line in claims_response.content.split("\n")
        if line.strip()
    ]
    if not claims:
        return 1.0  # Nothing to verify.

    # Step 2: verify each claim against the (truncated) joined contexts.
    context_text = "\n---\n".join(contexts)
    supported_count = 0
    for claim in claims:
        verify_prompt = f"""Is this claim supported by the context? Answer YES or NO.
Claim: {claim}
Context:
{context_text[:3000]}
Supported (YES/NO):"""
        verdict = llm.invoke(verify_prompt)
        supported_count += "YES" in verdict.content.upper()

    # Step 3: ratio of supported claims.
    return supported_count / len(claims)
# Example:
# Context: "Paris is the capital of France. It has a population of 2.1 million."
# Answer: "Paris, the capital of France, has 2.1 million people and is known for the Eiffel Tower."
# Claims: ["Paris is the capital of France", "Paris has 2.1 million people", "Paris is known for the Eiffel Tower"]
# Supported: [True, True, False (not in context)]
# Faithfulness = 2/3 = 0.67

Measures whether the answer addresses the user's question.
def answer_relevance(query: str, answer: str, llm) -> float:
    """Score how well the answer addresses the question (LLM judge).

    The LLM rates the answer 0-10; the score is normalized to [0, 1].
    An alternative approach is reverse generation: generate questions from
    the answer and compare their similarity to the original query.

    Returns:
        Normalized score in [0, 1]; 0.5 (neutral) when the judge's reply
        cannot be parsed as a number.
    """
    # Method 1: direct scoring by the judge model.
    prompt = f"""Rate how well this answer addresses the question on a scale of 0-10.
Consider completeness and directness.
Question: {query}
Answer: {answer}
Score (0-10):"""
    response = llm.invoke(prompt)
    try:
        score = float(response.content.strip()) / 10
    except (ValueError, TypeError):
        # Was a bare `except:`, which also swallowed system exits and real
        # bugs; only parse failures should fall back to neutral.
        return 0.5
    # Clamp in case the judge ignores the 0-10 instruction.
    return min(1.0, max(0.0, score))
def answer_relevance_embedding(
    query: str,
    answer: str,
    embed_model
) -> float:
    """Method 2: cosine similarity between query and answer embeddings.

    A higher score means the answer is topically aligned with the question
    (the previous docstring's "lower is often better" was backwards); note
    this measures topical match, not factual correctness.

    Args:
        embed_model: Model exposing ``encode(text) -> vector``.

    Returns:
        Cosine similarity as a float; 0.0 when either embedding has zero
        norm (previously undefined behavior via sklearn).
    """
    query_emb = np.asarray(embed_model.encode(query), dtype=float)
    answer_emb = np.asarray(embed_model.encode(answer), dtype=float)
    # Plain NumPy cosine: avoids importing scikit-learn on every call and
    # guards against zero-norm embeddings. np is imported at module level.
    denom = np.linalg.norm(query_emb) * np.linalg.norm(answer_emb)
    if denom == 0.0:
        return 0.0
    return float(query_emb @ answer_emb / denom)
return similarityCompare generated answer against ground truth.
def answer_correctness(
    generated: str,
    ground_truth: str,
    llm
) -> float:
    """Score a generated answer against a reference answer (LLM judge).

    Returns:
        Accuracy/completeness score normalized to [0, 1]; 0.5 (neutral)
        when the judge's reply cannot be parsed as a number.
    """
    prompt = f"""Compare the generated answer to the reference answer.
Rate the accuracy and completeness on a scale of 0-10.
Reference Answer: {ground_truth}
Generated Answer: {generated}
Score (0-10):"""
    response = llm.invoke(prompt)
    try:
        score = float(response.content.strip()) / 10
    except (ValueError, TypeError):
        # Was a bare `except:`; only parse failures fall back to neutral.
        return 0.5
    # Clamp in case the judge ignores the 0-10 instruction.
    return min(1.0, max(0.0, score))
return 0.5from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
def calculate_rouge(generated: str, reference: str) -> dict:
    """Compute ROUGE-1/2/L F1 scores between generated and reference text.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) measures
    n-gram overlap; stemming is enabled so inflected forms still match.
    """
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
    )
    raw_scores = scorer.score(reference, generated)
    return {
        f"{metric}_f": raw_scores[metric].fmeasure
        for metric in ('rouge1', 'rouge2', 'rougeL')
    }
def calculate_bleu(generated: str, reference: str) -> float:
    """Compute sentence-level BLEU between generated and reference text.

    BLEU (Bilingual Evaluation Understudy) measures n-gram precision with a
    brevity penalty. Tokenization here is a naive lowercase whitespace split.
    """
    tokenize = lambda text: text.lower().split()
    return sentence_bleu([tokenize(reference)], tokenize(generated))
# Note: ROUGE/BLEU are less meaningful for open-ended generation
# LLM-based evaluation is preferred for RAG

RAGAS is a popular framework for evaluating RAG systems with four key metrics:
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall
)
from datasets import Dataset
# Prepare evaluation data: RAGAS expects parallel lists of questions,
# generated answers, retrieved contexts (a list of strings per question),
# and reference (ground-truth) answers.
eval_data = {
    "question": ["What is RAG?", "How does chunking work?"],
    "answer": ["RAG combines retrieval with generation...", "Chunking splits documents..."],
    "contexts": [["RAG is a technique..."], ["Documents are divided into chunks..."]],
    "ground_truth": ["RAG is Retrieval Augmented Generation...", "Chunking divides..."]
}
dataset = Dataset.from_dict(eval_data)

# Run evaluation
# NOTE(review): ragas' evaluate() calls an LLM judge under the hood and
# needs API credentials configured — confirm before running.
results = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall]
)
print(results)
# {
# 'faithfulness': 0.85,
# 'answer_relevancy': 0.78,
# 'context_precision': 0.72,
# 'context_recall': 0.65
# }

from dataclasses import dataclass
from typing import Optional
import json
@dataclass
class EvalSample:
    """One query/answer pair produced by a RAG system, plus optional labels."""

    query: str  # The user question that was asked.
    retrieved_docs: list[dict]  # Retrieved docs; each dict carries "id" and "text" keys (see RAGEvaluator.evaluate_sample).
    generated_answer: str  # The RAG system's final answer.
    ground_truth_answer: Optional[str] = None  # Reference answer, when labeled.
    relevant_doc_ids: Optional[list[str]] = None  # Known-relevant doc ids, when labeled.
@dataclass
class EvalResults:
    """Per-sample metric scores; None where the needed labels were missing."""

    # Retrieval — None when the sample has no relevant_doc_ids labels
    # (evaluate_sample assigns None in that branch).
    recall_at_5: Optional[float]
    precision_at_5: Optional[float]
    mrr: Optional[float]
    ndcg_at_5: Optional[float]
    context_relevance: float  # LLM-judged; always computed.
    # Generation
    faithfulness: float
    answer_relevance: float
    answer_correctness: Optional[float] = None  # Needs a ground-truth answer.
class RAGEvaluator:
    """Runs retrieval and generation metrics over RAG evaluation samples."""

    def __init__(self, llm, embedding_model):
        self.llm = llm
        self.embedding_model = embedding_model

    def evaluate_sample(self, sample: EvalSample) -> EvalResults:
        """Evaluate a single query-answer pair.

        Rank-based retrieval metrics are only computed when the sample has
        relevant_doc_ids labels; otherwise they are None.
        """
        retrieved_ids = [d["id"] for d in sample.retrieved_docs]
        retrieved_texts = [d["text"] for d in sample.retrieved_docs]

        # Retrieval metrics (require labeled relevant docs)
        if sample.relevant_doc_ids:
            relevant_set = set(sample.relevant_doc_ids)
            r5 = recall_at_k(retrieved_ids, relevant_set, 5)
            p5 = precision_at_k(retrieved_ids, relevant_set, 5)
            mrr = reciprocal_rank(retrieved_ids, relevant_set)
            # Binary relevance: every labeled doc gets grade 1.
            # (Loop variable renamed from `id`, which shadowed the builtin.)
            ndcg = ndcg_at_k(
                retrieved_ids, {doc_id: 1 for doc_id in relevant_set}, 5
            )
        else:
            r5 = p5 = mrr = ndcg = None

        # LLM-judged metrics (always computed)
        ctx_rel = context_relevance(sample.query, retrieved_texts, self.llm)
        faith = faithfulness(sample.generated_answer, retrieved_texts, self.llm)
        ans_rel = answer_relevance(sample.query, sample.generated_answer, self.llm)

        # Answer correctness (requires a ground-truth answer)
        if sample.ground_truth_answer:
            correctness = answer_correctness(
                sample.generated_answer,
                sample.ground_truth_answer,
                self.llm
            )
        else:
            correctness = None

        return EvalResults(
            recall_at_5=r5,
            precision_at_5=p5,
            mrr=mrr,
            ndcg_at_5=ndcg,
            context_relevance=ctx_rel,
            faithfulness=faith,
            answer_relevance=ans_rel,
            answer_correctness=correctness
        )

    def aggregate(self, results: list[EvalResults]) -> dict:
        """Aggregate per-sample results into mean retrieval/generation metrics.

        Added so callers holding EvalResults lists (e.g. A/B comparison code
        invoking ``evaluator.aggregate``) can reuse the same aggregation as
        evaluate_batch. Metrics that were None on every sample stay None.
        """
        def mean_or_none(values):
            non_none = [v for v in values if v is not None]
            return sum(non_none) / len(non_none) if non_none else None

        return {
            "num_samples": len(results),
            "retrieval": {
                "recall@5": mean_or_none([r.recall_at_5 for r in results]),
                "precision@5": mean_or_none([r.precision_at_5 for r in results]),
                "mrr": mean_or_none([r.mrr for r in results]),
                "ndcg@5": mean_or_none([r.ndcg_at_5 for r in results]),
                "context_relevance": mean_or_none([r.context_relevance for r in results])
            },
            "generation": {
                "faithfulness": mean_or_none([r.faithfulness for r in results]),
                "answer_relevance": mean_or_none([r.answer_relevance for r in results]),
                "answer_correctness": mean_or_none([r.answer_correctness for r in results])
            }
        }

    def evaluate_batch(self, samples: list[EvalSample]) -> dict:
        """Evaluate multiple samples and aggregate the results.

        Output is identical to before: one EvalResults per sample, so
        num_samples == len(samples).
        """
        return self.aggregate([self.evaluate_sample(s) for s in samples])
}def generate_eval_dataset(documents: list[dict], llm, n_samples: int = 50) -> list[dict]:
"""
Generate evaluation dataset from documents using LLM.
"""
eval_samples = []
for doc in documents[:n_samples]:
# Generate question from document
q_prompt = f"""Generate a question that can be answered using this document.
Make it a realistic user question.
Document: {doc['text'][:1000]}
Question:"""
question = llm.invoke(q_prompt).content.strip()
# Generate ground truth answer
a_prompt = f"""Answer this question based on the document.
Be accurate and concise.
Document: {doc['text'][:1000]}
Question: {question}
Answer:"""
answer = llm.invoke(a_prompt).content.strip()
eval_samples.append({
"query": question,
"ground_truth_answer": answer,
"relevant_doc_ids": [doc["id"]]
})
return eval_samples
# Usage
eval_dataset = generate_eval_dataset(documents, llm, n_samples=100)
# Save for reproducibility
with open("eval_dataset.json", "w") as f:
json.dump(eval_dataset, f, indent=2)def compare_configurations(
def compare_configurations(
    eval_dataset: list[dict],
    config_a: RAGSystem,
    config_b: RAGSystem,
    evaluator: RAGEvaluator
) -> dict:
    """Compare two RAG configurations on the same evaluation set.

    Both systems answer every query; each response is scored with the same
    evaluator so aggregate metrics are directly comparable.

    NOTE(review): `RAGSystem` and `determine_winner` are not defined in this
    file, and `evaluator.aggregate` must accept a list of EvalResults —
    confirm these exist where this helper is used.
    """
    results_a = []
    results_b = []

    for item in eval_dataset:
        query = item["query"]

        # Run both configurations on the identical query.
        response_a = config_a.query(query)
        response_b = config_b.query(query)

        # Create eval samples sharing the same ground-truth labels.
        sample_a = EvalSample(
            query=query,
            retrieved_docs=response_a["retrieved"],
            generated_answer=response_a["answer"],
            ground_truth_answer=item.get("ground_truth_answer"),
            relevant_doc_ids=item.get("relevant_doc_ids")
        )
        sample_b = EvalSample(
            query=query,
            retrieved_docs=response_b["retrieved"],
            generated_answer=response_b["answer"],
            ground_truth_answer=item.get("ground_truth_answer"),
            relevant_doc_ids=item.get("relevant_doc_ids")
        )

        results_a.append(evaluator.evaluate_sample(sample_a))
        results_b.append(evaluator.evaluate_sample(sample_b))

    # Compare aggregate metrics
    return {
        "config_a": evaluator.aggregate(results_a),
        "config_b": evaluator.aggregate(results_b),
        "winner": determine_winner(results_a, results_b)
    }
}Research shows LLMs struggle with information in the middle of long contexts:
def evaluate_position_bias(rag_system, eval_dataset: list[dict]) -> dict:
    """
    Test if relevant information position affects answer quality
    ("lost in the middle": LLMs tend to miss mid-context information).

    NOTE(review): `reorder_docs` and `evaluate_answer` are not defined in
    this file, and items here need "retrieved"/"ground_truth" keys (the
    dataset generator above emits "ground_truth_answer") — confirm the
    schema at the call site.
    """
    results = {
        "beginning": [],  # Relevant doc at position 0
        "middle": [],     # Relevant doc at middle position
        "end": []         # Relevant doc at last position
    }

    for item in eval_dataset:
        for position in ["beginning", "middle", "end"]:
            # Reorder retrieved docs so relevant doc is at specific position
            docs = reorder_docs(item["retrieved"], item["relevant_doc_ids"], position)
            # Generate answer with reordered context
            answer = rag_system.generate_with_context(item["query"], docs)
            # Evaluate faithfulness/correctness
            score = evaluate_answer(answer, item["ground_truth"])
            results[position].append(score)

    # Mean score per position; raises ZeroDivisionError on an empty dataset.
    return {
        pos: sum(scores) / len(scores)
        for pos, scores in results.items()
    }
# Typical finding: beginning > end > middle

When evaluating RAG systems, it's valuable to compare against established benchmarks. Here are the most important ones:
The most widely-used passage retrieval benchmark with 8.8M passages and 500K+ queries.
# MS MARCO evaluation via the BEIR tooling.
from beir import util
from beir.datasets.data_loader import GenericDataLoader

# Load MS MARCO (downloads and unpacks the BEIR-packaged copy on first run).
dataset = "msmarco"
data_path = util.download_and_unzip(f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip", "datasets")
corpus, queries, qrels = GenericDataLoader(data_path).load(split="dev")

# Evaluate your retriever at several rank cutoffs.
from beir.retrieval.evaluation import EvaluateRetrieval

evaluator = EvaluateRetrieval()
# NOTE(review): `your_retriever` is a placeholder for your own retriever object.
results = your_retriever.search(corpus, queries, top_k=100)
ndcg, map_score, recall, precision = evaluator.evaluate(qrels, results, [1, 3, 5, 10, 100])

print(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
print(f"Recall@100: {recall['Recall@100']:.4f}")A heterogeneous benchmark spanning 18 diverse datasets to test zero-shot generalization:
# Evaluate on BEIR benchmark (zero-shot retrieval across domains).
from beir.retrieval import models
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch

# Your model: any SentenceTransformers-compatible encoder works here.
model = models.SentenceBERT("BAAI/bge-large-en-v1.5")
retriever = DenseRetrievalExactSearch(model, batch_size=128)

# Run on multiple datasets.
# NOTE(review): `load_beir_dataset` is a placeholder helper, not defined here.
beir_datasets = ["nfcorpus", "fiqa", "scifact", "trec-covid", "arguana"]
for dataset_name in beir_datasets:
    corpus, queries, qrels = load_beir_dataset(dataset_name)
    results = retriever.search(corpus, queries, top_k=100)
    ndcg, _, recall, _ = EvaluateRetrieval().evaluate(qrels, results, [10])
print(f"{dataset_name}: NDCG@10={ndcg['NDCG@10']:.3f}, Recall@10={recall['Recall@10']:.3f}")FRAMES tests multi-hop reasoning requiring synthesis across 2-15 Wikipedia articles:
# FRAMES-style multi-hop evaluation
def evaluate_frames_style(rag_system, test_set: list[dict]) -> dict:
    """
    Evaluate on FRAMES-style multi-hop questions.
    Each question requires synthesizing info from multiple documents;
    accuracy is reported separately per reasoning type.

    NOTE(review): `llm_judge_correctness` is not defined in this file, and
    an item whose "type" is outside the four keys below raises KeyError —
    confirm the test-set schema.
    """
    results = {
        "numerical": {"correct": 0, "total": 0},
        "tabular": {"correct": 0, "total": 0},
        "multi_constraint": {"correct": 0, "total": 0},
        "temporal": {"correct": 0, "total": 0}
    }

    for item in test_set:
        question = item["question"]
        ground_truth = item["answer"]
        reasoning_type = item["type"]

        # Run RAG
        response = rag_system.query(question)

        # Check correctness (using LLM judge)
        is_correct = llm_judge_correctness(response["answer"], ground_truth)

        results[reasoning_type]["total"] += 1
        if is_correct:
            results[reasoning_type]["correct"] += 1

    # Calculate accuracy per type
    return {
        rtype: data["correct"] / data["total"] if data["total"] > 0 else 0
        for rtype, data in results.items()
    }
}Tests long-form factuality across 38 topics using SAFE (Search-Augmented Factuality Evaluator):
# LongFact-style evaluation
def longfact_evaluate(response: str, llm) -> dict:
    """
    Evaluate long-form response factuality (SAFE-style).
    1. Extract atomic facts
    2. Check each fact via web search
    3. Calculate factual precision (supported / verified)

    NOTE(review): `extract_atomic_facts`, `web_search`, `fact_to_query` and
    `verify_fact_against_results` are not defined in this file — confirm
    they are available where this is used.
    """
    # Extract facts
    facts = extract_atomic_facts(response, llm)

    supported = 0
    not_supported = 0

    for fact in facts:
        # Search for supporting evidence
        search_results = web_search(fact_to_query(fact))
        # Check if fact is supported
        if verify_fact_against_results(fact, search_results, llm):
            supported += 1
        else:
            not_supported += 1

    # Guard against a response with zero verifiable facts.
    precision = supported / (supported + not_supported) if (supported + not_supported) > 0 else 0
    return {
        "total_facts": len(facts),
        "supported": supported,
        "not_supported": not_supported,
        "factual_precision": precision
    }
}ARES provides automated evaluation without human labels by training LLM judges:
# ARES-style automated evaluation
from ares import ARES

# Initialize the ARES evaluator.
evaluator = ARES(
    ppi=True,  # Use Prediction-Powered Inference for confidence intervals
    few_shot_examples=few_shot_data  # Calibration examples; NOTE(review): `few_shot_data` must be defined by the caller
)

# Evaluate RAG outputs (queries/contexts/answers are parallel lists).
scores = evaluator.evaluate(
    queries=queries,
    retrieved_contexts=contexts,
    generated_answers=answers,
    metrics=["context_relevance", "answer_faithfulness", "answer_relevance"]
)

print(f"Context Relevance: {scores['context_relevance']:.3f}")
print(f"Answer Faithfulness: {scores['answer_faithfulness']:.3f}")
print(f"Answer Relevance: {scores['answer_relevance']:.3f}")| Benchmark | Focus | Metrics | Best For |
|---|---|---|---|
| MS MARCO | Passage Retrieval | MRR@10, Recall@1000 | General retrieval tuning |
| BEIR | Zero-shot Retrieval | NDCG@10 | Domain generalization |
| FRAMES | Multi-hop Reasoning | Accuracy per type | Complex questions |
| LongFact | Long-form Factuality | F1@K | Detailed responses |
| RAGAS | End-to-End RAG | Faithfulness, Relevance | Quick evaluation |
| ARES | Automated Evaluation | Multiple with CI | Label-free evaluation |
In the next lesson, we'll cover performance optimization and caching strategies for production RAG systems.