Choosing the right distance metric is crucial for vector search quality. This lesson provides a deep dive into distance metrics, their mathematical properties, and practical guidance on when to use each.
Distance: Lower = More Similar
- Euclidean distance: 0 = identical, ∞ = very different
Similarity: Higher = More Similar
- Cosine similarity: 1 = identical, -1 = opposite
Conversion: distance = 1 - similarity (for normalized metrics)

import numpy as np
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Measure the cosine of the angle between two vectors.

    cos(θ) = (a · b) / (||a|| × ||b||)

    Properties:
    - Range: [-1, 1]
    - Invariant to vector magnitude (length)
    - 1: identical direction
    - 0: orthogonal (perpendicular)
    - -1: opposite direction

    NOTE: undefined (nan / division by zero) if either vector is all zeros.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# Cosine distance: complement of cosine similarity, so lower = more similar.
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return 1 - cosine_similarity(a, b); range [0, 2], 0 = identical direction."""
    return 1 - cosine_similarity(a, b)
# Example: Magnitude doesn't matter!
v1 = np.array([1, 2, 3])
v2 = np.array([2, 4, 6])  # Same direction, 2x magnitude
v3 = np.array([1, 0, 0])  # Different direction

print(cosine_similarity(v1, v2))  # 1.0 (identical direction!)
print(cosine_similarity(v1, v3))  # ~0.27 (different direction)

# Use when: Text embeddings, semantic similarity, when magnitude shouldn't affect results.
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    Straight-line distance between points.

    d = sqrt(Σ(a_i - b_i)²)

    Properties:
    - Range: [0, ∞)
    - Affected by magnitude
    - Intuitive geometric interpretation
    """
    return np.sqrt(np.sum((a - b) ** 2))
# Squared Euclidean (faster, avoids sqrt; preserves the same ranking as Euclidean)
def squared_euclidean(a: np.ndarray, b: np.ndarray) -> float:
    """Return Σ(a_i - b_i)² — the squared L2 distance."""
    return np.sum((a - b) ** 2)
# Example: Magnitude matters!
v1 = np.array([1, 2, 3])
v2 = np.array([2, 4, 6])
v3 = np.array([1.1, 2.1, 3.1])

print(euclidean_distance(v1, v2))  # ~3.74 (far due to magnitude)
print(euclidean_distance(v1, v3))  # ~0.17 (close)

# Use when: Clustering, image features, when magnitude is meaningful.
def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    """
    Sum of element-wise products.

    a · b = Σ(a_i × b_i)

    Properties:
    - Range: (-∞, ∞)
    - Combines direction AND magnitude
    - For normalized vectors: dot = cosine similarity
    """
    return np.dot(a, b)
# Key insight: Normalize vectors once, then cosine reduces to a dot product.
def normalize(v: np.ndarray) -> np.ndarray:
    """Return v scaled to unit length (||v|| = 1). Undefined for the zero vector."""
    return v / np.linalg.norm(v)
v1_norm = normalize(v1)
v2_norm = normalize(v2)

# These are equivalent:
print(cosine_similarity(v1, v2))      # Using cosine formula
print(dot_product(v1_norm, v2_norm))  # Dot of normalized vectors

# Use when: Vectors are pre-normalized, or when magnitude encodes importance.
For normalized vectors (||a|| = ||b|| = 1):
cosine_similarity(a, b) = dot_product(a, b)
euclidean_distance(a, b)² = 2 - 2 × cosine_similarity(a, b)
= 2 × (1 - dot_product(a, b))
= 2 × cosine_distance(a, b)
So for normalized vectors:
- Euclidean and Cosine give the same ranking!
- Dot product is fastest (no normalization at query time)

import numpy as np
# Generate random normalized vectors
np.random.seed(42)  # fixed seed so the demo is reproducible
vectors = np.random.randn(100, 768)
vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

query = np.random.randn(768)
query = query / np.linalg.norm(query)

# Compute the metrics
cosine_sims = np.dot(vectors, query)  # Same as cosine for normalized vectors
euclidean_dists = np.linalg.norm(vectors - query, axis=1)

# Rankings should be identical!
cosine_ranking = np.argsort(-cosine_sims)        # Descending (higher = better)
euclidean_ranking = np.argsort(euclidean_dists)  # Ascending (lower = better)
print("Rankings match:", np.array_equal(cosine_ranking, euclidean_ranking))  # True


def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    Sum of absolute differences.

    d = Σ|a_i - b_i|

    Also called: L1, taxicab, city-block distance
    """
    return np.sum(np.abs(a - b))


# Use when: High-dimensional sparse data, robust to outliers.
def hamming_distance(a: np.ndarray, b: np.ndarray) -> int:
    """
    Count of positions where elements differ.

    Used for binary vectors (0/1).
    """
    # int() so the declared return type holds (np.sum yields a numpy scalar)
    return int(np.sum(a != b))


# Example with binary vectors
a = np.array([1, 0, 1, 1, 0])
b = np.array([1, 1, 1, 0, 0])
print(hamming_distance(a, b))  # 2 (positions 1 and 3 differ)

# Use when: Binary hash codes, locality-sensitive hashing.
def jaccard_similarity(set_a: set, set_b: set) -> float:
    """
    Ratio of intersection to union.

    J(A, B) = |A ∩ B| / |A ∪ B|

    Returns 0.0 when both sets are empty (union is empty).
    """
    intersection = len(set_a & set_b)
    union = len(set_a | set_b)
    return intersection / union if union > 0 else 0.0
# For binary vectors
def jaccard_binary(a: np.ndarray, b: np.ndarray) -> float:
    """Jaccard similarity of two 0/1 (or boolean) arrays via bitwise AND / OR."""
    intersection = np.sum(a & b)
    union = np.sum(a | b)
    return intersection / union if union > 0 else 0.0


# Use when: Sparse sets, document similarity, recommendations.
| Use Case | Recommended Metric | Why |
|---|---|---|
| Text embeddings (RAG) | Cosine / Dot | Semantic similarity, magnitude-invariant |
| OpenAI embeddings | Cosine | Recommended by OpenAI |
| Normalized vectors | Dot Product | Fastest (equivalent to cosine) |
| Image features | Euclidean | Position in feature space matters |
| Clustering | Euclidean | Natural for k-means centroids |
| Sparse high-dimensional | Manhattan | Robust to outliers |
| Binary codes (LSH) | Hamming | Fast bit operations |
# Pinecone
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone()
pc.create_index(
    name="my-index",
    dimension=768,
    metric="cosine",  # or "euclidean", "dotproduct"
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
# Qdrant (assumes a QdrantClient instance named `client` exists)
from qdrant_client.models import Distance, VectorParams

client.create_collection(
    collection_name="my_collection",
    vectors_config=VectorParams(
        size=768,
        distance=Distance.COSINE,  # or EUCLID, DOT
    ),
)
# ChromaDB (assumes a chromadb client named `client` exists)
collection = client.create_collection(
    name="my_collection",
    metadata={"hnsw:space": "cosine"},  # or "l2", "ip"
)
# Weaviate (class schema fragment)
{
    "class": "Document",
    "vectorIndexConfig": {
        "distance": "cosine"  # or "l2-squared", "dot"
    }
}

# Best practice: Normalize at index time for speed
# At indexing
def add_document(text: str, vector_store):
    """Embed *text*, L2-normalize once at index time, and store the unit vector."""
    embedding = model.encode(text)  # NOTE(review): relies on a module-level `model` — confirm
    # Normalize once at index time
    normalized = embedding / np.linalg.norm(embedding)
    vector_store.add(normalized)
# At query time - use dot product (faster than cosine)
def search(query: str, vector_store):
    """Embed the query, L2-normalize, and search by dot product (= cosine for unit vectors)."""
    embedding = model.encode(query)  # NOTE(review): relies on a module-level `model` — confirm
    normalized = embedding / np.linalg.norm(embedding)
    # Dot product = cosine for normalized vectors
    return vector_store.search(normalized, metric="dot")
# Many embedding models normalize by default
embeddings = model.encode(texts, normalize_embeddings=True)In the next lesson, we'll explore Pinecone, a fully managed vector database.