Vector Databases & Embeddings

0 of 12 lessons completed

Building Semantic Search Applications

Semantic search finds results based on meaning rather than exact keyword matches. This lesson shows you how to build end-to-end semantic search systems using vector databases.

Semantic vs. Lexical Search

| Aspect | Lexical (Keyword) | Semantic |
| --- | --- | --- |
| Matching | Exact keywords | Meaning/context |
| Query: "car" | Only docs with "car" | Also: vehicle, automobile, sedan |
| Synonyms | ❌ Misses them | ✅ Understands them |
| Context | ❌ No understanding | ✅ Considers context |
| Speed | Very fast | Fast (with ANN) |

End-to-End Semantic Search Pipeline

# Complete semantic search system

import hashlib
import uuid

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

class SemanticSearch:
    def __init__(self, collection_name: str = "documents"):
        self.openai = OpenAI()
        self.qdrant = QdrantClient("localhost", port=6333)
        self.collection_name = collection_name
        self.model = "text-embedding-3-small"
        self.dimension = 1536
        
        self._ensure_collection()
    
    def _ensure_collection(self):
        """Create collection if it doesn't exist"""
        collections = self.qdrant.get_collections().collections
        if not any(c.name == self.collection_name for c in collections):
            self.qdrant.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.dimension,
                    distance=Distance.COSINE
                )
            )
    
    def _embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for texts"""
        response = self.openai.embeddings.create(
            model=self.model,
            input=texts
        )
        return [item.embedding for item in response.data]
    
    def _generate_id(self, text: str) -> str:
        """Generate deterministic ID from text"""
        return hashlib.md5(text.encode()).hexdigest()[:16]
    
    def index(self, documents: list[dict]):
        """Index documents with metadata
        
        Args:
            documents: List of {"content": str, "metadata": dict}
        """
        texts = [doc["content"] for doc in documents]
        embeddings = self._embed(texts)
        
        points = [
            PointStruct(
                id=self._generate_id(doc["content"]),
                vector=embedding,
                payload={
                    "content": doc["content"],
                    **doc.get("metadata", {})
                }
            )
            for doc, embedding in zip(documents, embeddings)
        ]
        
        self.qdrant.upsert(
            collection_name=self.collection_name,
            points=points
        )
        
        return len(points)
    
    def search(self, query: str, limit: int = 5, filter_dict: dict = None):
        """Semantic search
        
        Args:
            query: Search query
            limit: Number of results
            filter_dict: Optional filter conditions
        """
        query_embedding = self._embed([query])[0]
        
        # Build filter if provided
        query_filter = None
        if filter_dict:
            from qdrant_client.models import Filter, FieldCondition, MatchValue
            conditions = [
                FieldCondition(key=k, match=MatchValue(value=v))
                for k, v in filter_dict.items()
            ]
            query_filter = Filter(must=conditions)
        
        results = self.qdrant.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            query_filter=query_filter,
            limit=limit
        )
        
        return [
            {
                "content": hit.payload["content"],
                "score": hit.score,
                "metadata": {k: v for k, v in hit.payload.items() if k != "content"}
            }
            for hit in results
        ]

Usage Example

# Create a search index backed by the "product_docs" collection
search = SemanticSearch("product_docs")

# Each document pairs free-text content with filterable metadata
catalog = [
    {
        "content": "The MacBook Pro features the M3 chip with 8-core CPU...",
        "metadata": {"category": "laptops", "brand": "Apple"},
    },
    {
        "content": "Dell XPS 15 comes with Intel Core i9 processor...",
        "metadata": {"category": "laptops", "brand": "Dell"},
    },
    {
        "content": "iPhone 15 Pro has a titanium design...",
        "metadata": {"category": "phones", "brand": "Apple"},
    },
]

search.index(catalog)

# Plain semantic query: returns the MacBook Pro and Dell XPS
# even though neither document contains the query keywords verbatim
results = search.search("powerful laptop for development")

# Same query style with a metadata filter: only Apple products are returned
results = search.search(
    "best computer for coding",
    filter_dict={"brand": "Apple"},
)

Document Processing Pipeline

from typing import Generator
import tiktoken

class DocumentProcessor:
    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
    
    def chunk_document(self, text: str, source: str = None) -> list[dict]:
        """Split document into overlapping chunks"""
        tokens = self.tokenizer.encode(text)
        chunks = []
        
        start = 0
        while start < len(tokens):
            end = start + self.chunk_size
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            
            chunks.append({
                "content": chunk_text,
                "metadata": {
                    "source": source,
                    "chunk_index": len(chunks),
                    "token_count": len(chunk_tokens)
                }
            })
            
            start = end - self.chunk_overlap
        
        return chunks
    
    def process_documents(self, documents: list[dict]) -> Generator[dict, None, None]:
        """Process multiple documents"""
        for doc in documents:
            chunks = self.chunk_document(
                doc["content"],
                source=doc.get("source", "unknown")
            )
            for chunk in chunks:
                # Merge original metadata
                chunk["metadata"].update(doc.get("metadata", {}))
                yield chunk

# Usage: chunk long documents, then hand the chunks to the indexer
processor = DocumentProcessor(chunk_size=512, chunk_overlap=50)

# Large source documents to split before indexing
source_docs = [
    {"content": "Very long document text...", "source": "manual.pdf"},
    {"content": "Another long document...", "source": "guide.pdf"},
]

chunks = list(processor.process_documents(source_docs))
search.index(chunks)

Web Application with FastAPI

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Application state: one FastAPI app and one shared search client for all requests
app = FastAPI()
search = SemanticSearch("documents")

class SearchRequest(BaseModel):
    """Request body for POST /search."""
    query: str
    limit: int = 5
    # Was `filters: dict = None` — an invalid annotation/default pair;
    # the field is genuinely optional, so say so in the type.
    filters: dict | None = None

class IndexRequest(BaseModel):
    """Request body for POST /index: items shaped {"content": str, "metadata": dict}."""
    documents: list[dict]

@app.post("/index")
async def index_documents(request: IndexRequest):
    """Index the submitted documents; returns the number of points written.

    Raises:
        HTTPException: 400 if the request contains no documents.
    """
    if not request.documents:
        # Fail fast instead of forwarding an empty batch to the embedding API
        raise HTTPException(status_code=400, detail="No documents provided")
    count = search.index(request.documents)
    return {"indexed": count}

@app.post("/search")
async def search_documents(request: SearchRequest):
    """Run a semantic query and return the ranked hits."""
    hits = search.search(
        query=request.query,
        limit=request.limit,
        filter_dict=request.filters,
    )
    return {"results": hits}

# Run: uvicorn app:app --reload

Next.js/TypeScript Frontend

// components/SemanticSearch.tsx
'use client';
import { useState } from 'react';

interface SearchResult {
  content: string;
  score: number;
  metadata: Record<string, string>;
}

export function SemanticSearch() {
  const [query, setQuery] = useState('');
  const [results, setResults] = useState<SearchResult[]>([]);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const handleSearch = async () => {
    setLoading(true);
    setError(null);
    try {
      const res = await fetch('/api/search', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query, limit: 5 })
      });
      // Bug fix: the original had try/finally with no catch and never checked
      // res.ok, so a network failure or non-2xx response became an unhandled
      // rejection. Surface the failure to the user instead.
      if (!res.ok) {
        throw new Error(`Search failed with status ${res.status}`);
      }
      const data = await res.json();
      setResults(data.results);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Search failed');
      setResults([]);
    } finally {
      setLoading(false);
    }
  };

  return (
    <div>
      <input
        value={query}
        onChange={(e) => setQuery(e.target.value)}
        placeholder="Search semantically..."
        onKeyDown={(e) => e.key === 'Enter' && handleSearch()}
      />
      <button onClick={handleSearch} disabled={loading}>
        {loading ? 'Searching...' : 'Search'}
      </button>

      {error && <div className="error">{error}</div>}

      <div className="results">
        {results.map((result, i) => (
          <div key={i} className="result">
            <div className="score">
              {(result.score * 100).toFixed(1)}% match
            </div>
            <p>{result.content}</p>
          </div>
        ))}
      </div>
    </div>
  );
}

Performance Optimization

# 1. Batch embedding requests
def batch_embed(texts: list[str], batch_size: int = 100):
    """Embed texts in fixed-size batches to stay under API rate limits."""
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        all_embeddings.extend(embed(texts[start:start + batch_size]))
    return all_embeddings

# 2. Use caching for frequent queries
from functools import lru_cache
import hashlib

@lru_cache(maxsize=1000)
def cached_embed(text: str) -> tuple[float, ...]:
    """Memoize embeddings for repeated queries (tuple return keeps them hashable)."""
    vector = embed([text])[0]
    return tuple(float(v) for v in vector)

# 3. Async for concurrent operations
import asyncio
from openai import AsyncOpenAI

async def async_embed(texts: list[str]) -> list[list[float]]:
    """Embed texts with the async OpenAI client for concurrent pipelines."""
    client = AsyncOpenAI()
    result = await client.embeddings.create(
        model="text-embedding-3-small",
        input=texts
    )
    return [record.embedding for record in result.data]

# 4. Use quantization for large scale
from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig

# Reduces memory by 4x
# NOTE(review): `client` is assumed to be a QdrantClient instance
# (e.g. `search.qdrant` from the examples above) — it is not defined
# in this snippet; confirm before copying.
client.create_collection(
    collection_name="large_collection",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    # int8 scalar quantization stores each vector component in one byte
    quantization_config=ScalarQuantization(
        scalar=ScalarQuantizationConfig(type="int8")
    )
)

Best Practices

  • Chunk wisely - 256-512 tokens typically work well
  • Use overlap - 10-20% overlap prevents context loss
  • Batch operations - Index and embed in batches
  • Cache embeddings - Avoid re-computing for same content
  • Store source metadata - Track document origins
  • Use filters - Narrow search space when possible

Key Takeaways

  • Semantic search finds results by meaning, not keywords
  • Pipeline: Chunk → Embed → Index → Query → Rank
  • Document processing is critical for search quality
  • Batch and cache embeddings for performance
  • Metadata filtering improves precision

In the next lesson, we'll explore hybrid search, combining semantic and lexical approaches for even better results.