Vector Databases & Embeddings

0 of 12 lessons completed

Building Semantic Search Applications

Semantic search finds results based on meaning rather than exact keyword matches. This lesson shows you how to build end-to-end semantic search systems using vector databases.

Semantic vs. Lexical Search

| Aspect | Lexical (Keyword) | Semantic |
| --- | --- | --- |
| Matching | Exact keywords | Meaning/context |
| Query: "car" | Only docs with "car" | Also: vehicle, automobile, sedan |
| Synonyms | ❌ Misses them | ✅ Understands them |
| Context | ❌ No understanding | ✅ Considers context |
| Speed | Very fast | Fast (with ANN) |

End-to-End Semantic Search Pipeline

# Complete semantic search system

import hashlib
import uuid

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

class SemanticSearch:
    def __init__(self, collection_name: str = "documents"):
        self.openai = OpenAI()
        self.qdrant = QdrantClient("localhost", port=6333)
        self.collection_name = collection_name
        self.model = "text-embedding-3-small"
        self.dimension = 1536
        
        self._ensure_collection()
    
    def _ensure_collection(self):
        """Create collection if it doesn't exist"""
        collections = self.qdrant.get_collections().collections
        if not any(c.name == self.collection_name for c in collections):
            self.qdrant.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.dimension,
                    distance=Distance.COSINE
                )
            )
    
    def _embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for texts"""
        response = self.openai.embeddings.create(
            model=self.model,
            input=texts
        )
        return [item.embedding for item in response.data]
    
    def _generate_id(self, text: str) -> str:
        """Generate deterministic ID from text"""
        return hashlib.md5(text.encode()).hexdigest()[:16]
    
    def index(self, documents: list[dict]):
        """Index documents with metadata
        
        Args:
            documents: List of {"content": str, "metadata": dict}
        """
        texts = [doc["content"] for doc in documents]
        embeddings = self._embed(texts)
        
        points = [
            PointStruct(
                id=self._generate_id(doc["content"]),
                vector=embedding,
                payload={
                    "content": doc["content"],
                    **doc.get("metadata", {})
                }
            )
            for doc, embedding in zip(documents, embeddings)
        ]
        
        self.qdrant.upsert(
            collection_name=self.collection_name,
            points=points
        )
        
        return len(points)
    
    def search(self, query: str, limit: int = 5, filter_dict: dict = None):
        """Semantic search
        
        Args:
            query: Search query
            limit: Number of results
            filter_dict: Optional filter conditions
        """
        query_embedding = self._embed([query])[0]
        
        # Build filter if provided
        query_filter = None
        if filter_dict:
            from qdrant_client.models import Filter, FieldCondition, MatchValue
            conditions = [
                FieldCondition(key=k, match=MatchValue(value=v))
                for k, v in filter_dict.items()
            ]
            query_filter = Filter(must=conditions)
        
        results = self.qdrant.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            query_filter=query_filter,
            limit=limit
        )
        
        return [
            {
                "content": hit.payload["content"],
                "score": hit.score,
                "metadata": {k: v for k, v in hit.payload.items() if k != "content"}
            }
            for hit in results
        ]

Usage Example

# Create a search index backed by the "product_docs" collection
search = SemanticSearch("product_docs")

# Each document pairs free-text content with filterable metadata
catalog = [
    {
        "content": "The MacBook Pro features the M3 chip with 8-core CPU...",
        "metadata": {"category": "laptops", "brand": "Apple"},
    },
    {
        "content": "Dell XPS 15 comes with Intel Core i9 processor...",
        "metadata": {"category": "laptops", "brand": "Dell"},
    },
    {
        "content": "iPhone 15 Pro has a titanium design...",
        "metadata": {"category": "phones", "brand": "Apple"},
    },
]

search.index(catalog)

# Plain semantic query: returns the MacBook Pro and Dell XPS
# even though neither document contains the query keywords verbatim
results = search.search("powerful laptop for development")

# Same query style with a metadata filter: only Apple products are returned
results = search.search(
    "best computer for coding",
    filter_dict={"brand": "Apple"},
)

Document Processing Pipeline

from typing import Generator
import tiktoken

class DocumentProcessor:
    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
    
    def chunk_document(self, text: str, source: str = None) -> list[dict]:
        """Split document into overlapping chunks"""
        tokens = self.tokenizer.encode(text)
        chunks = []
        
        start = 0
        while start < len(tokens):
            end = start + self.chunk_size
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            
            chunks.append({
                "content": chunk_text,
                "metadata": {
                    "source": source,
                    "chunk_index": len(chunks),
                    "token_count": len(chunk_tokens)
                }
            })
            
            start = end - self.chunk_overlap
        
        return chunks
    
    def process_documents(self, documents: list[dict]) -> Generator[dict, None, None]:
        """Process multiple documents"""
        for doc in documents:
            chunks = self.chunk_document(
                doc["content"],
                source=doc.get("source", "unknown")
            )
            for chunk in chunks:
                # Merge original metadata
                chunk["metadata"].update(doc.get("metadata", {}))
                yield chunk

# Usage: chunk long documents, then hand the chunks to the indexer
processor = DocumentProcessor(chunk_size=512, chunk_overlap=50)

# Large source documents to split before indexing
source_docs = [
    {"content": "Very long document text...", "source": "manual.pdf"},
    {"content": "Another long document...", "source": "guide.pdf"},
]

chunks = list(processor.process_documents(source_docs))
search.index(chunks)

Web Application with FastAPI

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Application state: one FastAPI app and one shared search client for all requests
app = FastAPI()
search = SemanticSearch("documents")

class SearchRequest(BaseModel):
    """Request body for POST /search."""
    query: str
    limit: int = 5
    # Was `filters: dict = None` — an invalid annotation/default pair;
    # the field is genuinely optional, so say so in the type.
    filters: dict | None = None

class IndexRequest(BaseModel):
    """Request body for POST /index: items shaped {"content": str, "metadata": dict}."""
    documents: list[dict]

@app.post("/index")
async def index_documents(request: IndexRequest):
    """Index the submitted documents; returns the number of points written.

    Raises:
        HTTPException: 400 if the request contains no documents.
    """
    if not request.documents:
        # Fail fast instead of forwarding an empty batch to the embedding API
        raise HTTPException(status_code=400, detail="No documents provided")
    count = search.index(request.documents)
    return {"indexed": count}

@app.post("/search")
async def search_documents(request: SearchRequest):
    """Run a semantic query and return the ranked hits."""
    hits = search.search(
        query=request.query,
        limit=request.limit,
        filter_dict=request.filters,
    )
    return {"results": hits}

# Run: uvicorn app:app --reload

Next.js/TypeScript Frontend

// components/SemanticSearch.tsx
'use client';
import { useState } from 'react';

interface SearchResult {
  content: string;
  score: number;
  metadata: Record<string, string>;
}

export function SemanticSearch() {
  const [query, setQuery] = useState('');
  const [results, setResults] = useState<SearchResult[]>([]);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const handleSearch = async () => {
    setLoading(true);
    setError(null);
    try {
      const res = await fetch('/api/search', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query, limit: 5 })
      });
      // Bug fix: the original had try/finally with no catch and never checked
      // res.ok, so a network failure or non-2xx response became an unhandled
      // rejection. Surface the failure to the user instead.
      if (!res.ok) {
        throw new Error(`Search failed with status ${res.status}`);
      }
      const data = await res.json();
      setResults(data.results);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Search failed');
      setResults([]);
    } finally {
      setLoading(false);
    }
  };

  return (
    <div>
      <input
        value={query}
        onChange={(e) => setQuery(e.target.value)}
        placeholder="Search semantically..."
        onKeyDown={(e) => e.key === 'Enter' && handleSearch()}
      />
      <button onClick={handleSearch} disabled={loading}>
        {loading ? 'Searching...' : 'Search'}
      </button>

      {error && <div className="error">{error}</div>}

      <div className="results">
        {results.map((result, i) => (
          <div key={i} className="result">
            <div className="score">
              {(result.score * 100).toFixed(1)}% match
            </div>
            <p>{result.content}</p>
          </div>
        ))}
      </div>
    </div>
  );
}

Performance Optimization

# 1. Batch embedding requests
def batch_embed(texts: list[str], batch_size: int = 100):
    """Embed texts in fixed-size batches to stay under API rate limits."""
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        all_embeddings.extend(embed(texts[start:start + batch_size]))
    return all_embeddings

# 2. Use caching for frequent queries
from functools import lru_cache
import hashlib

@lru_cache(maxsize=1000)
def cached_embed(text: str) -> tuple[float, ...]:
    """Memoize embeddings for repeated queries (tuple return keeps them hashable)."""
    vector = embed([text])[0]
    return tuple(float(v) for v in vector)

# 3. Async for concurrent operations
import asyncio
from openai import AsyncOpenAI

async def async_embed(texts: list[str]) -> list[list[float]]:
    """Embed texts with the async OpenAI client for concurrent pipelines."""
    client = AsyncOpenAI()
    result = await client.embeddings.create(
        model="text-embedding-3-small",
        input=texts
    )
    return [record.embedding for record in result.data]

# 4. Use quantization for large scale
from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig

# Reduces memory by 4x
# NOTE(review): `client` is assumed to be a QdrantClient instance
# (e.g. `search.qdrant` from the examples above) — it is not defined
# in this snippet; confirm before copying.
client.create_collection(
    collection_name="large_collection",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    # int8 scalar quantization stores each vector component in one byte
    quantization_config=ScalarQuantization(
        scalar=ScalarQuantizationConfig(type="int8")
    )
)

Best Practices

  • Chunk wisely - 256-512 tokens typically work well
  • Use overlap - 10-20% overlap prevents context loss
  • Batch operations - Index and embed in batches
  • Cache embeddings - Avoid re-computing for same content
  • Store source metadata - Track document origins
  • Use filters - Narrow search space when possible

Key Takeaways

  • Semantic search finds results by meaning, not keywords
  • Pipeline: Chunk → Embed → Index → Query → Rank
  • Document processing is critical for search quality
  • Batch and cache embeddings for performance
  • Metadata filtering improves precision

In the next lesson, we'll explore hybrid search, combining semantic and lexical approaches for even better results.