Vector Databases & Embeddings

0 of 12 lessons completed

Pinecone: Managed Vector Database

Pinecone is a fully managed, serverless vector database designed for production AI applications. This lesson covers Pinecone's architecture, features, and hands-on implementation.

Why Pinecone?

  • Fully Managed - No infrastructure to manage, automatic scaling
  • Serverless - Pay only for what you use, scales to zero
  • Low Latency - Single-digit millisecond queries at scale
  • Enterprise Features - SOC2, HIPAA, encryption, access controls
  • Hybrid Search - Combine vector and keyword search

Getting Started

# Install
# (shell command — everything after it in this snippet is Python)
pip install pinecone
from pinecone import Pinecone, ServerlessSpec

# Initialize client
# Never hard-code real keys — load them from configuration or the environment.
pc = Pinecone(api_key="your-api-key")

# Create serverless index
# dimension must match the embedding model used later
# (text-embedding-3-small -> 1536).
pc.create_index(
    name="my-rag-index",
    dimension=1536,  # OpenAI embedding dimension
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

# Connect to index
index = pc.Index("my-rag-index")

# Check index stats
print(index.describe_index_stats())

Basic Operations

Upserting Vectors

from openai import OpenAI

openai = OpenAI()

def get_embedding(text: str) -> list[float]:
    """Embed *text* with OpenAI's text-embedding-3-small model."""
    result = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return result.data[0].embedding

# Prepare vectors with metadata
documents = [
    {"id": "doc1", "text": "Machine learning is a subset of AI..."},
    {"id": "doc2", "text": "Neural networks are inspired by the brain..."},
    {"id": "doc3", "text": "Deep learning uses multiple layers..."},
]

# Store the raw text in metadata so query results can show it without a
# second lookup.
vectors = [
    {
        "id": doc["id"],
        "values": get_embedding(doc["text"]),
        "metadata": {
            "text": doc["text"],
            "source": "documentation",
            "category": "ml"
        }
    }
    for doc in documents
]

# Upsert (insert or update)
index.upsert(vectors=vectors, namespace="docs")

# Batch upsert for large datasets
# NOTE: this re-sends the records upserted just above; since an upsert with
# the same ID overwrites in place, running both here is redundant but
# harmless — pick one approach in real code.
batch_size = 100
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    index.upsert(vectors=batch, namespace="docs")

Querying

# Basic query
query = "What is deep learning?"
# Queries must be embedded with the same model used for the documents.
query_embedding = get_embedding(query)

results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="docs",
    include_metadata=True  # return stored metadata alongside scores
)

# Matches come back ranked by similarity score.
for match in results.matches:
    print(f"Score: {match.score:.4f}")
    print(f"Text: {match.metadata['text'][:100]}...")
    print("---")

Filtering with Metadata

# Query with filters
# A match must satisfy the top-level condition AND at least one $or branch.
# NOTE(review): range operators like $gte generally apply to numeric
# metadata — confirm string dates work, or store dates as numbers/timestamps.
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="docs",
    include_metadata=True,
    filter={
        "category": {"$eq": "ml"},
        "$or": [
            {"source": {"$eq": "documentation"}},
            {"date": {"$gte": "2024-01-01"}}
        ]
    }
)

# Available filter operators:
# $eq, $ne - equals, not equals
# $gt, $gte, $lt, $lte - comparisons
# $in, $nin - in list, not in list
# $and, $or - logical operators

Namespaces

Namespaces partition your index for multi-tenancy or logical separation.

# Different namespaces for different users/purposes
# (user1_docs / user2_docs are vector lists shaped like the ones above)
index.upsert(vectors=user1_docs, namespace="user-123")
index.upsert(vectors=user2_docs, namespace="user-456")

# Query only within a namespace
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="user-123"  # Only searches this user's data
)

# Delete namespace
# Removes every vector in the namespace — e.g. when a user deletes their account.
index.delete(delete_all=True, namespace="user-123")

Hybrid Search (Sparse-Dense)

Pinecone supports hybrid search combining dense vectors with sparse (BM25-style) vectors.

from pinecone_text.sparse import BM25Encoder  # separate package: pinecone-text

# Initialize BM25 encoder
bm25 = BM25Encoder()
# `corpus` is your full list of document texts — presumably the encoder
# learns corpus term statistics (BM25 idf) from it; it must be fitted
# before encoding anything.
bm25.fit(corpus)  # Fit on your documents

# Create hybrid vectors
def create_hybrid_vector(text: str) -> dict:
    """Return the dense + sparse fields for one hybrid Pinecone record."""
    dense = get_embedding(text)
    encoded = bm25.encode_documents([text])[0]
    sparse_values = {
        "indices": encoded["indices"],
        "values": encoded["values"],
    }
    return {"values": dense, "sparse_values": sparse_values}

# Upsert hybrid vectors
vectors = [
    {
        "id": doc["id"],
        **create_hybrid_vector(doc["text"]),  # adds "values" + "sparse_values"
        "metadata": {"text": doc["text"]}
    }
    for doc in documents
]
index.upsert(vectors=vectors)

# Hybrid query: send both representations of the same query text.
query_dense = get_embedding(query)
query_sparse = bm25.encode_queries([query])[0]

results = index.query(
    vector=query_dense,
    sparse_vector={
        "indices": query_sparse["indices"],
        "values": query_sparse["values"]
    },
    top_k=10,
    include_metadata=True
)

Managing Vectors

# Fetch vectors by ID
fetched = index.fetch(ids=["doc1", "doc2"], namespace="docs")
print(fetched.vectors["doc1"].metadata)

# Update metadata only (without re-embedding)
index.update(
    id="doc1",
    set_metadata={"category": "updated_category"},
    namespace="docs"
)

# Delete by ID
index.delete(ids=["doc1", "doc2"], namespace="docs")

# Delete by filter
# NOTE(review): metadata-filter deletes are not supported on every index
# type (serverless in particular) — confirm against the Pinecone docs.
index.delete(
    filter={"category": {"$eq": "old_category"}},
    namespace="docs"
)

# Delete entire namespace
index.delete(delete_all=True, namespace="docs")

Production Patterns

RAG Implementation

class PineconeRAG:
    """Minimal RAG pipeline on top of a Pinecone index.

    Embeds documents with OpenAI, stores them in Pinecone, and answers
    questions by retrieving the top matches and prompting a chat model.
    """

    def __init__(self, index_name: str):
        """Connect to an existing Pinecone index named *index_name*.

        Pinecone() and OpenAI() with no arguments read their API keys
        from the environment.
        """
        self.pc = Pinecone()
        self.index = self.pc.Index(index_name)
        self.openai = OpenAI()

    def embed(self, text: str) -> list[float]:
        """Return the embedding vector for *text* (text-embedding-3-small)."""
        response = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    def add_documents(
        self,
        documents: list[dict],
        namespace: str = "",
        batch_size: int = 100,
    ):
        """Embed and upsert *documents* into the index.

        Each document needs "id" and "text" keys; an optional "metadata"
        dict is merged into the stored metadata. The raw text is always
        stored so search results can return it without a second lookup.
        *batch_size* controls upsert batching (default 100, as elsewhere
        in this lesson) to keep individual requests small.
        """
        vectors = [
            {
                "id": doc["id"],
                "values": self.embed(doc["text"]),
                "metadata": {
                    "text": doc["text"],
                    **doc.get("metadata", {}),
                },
            }
            for doc in documents
        ]

        # Batch upsert to stay within per-request size limits.
        for start in range(0, len(vectors), batch_size):
            self.index.upsert(
                vectors=vectors[start:start + batch_size],
                namespace=namespace,
            )

    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: dict | None = None,  # Pinecone metadata filter, e.g. {"k": {"$eq": v}}
        namespace: str = ""
    ) -> list[dict]:
        """Embed *query* and return the *top_k* best matches.

        Each result dict has "id", "score", "text" (empty string when the
        stored vector has no text metadata), and the full "metadata".
        """
        results = self.index.query(
            vector=self.embed(query),
            top_k=top_k,
            filter=filter,
            namespace=namespace,
            include_metadata=True
        )

        return [
            {
                "id": m.id,
                "score": m.score,
                "text": m.metadata.get("text", ""),
                "metadata": m.metadata,
            }
            for m in results.matches
        ]

    def query(self, question: str, namespace: str = "") -> str:
        """Answer *question* using retrieved context (classic RAG loop)."""
        # Retrieve relevant documents
        results = self.search(question, top_k=5, namespace=namespace)

        # Build the context block the model will ground its answer in.
        context = "\n---\n".join([r["text"] for r in results])

        # Generate answer
        response = self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "Answer based on the provided context."
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion: {question}"
                }
            ]
        )

        return response.choices[0].message.content

# Usage
rag = PineconeRAG("my-rag-index")  # the index must already exist
answer = rag.query("What is deep learning?")

Async Operations

import asyncio
from pinecone import Pinecone

async def async_search(queries: list[str]):
    """Embed and run one Pinecone query per input string, concurrently.

    Returns the query results in the same order as *queries*.
    """
    pc = Pinecone()
    index = pc.Index("my-index")

    async def _search_one(query: str):
        # get_embedding is a blocking HTTP call too. The original version
        # computed all embeddings serially on the event-loop thread before
        # starting any query task, so only the Pinecone queries overlapped.
        # Running the embedding in a worker thread as well lets each
        # query's whole pipeline proceed concurrently.
        embedding = await asyncio.to_thread(get_embedding, query)
        return await asyncio.to_thread(index.query, vector=embedding, top_k=5)

    # gather preserves input order in its result list.
    return await asyncio.gather(*(_search_one(q) for q in queries))

Index Management

# List all indexes
indexes = pc.list_indexes()
print([idx.name for idx in indexes])

# Describe index (configuration + readiness)
index_info = pc.describe_index("my-index")
print(f"Dimension: {index_info.dimension}")
print(f"Metric: {index_info.metric}")
print(f"Status: {index_info.status}")

# Get index stats (live contents: vector counts, namespaces)
stats = index.describe_index_stats()
print(f"Total vectors: {stats.total_vector_count}")
print(f"Namespaces: {stats.namespaces}")

# Delete index
# WARNING: irreversible — removes the index and every vector in it.
pc.delete_index("old-index")

Pricing Considerations

  • Serverless - Pay per read/write units, good for variable workloads
  • Pod-based - Fixed capacity, better for consistent high-volume
  • Storage - Charged per GB stored
  • Free tier - 1 serverless index, 2GB storage, 1M reads/month

Key Takeaways

  • Pinecone is ideal for production with zero infrastructure management
  • Namespaces enable multi-tenancy within a single index
  • Metadata filtering supports complex queries
  • Hybrid search combines semantic and keyword matching
  • Batch operations (100 vectors per batch) for efficiency
  • Serverless pricing works well for variable workloads

In the next lesson, we'll explore Weaviate and Qdrant - open-source alternatives with different strengths.