Vector Databases & Embeddings

0 of 12 lessons completed

Pinecone: Managed Vector Database

Pinecone is a fully managed, serverless vector database designed for production AI applications. This lesson covers Pinecone's architecture, features, and hands-on implementation.

Why Pinecone?

  • Fully Managed - No infrastructure to manage, automatic scaling
  • Serverless - Pay only for what you use, scales to zero
  • Low Latency - Single-digit millisecond queries at scale
  • Enterprise Features - SOC2, HIPAA, encryption, access controls
  • Hybrid Search - Combine vector and keyword search

Getting Started

# Install
# (shell command — everything after it in this snippet is Python)
pip install pinecone
from pinecone import Pinecone, ServerlessSpec

# Initialize client
# Never hard-code real keys — load them from configuration or the environment.
pc = Pinecone(api_key="your-api-key")

# Create serverless index
# dimension must match the embedding model used later
# (text-embedding-3-small -> 1536).
pc.create_index(
    name="my-rag-index",
    dimension=1536,  # OpenAI embedding dimension
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

# Connect to index
index = pc.Index("my-rag-index")

# Check index stats
print(index.describe_index_stats())

Basic Operations

Upserting Vectors

from openai import OpenAI

openai = OpenAI()

def get_embedding(text: str) -> list[float]:
    """Embed *text* with OpenAI's text-embedding-3-small model."""
    result = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return result.data[0].embedding

# Prepare vectors with metadata
documents = [
    {"id": "doc1", "text": "Machine learning is a subset of AI..."},
    {"id": "doc2", "text": "Neural networks are inspired by the brain..."},
    {"id": "doc3", "text": "Deep learning uses multiple layers..."},
]

# Store the raw text in metadata so query results can show it without a
# second lookup.
vectors = [
    {
        "id": doc["id"],
        "values": get_embedding(doc["text"]),
        "metadata": {
            "text": doc["text"],
            "source": "documentation",
            "category": "ml"
        }
    }
    for doc in documents
]

# Upsert (insert or update)
index.upsert(vectors=vectors, namespace="docs")

# Batch upsert for large datasets
# NOTE: this re-sends the records upserted just above; since an upsert with
# the same ID overwrites in place, running both here is redundant but
# harmless — pick one approach in real code.
batch_size = 100
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    index.upsert(vectors=batch, namespace="docs")

Querying

# Basic query
query = "What is deep learning?"
# Queries must be embedded with the same model used for the documents.
query_embedding = get_embedding(query)

results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="docs",
    include_metadata=True  # return stored metadata alongside scores
)

# Matches come back ranked by similarity score.
for match in results.matches:
    print(f"Score: {match.score:.4f}")
    print(f"Text: {match.metadata['text'][:100]}...")
    print("---")

Filtering with Metadata

# Query with filters
# A match must satisfy the top-level condition AND at least one $or branch.
# NOTE(review): range operators like $gte generally apply to numeric
# metadata — confirm string dates work, or store dates as numbers/timestamps.
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="docs",
    include_metadata=True,
    filter={
        "category": {"$eq": "ml"},
        "$or": [
            {"source": {"$eq": "documentation"}},
            {"date": {"$gte": "2024-01-01"}}
        ]
    }
)

# Available filter operators:
# $eq, $ne - equals, not equals
# $gt, $gte, $lt, $lte - comparisons
# $in, $nin - in list, not in list
# $and, $or - logical operators

Namespaces

Namespaces partition your index for multi-tenancy or logical separation.

# Different namespaces for different users/purposes
# (user1_docs / user2_docs are vector lists shaped like the ones above)
index.upsert(vectors=user1_docs, namespace="user-123")
index.upsert(vectors=user2_docs, namespace="user-456")

# Query only within a namespace
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="user-123"  # Only searches this user's data
)

# Delete namespace
# Removes every vector in the namespace — e.g. when a user deletes their account.
index.delete(delete_all=True, namespace="user-123")

Hybrid Search (Sparse-Dense)

Pinecone supports hybrid search combining dense vectors with sparse (BM25-style) vectors.

from pinecone_text.sparse import BM25Encoder  # separate package: pinecone-text

# Initialize BM25 encoder
bm25 = BM25Encoder()
# `corpus` is your full list of document texts — presumably the encoder
# learns corpus term statistics (BM25 idf) from it; it must be fitted
# before encoding anything.
bm25.fit(corpus)  # Fit on your documents

# Create hybrid vectors
def create_hybrid_vector(text: str) -> dict:
    """Return the dense + sparse fields for one hybrid Pinecone record."""
    dense = get_embedding(text)
    encoded = bm25.encode_documents([text])[0]
    sparse_values = {
        "indices": encoded["indices"],
        "values": encoded["values"],
    }
    return {"values": dense, "sparse_values": sparse_values}

# Upsert hybrid vectors
vectors = [
    {
        "id": doc["id"],
        **create_hybrid_vector(doc["text"]),  # adds "values" + "sparse_values"
        "metadata": {"text": doc["text"]}
    }
    for doc in documents
]
index.upsert(vectors=vectors)

# Hybrid query: send both representations of the same query text.
query_dense = get_embedding(query)
query_sparse = bm25.encode_queries([query])[0]

results = index.query(
    vector=query_dense,
    sparse_vector={
        "indices": query_sparse["indices"],
        "values": query_sparse["values"]
    },
    top_k=10,
    include_metadata=True
)

Managing Vectors

# Fetch vectors by ID
fetched = index.fetch(ids=["doc1", "doc2"], namespace="docs")
print(fetched.vectors["doc1"].metadata)

# Update metadata only (without re-embedding)
index.update(
    id="doc1",
    set_metadata={"category": "updated_category"},
    namespace="docs"
)

# Delete by ID
index.delete(ids=["doc1", "doc2"], namespace="docs")

# Delete by filter
# NOTE(review): metadata-filter deletes are not supported on every index
# type (serverless in particular) — confirm against the Pinecone docs.
index.delete(
    filter={"category": {"$eq": "old_category"}},
    namespace="docs"
)

# Delete entire namespace
index.delete(delete_all=True, namespace="docs")

Production Patterns

RAG Implementation

class PineconeRAG:
    """Minimal RAG pipeline on top of a Pinecone index.

    Embeds documents with OpenAI, stores them in Pinecone, and answers
    questions by retrieving the top matches and prompting a chat model.
    """

    def __init__(self, index_name: str):
        """Connect to an existing Pinecone index named *index_name*.

        Pinecone() and OpenAI() with no arguments read their API keys
        from the environment.
        """
        self.pc = Pinecone()
        self.index = self.pc.Index(index_name)
        self.openai = OpenAI()

    def embed(self, text: str) -> list[float]:
        """Return the embedding vector for *text* (text-embedding-3-small)."""
        response = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    def add_documents(
        self,
        documents: list[dict],
        namespace: str = "",
        batch_size: int = 100,
    ):
        """Embed and upsert *documents* into the index.

        Each document needs "id" and "text" keys; an optional "metadata"
        dict is merged into the stored metadata. The raw text is always
        stored so search results can return it without a second lookup.
        *batch_size* controls upsert batching (default 100, as elsewhere
        in this lesson) to keep individual requests small.
        """
        vectors = [
            {
                "id": doc["id"],
                "values": self.embed(doc["text"]),
                "metadata": {
                    "text": doc["text"],
                    **doc.get("metadata", {}),
                },
            }
            for doc in documents
        ]

        # Batch upsert to stay within per-request size limits.
        for start in range(0, len(vectors), batch_size):
            self.index.upsert(
                vectors=vectors[start:start + batch_size],
                namespace=namespace,
            )

    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: dict | None = None,  # Pinecone metadata filter, e.g. {"k": {"$eq": v}}
        namespace: str = ""
    ) -> list[dict]:
        """Embed *query* and return the *top_k* best matches.

        Each result dict has "id", "score", "text" (empty string when the
        stored vector has no text metadata), and the full "metadata".
        """
        results = self.index.query(
            vector=self.embed(query),
            top_k=top_k,
            filter=filter,
            namespace=namespace,
            include_metadata=True
        )

        return [
            {
                "id": m.id,
                "score": m.score,
                "text": m.metadata.get("text", ""),
                "metadata": m.metadata,
            }
            for m in results.matches
        ]

    def query(self, question: str, namespace: str = "") -> str:
        """Answer *question* using retrieved context (classic RAG loop)."""
        # Retrieve relevant documents
        results = self.search(question, top_k=5, namespace=namespace)

        # Build the context block the model will ground its answer in.
        context = "\n---\n".join([r["text"] for r in results])

        # Generate answer
        response = self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "Answer based on the provided context."
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion: {question}"
                }
            ]
        )

        return response.choices[0].message.content

# Usage
rag = PineconeRAG("my-rag-index")  # the index must already exist
answer = rag.query("What is deep learning?")

Async Operations

import asyncio
from pinecone import Pinecone

async def async_search(queries: list[str]):
    """Embed and run one Pinecone query per input string, concurrently.

    Returns the query results in the same order as *queries*.
    """
    pc = Pinecone()
    index = pc.Index("my-index")

    async def _search_one(query: str):
        # get_embedding is a blocking HTTP call too. The original version
        # computed all embeddings serially on the event-loop thread before
        # starting any query task, so only the Pinecone queries overlapped.
        # Running the embedding in a worker thread as well lets each
        # query's whole pipeline proceed concurrently.
        embedding = await asyncio.to_thread(get_embedding, query)
        return await asyncio.to_thread(index.query, vector=embedding, top_k=5)

    # gather preserves input order in its result list.
    return await asyncio.gather(*(_search_one(q) for q in queries))

Index Management

# List all indexes
indexes = pc.list_indexes()
print([idx.name for idx in indexes])

# Describe index (configuration + readiness)
index_info = pc.describe_index("my-index")
print(f"Dimension: {index_info.dimension}")
print(f"Metric: {index_info.metric}")
print(f"Status: {index_info.status}")

# Get index stats (live contents: vector counts, namespaces)
stats = index.describe_index_stats()
print(f"Total vectors: {stats.total_vector_count}")
print(f"Namespaces: {stats.namespaces}")

# Delete index
# WARNING: irreversible — removes the index and every vector in it.
pc.delete_index("old-index")

Pricing Considerations

  • Serverless - Pay per read/write units, good for variable workloads
  • Pod-based - Fixed capacity, better for consistent high-volume
  • Storage - Charged per GB stored
  • Free tier - 1 serverless index, 2GB storage, 1M reads/month

Key Takeaways

  • Pinecone is ideal for production with zero infrastructure management
  • Namespaces enable multi-tenancy within a single index
  • Metadata filtering supports complex queries
  • Hybrid search combines semantic and keyword matching
  • Batch operations (100 vectors per batch) for efficiency
  • Serverless pricing works well for variable workloads

In the next lesson, we'll explore Weaviate and Qdrant - open-source alternatives with different strengths.