Introduction: Retrieval is the foundation of RAG systems; the quality of retrieved documents directly bounds the quality of generation. Different retrieval strategies excel in different scenarios: dense retrieval captures semantic similarity, sparse retrieval handles exact keyword matches, and hybrid approaches combine both. This guide covers advanced retrieval techniques: embedding-based dense retrieval, BM25 and sparse methods, hybrid search strategies, query expansion and transformation, multi-stage retrieval with reranking, and optimizing retrieval for your specific use case. These patterns help you build RAG systems that consistently surface the right information.
Dense Retrieval with Embeddings
from openai import OpenAI
import numpy as np
from typing import Optional
client = OpenAI()
class DenseRetriever:
"""Dense retrieval using embeddings."""
def __init__(self, model: str = "text-embedding-3-small"):
self.model = model
self.documents: list[dict] = []
        self.embeddings: Optional[np.ndarray] = None
def embed(self, texts: list[str]) -> np.ndarray:
"""Get embeddings for texts."""
response = client.embeddings.create(
model=self.model,
input=texts
)
return np.array([e.embedding for e in response.data])
def add_documents(self, documents: list[dict]):
"""Add documents to the index."""
texts = [doc["content"] for doc in documents]
new_embeddings = self.embed(texts)
if self.embeddings is None:
self.embeddings = new_embeddings
else:
self.embeddings = np.vstack([self.embeddings, new_embeddings])
self.documents.extend(documents)
def search(
self,
query: str,
k: int = 5,
threshold: float = 0.0
) -> list[dict]:
"""Search for similar documents."""
query_embedding = self.embed([query])[0]
# Cosine similarity
similarities = np.dot(self.embeddings, query_embedding) / (
np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
)
# Get top-k indices
top_indices = np.argsort(similarities)[::-1][:k]
results = []
for idx in top_indices:
score = similarities[idx]
if score >= threshold:
results.append({
**self.documents[idx],
"score": float(score)
})
return results
# Usage
retriever = DenseRetriever()
documents = [
{"id": "1", "content": "Python is a programming language known for its simplicity."},
{"id": "2", "content": "Machine learning models can learn patterns from data."},
{"id": "3", "content": "Neural networks are inspired by biological neurons."},
]
retriever.add_documents(documents)
results = retriever.search("What is deep learning?", k=2)
for r in results:
print(f"Score: {r['score']:.3f} - {r['content'][:50]}...")
Sparse Retrieval with BM25
# pip install rank_bm25
from rank_bm25 import BM25Okapi
import numpy as np
import re
from typing import Callable, Optional
class SparseRetriever:
"""Sparse retrieval using BM25."""
    def __init__(self, tokenizer: Optional[Callable[[str], list[str]]] = None):
        self.tokenizer = tokenizer or self._default_tokenizer
        self.documents: list[dict] = []
        self.bm25: Optional[BM25Okapi] = None
        self.corpus: list[list[str]] = []
def _default_tokenizer(self, text: str) -> list[str]:
"""Simple tokenizer."""
# Lowercase and split on non-alphanumeric
text = text.lower()
tokens = re.findall(r'\b\w+\b', text)
return tokens
def add_documents(self, documents: list[dict]):
"""Add documents to the index."""
for doc in documents:
tokens = self.tokenizer(doc["content"])
self.corpus.append(tokens)
self.documents.append(doc)
# Rebuild BM25 index
self.bm25 = BM25Okapi(self.corpus)
def search(self, query: str, k: int = 5) -> list[dict]:
"""Search using BM25."""
query_tokens = self.tokenizer(query)
scores = self.bm25.get_scores(query_tokens)
# Get top-k indices
top_indices = np.argsort(scores)[::-1][:k]
results = []
for idx in top_indices:
if scores[idx] > 0:
results.append({
**self.documents[idx],
"score": float(scores[idx])
})
return results
# Usage
sparse_retriever = SparseRetriever()
sparse_retriever.add_documents(documents)
results = sparse_retriever.search("programming language", k=2)
for r in results:
print(f"Score: {r['score']:.3f} - {r['content'][:50]}...")
Hybrid Search
class HybridRetriever:
"""Combine dense and sparse retrieval."""
def __init__(
self,
dense_weight: float = 0.5,
sparse_weight: float = 0.5
):
self.dense = DenseRetriever()
self.sparse = SparseRetriever()
self.dense_weight = dense_weight
self.sparse_weight = sparse_weight
def add_documents(self, documents: list[dict]):
"""Add documents to both indexes."""
self.dense.add_documents(documents)
self.sparse.add_documents(documents)
    def _normalize_scores(self, results: list[dict]) -> dict[str, float]:
        """Min-max normalize scores to the 0-1 range."""
        if not results:
            return {}
        scores = [r["score"] for r in results]
        min_score, max_score = min(scores), max(scores)
        if max_score == min_score:
            # All scores tie (e.g., a single result); treat them as equally relevant
            return {r["id"]: 1.0 for r in results}
        return {
            r["id"]: (r["score"] - min_score) / (max_score - min_score)
            for r in results
        }
def search(
self,
query: str,
k: int = 5,
dense_k: int = 20,
sparse_k: int = 20
) -> list[dict]:
"""Hybrid search combining dense and sparse."""
# Get results from both
dense_results = self.dense.search(query, k=dense_k)
sparse_results = self.sparse.search(query, k=sparse_k)
# Normalize scores
dense_scores = self._normalize_scores(dense_results)
sparse_scores = self._normalize_scores(sparse_results)
# Combine scores
all_ids = set(dense_scores.keys()) | set(sparse_scores.keys())
combined_scores = {}
for doc_id in all_ids:
dense_score = dense_scores.get(doc_id, 0) * self.dense_weight
sparse_score = sparse_scores.get(doc_id, 0) * self.sparse_weight
combined_scores[doc_id] = dense_score + sparse_score
# Sort by combined score
sorted_ids = sorted(combined_scores.keys(), key=lambda x: combined_scores[x], reverse=True)
# Build results
doc_map = {d["id"]: d for d in self.dense.documents}
results = []
for doc_id in sorted_ids[:k]:
doc = doc_map[doc_id]
results.append({
**doc,
"score": combined_scores[doc_id],
"dense_score": dense_scores.get(doc_id, 0),
"sparse_score": sparse_scores.get(doc_id, 0)
})
return results
# Usage
hybrid = HybridRetriever(dense_weight=0.7, sparse_weight=0.3)
hybrid.add_documents(documents)
results = hybrid.search("neural network programming", k=3)
for r in results:
print(f"Combined: {r['score']:.3f} (dense: {r['dense_score']:.3f}, sparse: {r['sparse_score']:.3f})")
Query Expansion and Transformation
import json
class QueryTransformer:
"""Transform queries for better retrieval."""
def expand_query(self, query: str) -> list[str]:
"""Expand query with synonyms and related terms."""
prompt = f"""Generate 3 alternative phrasings of this search query.
Keep the same intent but use different words.
Query: {query}
Return JSON: {{"alternatives": ["query1", "query2", "query3"]}}"""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
data = json.loads(response.choices[0].message.content)
return [query] + data.get("alternatives", [])
def decompose_query(self, query: str) -> list[str]:
"""Break complex query into sub-queries."""
prompt = f"""Break this complex query into simpler sub-queries.
Each sub-query should retrieve part of the information needed.
Query: {query}
Return JSON: {{"sub_queries": ["query1", "query2", ...]}}"""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
data = json.loads(response.choices[0].message.content)
return data.get("sub_queries", [query])
def hypothetical_document(self, query: str) -> str:
"""Generate hypothetical document that would answer the query (HyDE)."""
prompt = f"""Write a short paragraph that would be a perfect answer to this question.
Write as if you're creating a document that contains the answer.
Question: {query}
Write 2-3 sentences that directly answer the question."""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content
class EnhancedRetriever:
"""Retriever with query transformation."""
def __init__(self, base_retriever):
self.retriever = base_retriever
self.transformer = QueryTransformer()
def search_with_expansion(self, query: str, k: int = 5) -> list[dict]:
"""Search with query expansion."""
queries = self.transformer.expand_query(query)
all_results = {}
for q in queries:
results = self.retriever.search(q, k=k)
for r in results:
if r["id"] not in all_results or r["score"] > all_results[r["id"]]["score"]:
all_results[r["id"]] = r
# Sort by score
sorted_results = sorted(all_results.values(), key=lambda x: x["score"], reverse=True)
return sorted_results[:k]
def search_with_hyde(self, query: str, k: int = 5) -> list[dict]:
"""Search using HyDE (Hypothetical Document Embeddings)."""
# Generate hypothetical document
hypo_doc = self.transformer.hypothetical_document(query)
# Search using the hypothetical document as query
return self.retriever.search(hypo_doc, k=k)
# Usage
enhanced = EnhancedRetriever(retriever)
# Query expansion
results = enhanced.search_with_expansion("How do neural networks learn?", k=3)
# HyDE
results = enhanced.search_with_hyde("What is backpropagation?", k=3)
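The decompose_query helper above is not wired into EnhancedRetriever; one way to use it for multi-part questions is to follow the same merge-by-best-score pattern as search_with_expansion. A sketch (the function name is ours):
def search_with_decomposition(enhanced: EnhancedRetriever, query: str, k: int = 5) -> list[dict]:
    """Retrieve for each sub-query, then keep the best score per document."""
    sub_queries = enhanced.transformer.decompose_query(query)
    all_results: dict[str, dict] = {}
    for q in sub_queries:
        for r in enhanced.retriever.search(q, k=k):
            if r["id"] not in all_results or r["score"] > all_results[r["id"]]["score"]:
                all_results[r["id"]] = r
    return sorted(all_results.values(), key=lambda x: x["score"], reverse=True)[:k]
results = search_with_decomposition(enhanced, "How do neural networks learn, and where are they used?", k=3)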
Reranking
# pip install sentence-transformers
from sentence_transformers import CrossEncoder
from typing import Optional
class Reranker:
"""Rerank retrieved documents for better relevance."""
def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
self.model = CrossEncoder(model_name)
def rerank(
self,
query: str,
documents: list[dict],
        k: Optional[int] = None
) -> list[dict]:
"""Rerank documents using cross-encoder."""
if not documents:
return []
# Prepare pairs
pairs = [(query, doc["content"]) for doc in documents]
# Get scores
scores = self.model.predict(pairs)
# Add scores to documents
for doc, score in zip(documents, scores):
doc["rerank_score"] = float(score)
# Sort by rerank score
sorted_docs = sorted(documents, key=lambda x: x["rerank_score"], reverse=True)
        if k is not None:
            return sorted_docs[:k]
        return sorted_docs
class LLMReranker:
"""Rerank using LLM for more nuanced relevance."""
def rerank(
self,
query: str,
documents: list[dict],
k: int = 5
) -> list[dict]:
"""Rerank using LLM."""
if len(documents) <= k:
return documents
# Format documents for LLM
doc_texts = "\n\n".join([
f"[{i}] {doc['content'][:500]}"
for i, doc in enumerate(documents)
])
prompt = f"""Rank these documents by relevance to the query.
Return the indices of the top {k} most relevant documents in order.
Query: {query}
Documents:
{doc_texts}
Return JSON: {{"ranking": [index1, index2, ...]}}"""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
data = json.loads(response.choices[0].message.content)
ranking = data.get("ranking", list(range(k)))
        return [documents[i] for i in ranking if i < len(documents)][:k]
# Two-stage retrieval
class TwoStageRetriever:
"""Retrieve then rerank for better results."""
def __init__(self, retriever, reranker):
self.retriever = retriever
self.reranker = reranker
def search(
self,
query: str,
k: int = 5,
retrieve_k: int = 20
) -> list[dict]:
"""Two-stage retrieval: retrieve many, rerank to top-k."""
# Stage 1: Retrieve candidates
candidates = self.retriever.search(query, k=retrieve_k)
# Stage 2: Rerank
reranked = self.reranker.rerank(query, candidates, k=k)
return reranked
# Usage
reranker = Reranker()
two_stage = TwoStageRetriever(hybrid, reranker)
results = two_stage.search("machine learning basics", k=3, retrieve_k=10)
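LLMReranker exposes the same rerank() signature, so it can be dropped into the same pipeline when you want LLM-judged relevance and can tolerate the extra latency and cost:
llm_two_stage = TwoStageRetriever(hybrid, LLMReranker())
results = llm_two_stage.search("machine learning basics", k=3, retrieve_k=10)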
Contextual Retrieval
class ContextualRetriever:
"""Add context to chunks before embedding."""
def __init__(self, retriever):
self.retriever = retriever
def add_context_to_chunk(
self,
chunk: str,
document_context: str
) -> str:
"""Add document context to chunk for better embeddings."""
prompt = f"""Generate a brief context for this chunk based on the full document.
The context should help understand what this chunk is about.
Document context: {document_context[:1000]}
Chunk: {chunk}
Write 1-2 sentences of context to prepend to the chunk."""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
max_tokens=100
)
context = response.choices[0].message.content
return f"{context}\n\n{chunk}"
def add_documents_with_context(
self,
documents: list[dict],
chunk_size: int = 500
):
"""Chunk documents and add context."""
contextualized_chunks = []
for doc in documents:
content = doc["content"]
doc_context = content[:500] # Use beginning as context
# Simple chunking
chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
for i, chunk in enumerate(chunks):
contextualized = self.add_context_to_chunk(chunk, doc_context)
contextualized_chunks.append({
"id": f"{doc['id']}_chunk_{i}",
"content": contextualized,
"original_content": chunk,
"source_doc": doc["id"]
})
self.retriever.add_documents(contextualized_chunks)
# Usage
contextual = ContextualRetriever(DenseRetriever())
contextual.add_documents_with_context(documents)
Production Retrieval Service
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
class SearchRequest(BaseModel):
query: str
k: int = 5
strategy: str = "hybrid" # dense, sparse, hybrid
use_reranking: bool = True
use_query_expansion: bool = False
class SearchResult(BaseModel):
id: str
content: str
score: float
metadata: dict = {}
class SearchResponse(BaseModel):
results: list[SearchResult]
query_variants: list[str] = []
# Initialize retrievers
dense_retriever = DenseRetriever()
sparse_retriever = SparseRetriever()
hybrid_retriever = HybridRetriever()
reranker = Reranker()
transformer = QueryTransformer()
@app.post("/search", response_model=SearchResponse)
async def search(request: SearchRequest):
"""Search documents with configurable strategy."""
query_variants = [request.query]
# Query expansion
if request.use_query_expansion:
query_variants = transformer.expand_query(request.query)
# Select retriever
if request.strategy == "dense":
retriever = dense_retriever
elif request.strategy == "sparse":
retriever = sparse_retriever
else:
retriever = hybrid_retriever
# Retrieve
retrieve_k = request.k * 4 if request.use_reranking else request.k
all_results = {}
for query in query_variants:
results = retriever.search(query, k=retrieve_k)
for r in results:
if r["id"] not in all_results or r["score"] > all_results[r["id"]]["score"]:
all_results[r["id"]] = r
candidates = list(all_results.values())
# Rerank
if request.use_reranking and candidates:
candidates = reranker.rerank(request.query, candidates, k=request.k)
else:
candidates = sorted(candidates, key=lambda x: x["score"], reverse=True)[:request.k]
return SearchResponse(
results=[
SearchResult(
id=r["id"],
content=r["content"],
score=r.get("rerank_score", r["score"]),
metadata=r.get("metadata", {})
)
for r in candidates
],
query_variants=query_variants
)
@app.post("/index")
async def index_documents(documents: list[dict]):
"""Add documents to all indexes."""
dense_retriever.add_documents(documents)
sparse_retriever.add_documents(documents)
hybrid_retriever.add_documents(documents)
return {"indexed": len(documents)}
References
- Dense Passage Retrieval: https://arxiv.org/abs/2004.04906
- HyDE Paper: https://arxiv.org/abs/2212.10496
- Sentence Transformers: https://www.sbert.net/
- Anthropic Contextual Retrieval: https://www.anthropic.com/news/contextual-retrieval
Conclusion
Effective retrieval requires matching the strategy to your data and queries. Dense retrieval excels at semantic similarity but may miss exact keyword matches. Sparse retrieval handles specific terms well but lacks semantic understanding. Hybrid search combines both for robust results. Query expansion and HyDE improve recall for ambiguous queries. Two-stage retrieval with reranking provides the best precision by using fast retrieval for candidates and accurate reranking for final selection. Start with hybrid search and reranking as a strong baseline, then optimize based on your specific failure cases.