Memory Systems for LLMs: Buffers, Summaries, and Vector Storage

Introduction

LLMs have no inherent memory: each request starts fresh. Building effective memory systems enables conversations that span sessions, personalization based on user history, and agents that learn from past interactions. Memory architectures range from simple conversation buffers to sophisticated vector-based long-term storage with semantic retrieval. This guide covers practical memory patterns: conversation buffers, sliding windows, summary-based compression, vector store memory, and hybrid systems that combine multiple approaches for optimal recall and context efficiency.

[Figure: Memory Systems: Short-Term Buffer, Long-Term Vector Store, Summary Memory]

Conversation Buffer Memory

from dataclasses import dataclass, field
from typing import Callable, Optional
from datetime import datetime
from collections import deque

@dataclass
class Message:
    """A single message in conversation."""
    
    role: str
    content: str
    timestamp: datetime = field(default_factory=datetime.now)
    metadata: dict = field(default_factory=dict)

class ConversationBuffer:
    """Simple buffer that stores all messages."""
    
    def __init__(self, max_messages: int = 100):
        self.messages: list[Message] = []
        self.max_messages = max_messages
    
    def add(self, role: str, content: str, **metadata):
        """Add message to buffer."""
        
        message = Message(
            role=role,
            content=content,
            metadata=metadata
        )
        
        self.messages.append(message)
        
        # Trim if over limit
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]
    
    def get_messages(self) -> list[dict]:
        """Get messages for API call."""
        
        return [
            {"role": m.role, "content": m.content}
            for m in self.messages
        ]
    
    def clear(self):
        """Clear all messages."""
        self.messages = []
    
    def get_context_string(self) -> str:
        """Get conversation as string."""
        
        return "\n".join([
            f"{m.role}: {m.content}"
            for m in self.messages
        ])

class SlidingWindowBuffer:
    """Buffer with sliding window based on token count."""
    
    def __init__(
        self,
        max_tokens: int = 4000,
        counter: Optional[Callable[[str], int]] = None
    ):
        self.max_tokens = max_tokens
        self.messages: deque = deque()
        self.current_tokens = 0
        
        # Default token counter: tiktoken's cl100k_base encoding
        if counter is None:
            import tiktoken
            enc = tiktoken.get_encoding("cl100k_base")
            self.count_tokens = lambda x: len(enc.encode(x))
        else:
            self.count_tokens = counter
    
    def add(self, role: str, content: str):
        """Add message and slide window if needed."""
        
        tokens = self.count_tokens(content) + 10  # ~10 tokens of per-message formatting overhead
        
        self.messages.append({
            "role": role,
            "content": content,
            "tokens": tokens
        })
        
        self.current_tokens += tokens
        
        # Slide window
        while self.current_tokens > self.max_tokens and len(self.messages) > 1:
            removed = self.messages.popleft()
            self.current_tokens -= removed["tokens"]
    
    def get_messages(self) -> list[dict]:
        """Get messages for API call."""
        
        return [
            {"role": m["role"], "content": m["content"]}
            for m in self.messages
        ]

# Token-aware buffer with priority
class PriorityBuffer:
    """Buffer that keeps important messages longer."""
    
    def __init__(self, max_tokens: int = 4000):
        self.max_tokens = max_tokens
        self.system_messages: list[dict] = []
        self.important_messages: list[dict] = []
        self.regular_messages: deque = deque()
        
        import tiktoken
        enc = tiktoken.get_encoding("cl100k_base")
        self.count_tokens = lambda x: len(enc.encode(x))
    
    def add(
        self,
        role: str,
        content: str,
        important: bool = False
    ):
        """Add message with priority."""
        
        message = {"role": role, "content": content}
        
        if role == "system":
            self.system_messages.append(message)
        elif important:
            self.important_messages.append(message)
        else:
            self.regular_messages.append(message)
        
        self._trim()
    
    def _trim(self):
        """Trim regular messages to fit budget."""
        
        # Calculate fixed tokens
        fixed_tokens = sum(
            self.count_tokens(m["content"]) + 10
            for m in self.system_messages + self.important_messages
        )
        
        available = self.max_tokens - fixed_tokens
        
        # Trim regular messages
        current = sum(
            self.count_tokens(m["content"]) + 10
            for m in self.regular_messages
        )
        
        while current > available and self.regular_messages:
            removed = self.regular_messages.popleft()
            current -= self.count_tokens(removed["content"]) + 10
    
    def get_messages(self) -> list[dict]:
        """Get messages: system first, then regular history,
        with important messages last (closest to the current turn)."""
        
        return (
            self.system_messages +
            list(self.regular_messages) +
            self.important_messages
        )
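
A quick usage sketch for the sliding-window buffer. The OpenAI client and model name are illustrative assumptions here, not part of the buffer classes above:

from openai import OpenAI

client = OpenAI()
buffer = SlidingWindowBuffer(max_tokens=4000)

buffer.add("user", "What is the capital of France?")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=buffer.get_messages()
)
reply = response.choices[0].message.content
buffer.add("assistant", reply)  # keep the window in sync with the model's reply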

Summary Memory

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

@dataclass
class ConversationSummary:
    """Summary of conversation history."""
    
    summary: str
    message_count: int
    last_updated: datetime
    key_points: list[str] = field(default_factory=list)

class SummaryMemory:
    """Memory that summarizes old messages."""
    
    def __init__(
        self,
        client,
        max_messages_before_summary: int = 10,
        summary_model: str = "gpt-4o-mini"
    ):
        self.client = client
        self.max_messages = max_messages_before_summary
        self.summary_model = summary_model
        
        self.current_summary: Optional[ConversationSummary] = None
        self.recent_messages: list[Message] = []
    
    def add(self, role: str, content: str):
        """Add message, summarize if needed."""
        
        self.recent_messages.append(Message(role=role, content=content))
        
        if len(self.recent_messages) >= self.max_messages:
            self._summarize()
    
    def _summarize(self):
        """Summarize recent messages."""
        
        # Build conversation text
        conversation = "\n".join([
            f"{m.role}: {m.content}"
            for m in self.recent_messages
        ])
        
        # Include existing summary
        context = ""
        if self.current_summary:
            context = f"Previous summary: {self.current_summary.summary}\n\n"
        
        prompt = f"""{context}Summarize this conversation concisely, preserving key information:

{conversation}

Provide:
1. A brief summary (2-3 sentences)
2. Key points as a list"""
        
        response = self.client.chat.completions.create(
            model=self.summary_model,
            messages=[{"role": "user", "content": prompt}]
        )
        
        summary_text = response.choices[0].message.content
        
        # Update the running summary; subtract the two messages carried
        # over from the previous cycle so they are not counted twice
        carried_over = 2 if self.current_summary else 0
        
        self.current_summary = ConversationSummary(
            summary=summary_text,
            message_count=(
                (self.current_summary.message_count if self.current_summary else 0) +
                len(self.recent_messages) - carried_over
            ),
            last_updated=datetime.now()
        )
        
        # Keep only the last two messages for conversational continuity
        self.recent_messages = self.recent_messages[-2:]
    
    def get_context(self) -> str:
        """Get memory context for prompt."""
        
        parts = []
        
        if self.current_summary:
            parts.append(f"Conversation summary:\n{self.current_summary.summary}")
        
        if self.recent_messages:
            recent = "\n".join([
                f"{m.role}: {m.content}"
                for m in self.recent_messages
            ])
            parts.append(f"Recent messages:\n{recent}")
        
        return "\n\n".join(parts)
    
    def get_messages(self) -> list[dict]:
        """Get messages for API call."""
        
        messages = []
        
        if self.current_summary:
            messages.append({
                "role": "system",
                "content": f"Conversation history summary: {self.current_summary.summary}"
            })
        
        for m in self.recent_messages:
            messages.append({"role": m.role, "content": m.content})
        
        return messages

# Progressive summarization
class ProgressiveSummary:
    """Summarize in layers for long conversations."""
    
    def __init__(self, client):
        self.client = client
        self.summaries: list[str] = []  # Oldest to newest
        self.current_chunk: list[Message] = []
        self.chunk_size = 10
    
    def add(self, role: str, content: str):
        """Add message."""
        
        self.current_chunk.append(Message(role=role, content=content))
        
        if len(self.current_chunk) >= self.chunk_size:
            self._summarize_chunk()
    
    def _summarize_chunk(self):
        """Summarize current chunk."""
        
        conversation = "\n".join([
            f"{m.role}: {m.content}"
            for m in self.current_chunk
        ])
        
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": f"Summarize this conversation in 2-3 sentences:\n\n{conversation}"
            }]
        )
        
        self.summaries.append(response.choices[0].message.content)
        self.current_chunk = []
        
        # Consolidate old summaries if too many
        if len(self.summaries) > 5:
            self._consolidate_summaries()
    
    def _consolidate_summaries(self):
        """Consolidate multiple summaries into one."""
        
        old_summaries = self.summaries[:-2]
        
        combined = "\n".join([
            f"- {s}" for s in old_summaries
        ])
        
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": f"Consolidate these summaries into one:\n\n{combined}"
            }]
        )
        
        self.summaries = [
            response.choices[0].message.content
        ] + self.summaries[-2:]
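
To see the summarizer fire, a minimal driver sketch (the loop content is placeholder text, and client is the same OpenAI client used above):

from openai import OpenAI

client = OpenAI()
memory = SummaryMemory(client, max_messages_before_summary=10)

for i in range(5):
    memory.add("user", f"Question {i} about the project timeline.")
    memory.add("assistant", f"Answer {i} with scheduling details.")

# The tenth message triggers _summarize(); only two raw messages remain
print(memory.get_context())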

Vector Store Memory

from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import numpy as np

@dataclass
class MemoryEntry:
    """A memory entry with embedding."""
    
    id: str
    content: str
    embedding: list[float]
    metadata: dict
    timestamp: datetime

class VectorMemory:
    """Long-term memory using vector similarity."""
    
    def __init__(
        self,
        embedding_client,
        embedding_model: str = "text-embedding-3-small"
    ):
        self.embedding_client = embedding_client
        self.embedding_model = embedding_model
        self.memories: list[MemoryEntry] = []
    
    def _embed(self, text: str) -> list[float]:
        """Get embedding for text."""
        
        response = self.embedding_client.embeddings.create(
            model=self.embedding_model,
            input=text
        )
        
        return response.data[0].embedding
    
    def add(
        self,
        content: str,
        metadata: Optional[dict] = None
    ) -> str:
        """Add memory entry."""
        
        import uuid
        
        entry = MemoryEntry(
            id=str(uuid.uuid4()),
            content=content,
            embedding=self._embed(content),
            metadata=metadata or {},
            timestamp=datetime.now()
        )
        
        self.memories.append(entry)
        return entry.id
    
    def search(
        self,
        query: str,
        top_k: int = 5,
        threshold: float = 0.7
    ) -> list[MemoryEntry]:
        """Search memories by similarity."""
        
        query_embedding = self._embed(query)
        
        # Calculate similarities
        similarities = []
        
        for memory in self.memories:
            sim = self._cosine_similarity(query_embedding, memory.embedding)
            if sim >= threshold:
                similarities.append((memory, sim))
        
        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        return [m for m, _ in similarities[:top_k]]
    
    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity."""
        
        a = np.array(a)
        b = np.array(b)
        
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    
    def get_relevant_context(
        self,
        query: str,
        max_tokens: int = 2000
    ) -> str:
        """Get relevant memories as context string."""
        
        memories = self.search(query)
        
        context_parts = []
        current_tokens = 0
        
        for memory in memories:
            # Rough token estimate
            tokens = len(memory.content.split()) * 1.5
            
            if current_tokens + tokens > max_tokens:
                break
            
            context_parts.append(memory.content)
            current_tokens += tokens
        
        return "\n\n".join(context_parts)

# Conversation-aware vector memory
class ConversationVectorMemory:
    """Vector memory that stores conversation turns."""
    
    def __init__(self, embedding_client):
        self.vector_memory = VectorMemory(embedding_client)
        self.conversation_id: Optional[str] = None
    
    def start_conversation(self, conversation_id: Optional[str] = None):
        """Start new conversation."""
        
        import uuid
        self.conversation_id = conversation_id or str(uuid.uuid4())
    
    def add_turn(self, user_message: str, assistant_response: str):
        """Add conversation turn to memory."""
        
        # Store as combined turn
        content = f"User: {user_message}\nAssistant: {assistant_response}"
        
        self.vector_memory.add(
            content=content,
            metadata={
                "conversation_id": self.conversation_id,
                "user_message": user_message,
                "assistant_response": assistant_response
            }
        )
    
    def recall(self, query: str, top_k: int = 3, threshold: float = 0.7) -> list[dict]:
        """Recall relevant past conversations.
        
        Tip: text-embedding-3 similarities often run below 0.7,
        so lower the threshold if recall comes back empty."""
        
        memories = self.vector_memory.search(query, top_k=top_k, threshold=threshold)
        
        return [
            {
                "user": m.metadata.get("user_message", ""),
                "assistant": m.metadata.get("assistant_response", ""),
                "conversation_id": m.metadata.get("conversation_id", "")
            }
            for m in memories
        ]
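
A usage sketch for cross-session recall (assumes the OpenAI client; the pet example and the lowered threshold are illustrative):

from openai import OpenAI

client = OpenAI()
memory = ConversationVectorMemory(client)

memory.start_conversation()
memory.add_turn(
    "My dog's name is Biscuit.",
    "Nice! I'll remember that Biscuit is your dog."
)

# Later, even in a different conversation, a related query surfaces the turn
memory.start_conversation()
for turn in memory.recall("What is my pet called?", threshold=0.4):
    print(turn["user"], "->", turn["assistant"])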

Hybrid Memory System

from dataclasses import dataclass
from typing import Optional

@dataclass
class MemoryConfig:
    """Configuration for hybrid memory."""
    
    buffer_max_messages: int = 10
    summary_threshold: int = 20
    vector_search_k: int = 5
    max_context_tokens: int = 4000

class HybridMemory:
    """Combines buffer, summary, and vector memory."""
    
    def __init__(
        self,
        client,
        embedding_client,
        config: Optional[MemoryConfig] = None
    ):
        self.client = client
        self.config = config or MemoryConfig()
        
        # Short-term: recent messages
        self.buffer = SlidingWindowBuffer(
            max_tokens=self.config.max_context_tokens // 2
        )
        
        # Medium-term: summaries
        self.summary = SummaryMemory(
            client,
            max_messages_before_summary=self.config.summary_threshold
        )
        
        # Long-term: vector store
        self.vector = VectorMemory(embedding_client)
        
        # Track all messages for vector storage
        self.message_count = 0
    
    def add(self, role: str, content: str):
        """Add message to all memory systems."""
        
        # Add to buffer
        self.buffer.add(role, content)
        
        # Add to summary system
        self.summary.add(role, content)
        
        # Add to vector store (every few messages)
        self.message_count += 1
        if self.message_count % 5 == 0:
            # Store recent context
            recent = self.buffer.get_messages()[-5:]
            context = "\n".join([
                f"{m['role']}: {m['content']}"
                for m in recent
            ])
            self.vector.add(context)
    
    def get_context(self, current_query: str) -> dict:
        """Get combined context from all memory systems."""
        
        # Recent messages from buffer
        recent_messages = self.buffer.get_messages()
        
        # Summary of older conversation
        summary_context = self.summary.get_context()
        
        # Relevant memories from vector store
        relevant_memories = self.vector.get_relevant_context(
            current_query,
            max_tokens=500
        )
        
        return {
            "recent_messages": recent_messages,
            "summary": summary_context,
            "relevant_memories": relevant_memories
        }
    
    def build_messages(
        self,
        system_prompt: str,
        current_query: str
    ) -> list[dict]:
        """Build messages list for API call."""
        
        context = self.get_context(current_query)
        
        messages = []
        
        # System prompt with memory context
        memory_context = ""
        
        if context["summary"]:
            memory_context += f"\n\nConversation history:\n{context['summary']}"
        
        if context["relevant_memories"]:
            memory_context += f"\n\nRelevant past context:\n{context['relevant_memories']}"
        
        messages.append({
            "role": "system",
            "content": system_prompt + memory_context
        })
        
        # Recent messages
        messages.extend(context["recent_messages"])
        
        return messages

# Entity memory
class EntityMemory:
    """Track entities mentioned in conversation."""
    
    def __init__(self, client):
        self.client = client
        self.entities: dict[str, dict] = {}
    
    def extract_entities(self, text: str) -> list[dict]:
        """Extract entities from text using LLM."""
        
        # response_format "json_object" requires the model to return a JSON
        # object, so ask for an "entities" key rather than a bare array
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": f"""Extract entities from this text. Return a JSON object:
{{"entities": [{{"name": "entity name", "type": "person/place/org/concept", "info": "relevant info"}}]}}

Text: {text}"""
            }],
            response_format={"type": "json_object"}
        )
        
        import json
        result = json.loads(response.choices[0].message.content)
        return result.get("entities", [])
    
    def update(self, text: str):
        """Update entity memory from text."""
        
        entities = self.extract_entities(text)
        
        for entity in entities:
            name = entity.get("name", "").lower()
            
            if name in self.entities:
                # Merge info
                existing = self.entities[name]
                existing["info"] = f"{existing['info']}; {entity.get('info', '')}"
            else:
                self.entities[name] = entity
    
    def get_entity_context(self) -> str:
        """Get entity context for prompt."""
        
        if not self.entities:
            return ""
        
        lines = ["Known entities:"]
        for name, info in self.entities.items():
            # Use .get() since extracted entities may lack some keys
            lines.append(f"- {name} ({info.get('type', 'unknown')}): {info.get('info', '')}")
        
        return "\n".join(lines)

Production Memory Service

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
import uuid

app = FastAPI()

# Initialize memory systems
from openai import OpenAI
client = OpenAI()

# Session-based memory storage (in-memory only; use a shared store such as
# Redis for persistence across restarts and workers)
sessions: dict[str, HybridMemory] = {}

def get_or_create_session(session_id: str) -> HybridMemory:
    """Get or create memory for session."""
    
    if session_id not in sessions:
        # One OpenAI client serves both chat and embedding calls
        sessions[session_id] = HybridMemory(client, client)
    
    return sessions[session_id]

class ChatRequest(BaseModel):
    session_id: Optional[str] = None
    message: str
    system_prompt: Optional[str] = "You are a helpful assistant."

class MemoryRequest(BaseModel):
    session_id: str
    content: str
    metadata: Optional[dict] = None

@app.post("/v1/chat")
async def chat_with_memory(request: ChatRequest):
    """Chat endpoint with memory."""
    
    # Get or create session
    session_id = request.session_id or str(uuid.uuid4())
    memory = get_or_create_session(session_id)
    
    # Build messages with memory context
    messages = memory.build_messages(
        request.system_prompt,
        request.message
    )
    
    # Add current message
    messages.append({"role": "user", "content": request.message})
    
    # Get response
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )
    
    assistant_message = response.choices[0].message.content
    
    # Update memory
    memory.add("user", request.message)
    memory.add("assistant", assistant_message)
    
    return {
        "session_id": session_id,
        "response": assistant_message,
        "memory_stats": {
            "buffer_size": len(memory.buffer.messages),
            "vector_count": len(memory.vector.memories)
        }
    }

@app.post("/v1/memory/add")
async def add_memory(request: MemoryRequest):
    """Add explicit memory entry."""
    
    memory = get_or_create_session(request.session_id)
    
    memory_id = memory.vector.add(
        content=request.content,
        metadata=request.metadata
    )
    
    return {"memory_id": memory_id}

@app.get("/v1/memory/search")
async def search_memory(session_id: str, query: str, top_k: int = 5):
    """Search session memory."""
    
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    
    memory = sessions[session_id]
    results = memory.vector.search(query, top_k=top_k)
    
    return {
        "results": [
            {"content": r.content, "metadata": r.metadata}
            for r in results
        ]
    }

@app.delete("/v1/session/{session_id}")
async def delete_session(session_id: str):
    """Delete session and its memory."""
    
    if session_id in sessions:
        del sessions[session_id]
    
    return {"deleted": True}

@app.get("/health")
async def health():
    return {
        "status": "healthy",
        "active_sessions": len(sessions)
    }
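
A client-side sketch for exercising the service (assumes the app runs locally on port 8000; the requests library is not part of the service itself):

import requests

BASE = "http://localhost:8000"  # assumption: local dev server

# First call creates a session; reuse the returned session_id afterwards
r = requests.post(f"{BASE}/v1/chat", json={"message": "Hi, I'm Dana."})
session_id = r.json()["session_id"]

r = requests.post(f"{BASE}/v1/chat", json={
    "session_id": session_id,
    "message": "What's my name?"
})
print(r.json()["response"])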

Conclusion

Effective memory systems transform stateless LLMs into contextually aware assistants. Use conversation buffers for immediate context; sliding windows with token limits prevent context overflow. Implement summary memory to compress older conversations while preserving key information. Vector store memory enables semantic retrieval of relevant past interactions across sessions. Hybrid systems combining all three approaches provide the best results: recent context from buffers, compressed history from summaries, and relevant long-term memories from vector search. For production, implement session management, memory persistence, and cleanup policies. The goal is giving the model enough context to be helpful without overwhelming it with irrelevant history.

