

Memory & State Tracking for AI Agents: Production-Ready Implementation Guide


A single agent processing 50,000 daily requests can burn an extra $47,000 per week by failing to manage context properly. The culprit? Storing full conversation history in every request instead of using memory banks and context caching. This guide provides production-ready implementations for agent memory management, state tracking, and context optimization that can reduce token costs by 60-80% while improving performance.

The shift from simple chatbots to autonomous agents has fundamentally changed how we approach memory. Agents maintain multi-turn conversations, execute long-running tasks, and coordinate across sessions, each requiring sophisticated state tracking.

Without proper memory management, costs compound quickly, because every request re-sends the full conversation history:

| Scenario | Daily Requests | Avg Tokens/Request | Daily Cost (GPT-4o) | Monthly Cost |
| --- | --- | --- | --- | --- |
| No optimization | 50,000 | 8,000 | $2,000 | $60,000 |
| With memory bank | 50,000 | 2,000 | $500 | $15,000 |
| With caching + bank | 50,000 | 800 | $200 | $6,000 |

Based on GPT-4o pricing of $5.00 per 1M input tokens and $15.00 per 1M output tokens (128K context window)
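
The daily figures follow directly from token arithmetic. A quick back-of-the-envelope sketch (assuming input-token pricing only, at the $5.00-per-1M rate from the footnote, and a 30-day month) reproduces the table:

# Reproduce the table's daily and monthly costs from token counts.
# Assumption: input-token pricing only ($5.00 per 1M tokens), 30-day month.
INPUT_PRICE_PER_M = 5.00

def daily_cost(requests_per_day: int, avg_tokens_per_request: int) -> float:
    total_tokens = requests_per_day * avg_tokens_per_request
    return total_tokens / 1_000_000 * INPUT_PRICE_PER_M

scenarios = [
    ("No optimization", 8_000),
    ("With memory bank", 2_000),
    ("With caching + bank", 800),
]
for label, tokens in scenarios:
    cost = daily_cost(50_000, tokens)
    print(f"{label}: ${cost:,.0f}/day, ${cost * 30:,.0f}/month")
# No optimization: $2,000/day, $60,000/month
# With memory bank: $500/day, $15,000/month
# With caching + bank: $200/day, $6,000/month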

Even with massive context windows, performance degrades as usage increases. Research from Google Cloud shows that while Gemini 1.5 Pro supports 2 million tokens with greater than 99% retrieval accuracy, models still experience "attention budget" depletion: every token added reduces focus on other tokens (Google Cloud Documentation).

Memory banks store long-term information outside the context window, retrieving only relevant data when needed. This pattern is essential for cross-session continuity and cost control.

Google Cloud's Memory Bank provides scope-based isolation with automatic expiration and revision tracking (Google Cloud Documentation).

import requests
import json
from typing import List, Dict, Optional

import google.auth
from google.auth.transport.requests import Request as AuthRequest


class AgentMemoryManager:
    """Manages long-term memory for AI agents using Vertex AI Memory Bank."""

    def __init__(self, project_id: str, location: str, agent_id: str):
        self.project_id = project_id
        self.location = location
        self.agent_id = agent_id
        self.base_url = f"https://{location}-aiplatform.googleapis.com/v1"
        self.session_id: Optional[str] = None
        self.user_id: Optional[str] = None
        # Vertex AI endpoints require an OAuth 2.0 bearer token; Application
        # Default Credentials (gcloud auth application-default login) supply one.
        self._credentials, _ = google.auth.default(
            scopes=["https://www.googleapis.com/auth/cloud-platform"]
        )

    def _headers(self) -> Dict[str, str]:
        """Build authorization headers, refreshing the access token when expired."""
        if not self._credentials.valid:
            self._credentials.refresh(AuthRequest())
        return {
            "Authorization": f"Bearer {self._credentials.token}",
            "Content-Type": "application/json",
        }

    def create_session(self, user_id: str) -> str:
        """Create a new session for a user."""
        url = (
            f"{self.base_url}/projects/{self.project_id}/locations/{self.location}"
            f"/reasoningEngines/{self.agent_id}/sessions"
        )
        payload = {"user_id": user_id}
        try:
            response = requests.post(url, json=payload, headers=self._headers())
            response.raise_for_status()
            self.session_id = response.json()["name"].split("/")[-1]
            self.user_id = user_id
            return self.session_id
        except requests.exceptions.RequestException as e:
            print(f"Error creating session: {e}")
            raise

    def append_event(self, event_type: str, content: str) -> bool:
        """Append an event to the current session."""
        if not self.session_id:
            raise ValueError("Session must be created first")
        url = (
            f"{self.base_url}/projects/{self.project_id}/locations/{self.location}"
            f"/reasoningEngines/{self.agent_id}/sessions/{self.session_id}:appendEvent"
        )
        payload = {"event": {"event_type": event_type, "content": content}}
        try:
            response = requests.post(url, json=payload, headers=self._headers())
            response.raise_for_status()
            return True
        except requests.exceptions.RequestException as e:
            print(f"Error appending event: {e}")
            return False

    def generate_memories(self, topics: Optional[List[str]] = None) -> Dict:
        """Generate memories from conversation history."""
        if not self.session_id:
            raise ValueError("Session must be created first")
        url = (
            f"{self.base_url}/projects/{self.project_id}/locations/{self.location}"
            f"/reasoningEngines/{self.agent_id}/memories:generate"
        )
        # Memories are scoped to the agent/user pair (not a single session),
        # so they persist across sessions for the same user.
        payload = {
            "scope": {
                "agent_name": self.agent_id,
                "user": self.user_id,
            }
        }
        if topics:
            payload["memory_topics"] = topics
        try:
            response = requests.post(url, json=payload, headers=self._headers())
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error generating memories: {e}")
            raise

    def retrieve_memories(self, query: str, top_k: int = 5) -> List[Dict]:
        """Retrieve relevant memories using similarity search."""
        if not self.session_id:
            raise ValueError("Session must be created first")
        url = (
            f"{self.base_url}/projects/{self.project_id}/locations/{self.location}"
            f"/reasoningEngines/{self.agent_id}/memories:retrieve"
        )
        payload = {
            "scope": {
                "agent_name": self.agent_id,
                "user": self.user_id,
            },
            "query": query,
            "top_k": top_k,
        }
        try:
            response = requests.post(url, json=payload, headers=self._headers())
            response.raise_for_status()
            return response.json().get("memories", [])
        except requests.exceptions.RequestException as e:
            print(f"Error retrieving memories: {e}")
            return []


# Example usage
if __name__ == "__main__":
    # Initialize memory manager
    memory_manager = AgentMemoryManager(
        project_id="your-project-id",
        location="us-central1",
        agent_id="my-agent",
    )

    # Create session for user
    session_id = memory_manager.create_session("user-123")
    print(f"Created session: {session_id}")

    # Add conversation events
    memory_manager.append_event("user_message", "I prefer Python for data analysis")
    memory_manager.append_event("agent_response", "I'll remember you prefer Python")

    # Generate memories from conversation
    memories = memory_manager.generate_memories(
        topics=["programming_languages", "user_preferences"]
    )
    print(f"Generated memories: {json.dumps(memories, indent=2)}")

    # Retrieve relevant memories
    relevant = memory_manager.retrieve_memories(
        "What programming language does the user prefer?", top_k=3
    )
    print(f"Retrieved memories: {json.dumps(relevant, indent=2)}")

Key Features (scope isolation and TTL are illustrated in the sketch after this list):

  • Scope-based isolation: Memories are accessible only to specific users and agents
  • Automatic expiration: TTL (Time-To-Live) prevents stale data accumulation
  • Revision tracking: All memory updates are versioned for audit trails
  • Similarity search: Retrieve relevant memories using vector embeddings
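
To make the scope and TTL semantics concrete, here is a minimal, vendor-neutral sketch. It is an illustration of the concepts only, not the Memory Bank API; the SimpleMemoryStore class and its (agent_id, user_id) keying are assumptions made for demonstration.

import time
from dataclasses import dataclass, field

@dataclass
class SimpleMemoryStore:
    """Illustrative in-memory store with scope isolation and TTL (not the Vertex API)."""
    ttl_seconds: float = 3600.0
    # Maps (agent_id, user_id) -> list of (timestamp, memory_text)
    _store: dict = field(default_factory=dict)

    def add(self, agent_id: str, user_id: str, text: str) -> None:
        # Memories are keyed by (agent, user), so one user's data never leaks to another.
        self._store.setdefault((agent_id, user_id), []).append((time.time(), text))

    def get(self, agent_id: str, user_id: str) -> list:
        now = time.time()
        entries = self._store.get((agent_id, user_id), [])
        # TTL: drop entries older than ttl_seconds, mimicking automatic expiration.
        fresh = [(ts, text) for ts, text in entries if now - ts < self.ttl_seconds]
        self._store[(agent_id, user_id)] = fresh
        return [text for _, text in fresh]

store = SimpleMemoryStore(ttl_seconds=86400)  # expire after one day
store.add("my-agent", "user-123", "Prefers Python for data analysis")
print(store.get("my-agent", "user-123"))  # ['Prefers Python for data analysis']
print(store.get("my-agent", "user-456"))  # [] -- different scope, nothing leaks

Vertex AI's Memory Bank applies the same ideas as a managed service, adding similarity search and revision tracking on top.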

Context caching stores common prefixes (like system prompts) once and reuses them across requests, dramatically reducing costs for repetitive operations.

vLLM's PagedAttention and prefix caching can improve throughput by 2-4x while reducing memory waste from 60-80% to near-zero (Princeton COS597).

from vllm import LLM, SamplingParams
import time
from typing import List, Dict


class EfficientAgentMemory:
    """Optimized memory management for AI agents using vLLM's PagedAttention."""

    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf",
                 gpu_memory_utilization: float = 0.95,
                 max_model_len: int = 4096):
        """
        Initialize vLLM engine with memory-efficient settings.

        Args:
            model_name: HuggingFace model identifier
            gpu_memory_utilization: Fraction of GPU memory to use (0.0-1.0)
            max_model_len: Maximum sequence length to cache
        """
        self.llm = LLM(
            model=model_name,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            enable_prefix_caching=True,  # Cache common prompt prefixes in the KV cache
            # quantization="awq",  # Enable only for AWQ-quantized checkpoints
        )
        self.conversation_cache = {}

    def process_multi_turn_conversation(self, conversation_history: List[Dict[str, str]]) -> str:
        """
        Process multi-turn conversation with context caching.

        Args:
            conversation_history: List of {'role': 'user'|'assistant', 'content': str}

        Returns:
            Generated response
        """
        # Format conversation with proper tokens
        formatted_prompt = self._format_conversation(conversation_history)

        # Use sampling params optimized for agent responses
        sampling_params = SamplingParams(
            temperature=0.7,
            top_p=0.95,
            max_tokens=512,
            repetition_penalty=1.1,
        )

        try:
            start_time = time.time()
            outputs = self.llm.generate([formatted_prompt], sampling_params)
            generation_time = time.time() - start_time

            # Log performance metrics
            generated_text = outputs[0].outputs[0].text
            prompt_tokens = len(outputs[0].prompt_token_ids)
            generated_tokens = len(outputs[0].outputs[0].token_ids)
            print(f"Generation completed in {generation_time:.2f}s")
            print(f"Prompt tokens: {prompt_tokens}, Generated tokens: {generated_tokens}")
            print(f"Throughput: {(prompt_tokens + generated_tokens) / generation_time:.2f} tokens/sec")
            return generated_text
        except Exception as e:
            print(f"Error during generation: {e}")
            raise

    def _format_conversation(self, conversation_history: List[Dict[str, str]]) -> str:
        """Format conversation history into a model prompt.

        Note: these chat markers are generic placeholders; in production, use the
        model's own chat template (e.g. tokenizer.apply_chat_template).
        """
        formatted = []
        for turn in conversation_history:
            role = turn['role']
            content = turn['content']
            if role == 'user':
                formatted.append(f"<|user|>\n{content}<|end|>")
            elif role == 'assistant':
                formatted.append(f"<|assistant|>\n{content}<|end|>")
        # Append the assistant cue so the model generates the next reply
        formatted.append("<|assistant|>\n")
        return "\n".join(formatted)

    def batch_process_agents(self, agent_prompts: List[str]) -> List[str]:
        """
        Process multiple agent prompts simultaneously using continuous batching.

        Args:
            agent_prompts: List of formatted prompts for different agents

        Returns:
            List of generated responses
        """
        sampling_params = SamplingParams(
            temperature=0.7,
            max_tokens=256,
        )

        try:
            start_time = time.time()
            outputs = self.llm.generate(agent_prompts, sampling_params)
            batch_time = time.time() - start_time

            total_tokens = sum(
                len(out.prompt_token_ids) + len(out.outputs[0].token_ids)
                for out in outputs
            )
            print(f"Batch processed {len(agent_prompts)} requests in {batch_time:.2f}s")
            print(f"Total throughput: {total_tokens / batch_time:.2f} tokens/sec")
            return [out.outputs[0].text for out in outputs]
        except Exception as e:
            print(f"Error in batch processing: {e}")
            raise


# Example usage
if __name__ == "__main__":
    # Initialize memory-efficient agent
    agent = EfficientAgentMemory(
        model_name="mistralai/Mistral-7B-Instruct-v0.1",
        gpu_memory_utilization=0.90,
        max_model_len=8192,
    )

    # Single conversation example
    conversation = [
        {"role": "user", "content": "What's the best way to manage agent memory?"},
        {"role": "assistant", "content": "Use context caching and memory banks for long-term storage."},
        {"role": "user", "content": "Can you elaborate on that?"},
    ]
    response = agent.process_multi_turn_conversation(conversation)
    print(f"\nAgent response:\n{response}")

    # Batch processing example
    batch_prompts = [
        "<|user|>\nExplain PagedAttention<|end|><|assistant|>\n",
        "<|user|>\nWhat is context caching?<|end|><|assistant|>\n",
        "<|user|>\nHow does vLLM improve throughput?<|end|><|assistant|>\n",
    ]
    batch_results = agent.batch_process_agents(batch_prompts)
    for i, result in enumerate(batch_results):
        print(f"\nBatch result {i+1}: {result}")

Performance Benefits:

  • 2-4x throughput improvement over traditional serving systems
  • Near-zero memory waste through fixed-size block allocation
  • Continuous batching handles multiple requests efficiently
  • Prefix caching eliminates redundant computation for common instructions (see the sketch after this list)
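
Prefix caching only pays off when prompts actually share an identical leading segment. The sketch below (reusing the agent instance and placeholder chat markers from the example above, with enable_prefix_caching=True) keeps a long system prefix byte-for-byte identical across requests so its cached KV blocks can be reused, varying only the short per-task suffix:

# Share one long, byte-identical system prefix across many agent requests so the
# prefix cache can reuse its KV blocks; only the short task-specific suffix differs.
SYSTEM_PREFIX = (
    "<|system|>\nYou are a data-engineering agent. Follow the team's style guide, "
    "prefer pandas for tabular work, and always explain trade-offs.<|end|>\n"
)

tasks = [
    "Profile a 10M-row CSV for null values",
    "Suggest a partitioning scheme for daily event logs",
    "Draft a dbt model for the orders table",
]

# Identical prefix + varying suffix: the first request populates the cache,
# later ones can skip recomputing the prefix's attention keys/values.
prompts = [f"{SYSTEM_PREFIX}<|user|>\n{task}<|end|>\n<|assistant|>\n" for task in tasks]

responses = agent.batch_process_agents(prompts)  # 'agent' from the example above
for task, response in zip(tasks, responses):
    print(f"{task} -> {response[:80]}...")

Caching happens at KV-block granularity, so the savings grow with the length of the shared prefix; long, stable system instructions benefit the most.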

Production agents require real-time state tracking to prevent context overflow and detect memory pressure issues.

interface AgentState {
  sessionId: string;
  userId: string;
  contextWindow: number;
  memoryUsage: MemoryUsage;
  conversationHistory: Message[];
  lastMemoryUpdate: Date;
}

interface MemoryUsage {
  tokensUsed: number;
  tokensAvailable: number;
  utilizationPercent: number;
}

interface Message {
  role: 'user' | 'assistant' | 'system';
  content: string;
  timestamp: Date;
  tokenCount: number;
}

class AgentStateDebugger {
  private state: AgentState;
  private maxContextWindow: number;
  private memoryThreshold: number;

  constructor(userId: string, maxContextWindow: number = 128000) {
    this.maxContextWindow = maxContextWindow;
    this.memoryThreshold = 0.85; // Alert at 85% usage
    this.state = {
      sessionId: this.generateSessionId(),
      userId,
      contextWindow: maxContextWindow,
      memoryUsage: {
        tokensUsed: 0,
        tokensAvailable: maxContextWindow,
        utilizationPercent: 0
      },
      conversationHistory: [],
      lastMemoryUpdate: new Date()
    };
  }

  private generateSessionId(): string {
    return `sess_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  // Estimate token count (approximate for debugging).
  // In production, use the model's tokenizer for exact counts.
  private estimateTokenCount(text: string): number {
    // Rough estimate: 1 token ≈ 4 characters
    return Math.ceil(text.length / 4);
  }

  addMessage(role: 'user' | 'assistant' | 'system', content: string): void {
    const tokenCount = this.estimateTokenCount(content);
    const message: Message = {
      role,
      content,
      timestamp: new Date(),
      tokenCount
    };
    this.state.conversationHistory.push(message);
    this.state.memoryUsage.tokensUsed += tokenCount;
    this.state.memoryUsage.tokensAvailable = this.maxContextWindow - this.state.memoryUsage.tokensUsed;
    this.state.memoryUsage.utilizationPercent = (this.state.memoryUsage.tokensUsed / this.maxContextWindow) * 100;
    this.state.lastMemoryUpdate = new Date();

    // Check for memory pressure
    if (this.state.memoryUsage.utilizationPercent >= this.memoryThreshold * 100) {
      this.handleMemoryPressure();
    }
  }

  private handleMemoryPressure(): void {
    const alert = {
      level: 'WARNING',
      message: 'Context window approaching capacity',
      sessionId: this.state.sessionId,
      utilization: this.state.memoryUsage.utilizationPercent.toFixed(2) + '%',
      tokensUsed: this.state.memoryUsage.tokensUsed,
      recommendedAction: 'Consider summarizing history or using memory bank'
    };
    console.warn(JSON.stringify(alert, null, 2));
    this.logState();
  }

  logState(): void {
    console.log(`\n=== Agent State Debug ===`);
    console.log(`Session: ${this.state.sessionId}`);
    console.log(`User: ${this.state.userId}`);
    console.log(`Messages: ${this.state.conversationHistory.length}`);
    console.log(`Memory: ${this.state.memoryUsage.tokensUsed}/${this.maxContextWindow} tokens`);
    console.log(`Utilization: ${this.state.memoryUsage.utilizationPercent.toFixed(2)}%`);
    console.log(`Last Update: ${this.state.lastMemoryUpdate.toISOString()}`);
    console.log(`========================\n`);
  }

  getSummary(): AgentState {
    return { ...this.state };
  }

  // Simulate memory optimization strategy
  optimizeContext(): void {
    if (this.state.memoryUsage.utilizationPercent > 70) {
      console.log('Triggering context optimization...');
      // Strategy 1: Summarize older messages
      if (this.state.conversationHistory.length > 10) {
        const recentMessages = this.state.conversationHistory.slice(-5);
        const olderMessages = this.state.conversationHistory.slice(0, -5);
        const summaryTokenCount = olderMessages.reduce((sum, msg) => sum + msg.tokenCount, 0);
        console.log(`Summarized ${olderMessages.length} older messages (${summaryTokenCount} tokens)`);
        console.log(`Keeping ${recentMessages.length} recent messages`);
        // In production, you would send these to a memory bank
        this.saveToMemoryBank(olderMessages);
      }
    }
  }

  private saveToMemoryBank(messages: Message[]): void {
    // Placeholder for memory bank integration
    const memoryEntry = {
      type: 'conversation_summary',
      sessionId: this.state.sessionId,
      timestamp: new Date(),
      messageCount: messages.length,
      totalTokens: messages.reduce((sum, msg) => sum + msg.tokenCount, 0)
    };
    console.log('Saving to memory bank:', JSON.stringify(memoryEntry, null, 2));
  }

  resetSession(): void {
    this.state.conversationHistory = [];
    this.state.memoryUsage.tokensUsed = 0;
    this.state.memoryUsage.tokensAvailable = this.maxContextWindow;
    this.state.memoryUsage.utilizationPercent = 0;
    this.state.lastMemoryUpdate = new Date();
    console.log(`Session ${this.state.sessionId} reset`);
  }
}

// Example usage (note: "debugger" is a reserved word, so the instance is named stateDebugger)
const stateDebugger = new AgentStateDebugger('user-12345', 200000);

// Simulate conversation
const sampleConversation: { role: 'user' | 'assistant'; content: string }[] = [
  { role: 'user', content: 'I need help with Python data analysis' },
  { role: 'assistant', content: 'I can help you with pandas, numpy, and matplotlib. What specific task are you working on?' },
  { role: 'user', content: 'I have a CSV file with 10 million rows and need to clean it efficiently' },
  { role: 'assistant', content: 'For 10 million rows, use pandas with chunking or Dask for parallel processing. Let me show you the optimal approach...' },
  { role: 'user', content: 'Can you also explain memory optimization techniques?' }
];
sampleConversation.forEach(msg => {
  stateDebugger.addMessage(msg.role, msg.content);
});

// Show current state
stateDebugger.logState();

// Check if optimization is needed
stateDebugger.optimizeContext();

export { AgentStateDebugger };
export type { AgentState, Message };