from typing import Dict, Any


class RAGTradeoffAnalyzer:
    """Analyzes the latency tradeoff between RAG (Retrieval-Augmented Generation)
    and in-context learning for different context lengths."""

    def __init__(self,
                 retrieval_time_ms: float = 150.0,
                 token_generation_rate: float = 80.0,
                 prefill_rate_per_token: float = 0.05):
        self.retrieval_time_ms = retrieval_time_ms            # one-off retrieval cost (ms)
        self.token_generation_rate = token_generation_rate    # decode speed (tokens/sec)
        self.prefill_rate_per_token = prefill_rate_per_token  # prompt-processing cost (ms/token)

    def calculate_rag_latency(self,
                              query_tokens: int,
                              context_tokens: int,
                              output_tokens: int) -> Dict[str, float]:
        # RAG retrieves only the most relevant chunks, capped here at 2000 tokens.
        rag_context_tokens = min(context_tokens, 2000)
        # Prefill scales with prompt length; generation scales with output length.
        prefill_time = (query_tokens + rag_context_tokens) * self.prefill_rate_per_token
        generation_time = (output_tokens / self.token_generation_rate) * 1000
        total_time = self.retrieval_time_ms + prefill_time + generation_time
        return {
            "approach": "RAG",
            "retrieval_ms": self.retrieval_time_ms,
            "prefill_ms": prefill_time,
            "generation_ms": generation_time,
            "total_ms": total_time,
            "context_tokens": rag_context_tokens,
        }

    def calculate_in_context_latency(self,
                                     query_tokens: int,
                                     full_context_tokens: int,
                                     output_tokens: int) -> Dict[str, float]:
        # In-context learning skips retrieval but must prefill the full context.
        prefill_time = (query_tokens + full_context_tokens) * self.prefill_rate_per_token
        generation_time = (output_tokens / self.token_generation_rate) * 1000
        total_time = prefill_time + generation_time
        return {
            "approach": "In-context",
            "prefill_ms": prefill_time,
            "generation_ms": generation_time,
            "total_ms": total_time,
            "context_tokens": full_context_tokens,
        }

    def compare_approaches(self,
                           query_tokens: int = 50,
                           full_context_tokens: int = 8000,
                           output_tokens: int = 200) -> Dict[str, Any]:
        rag = self.calculate_rag_latency(query_tokens, full_context_tokens, output_tokens)
        ic = self.calculate_in_context_latency(query_tokens, full_context_tokens, output_tokens)
        faster_approach = "RAG" if rag["total_ms"] < ic["total_ms"] else "In-context"
        latency_diff = abs(rag["total_ms"] - ic["total_ms"])
        percent_diff = (latency_diff / min(rag["total_ms"], ic["total_ms"])) * 100
        return {
            "comparison": {"RAG": rag, "In-context": ic},
            "faster_approach": faster_approach,
            "latency_difference_ms": round(latency_diff, 2),
            "percent_difference": round(percent_diff, 1),
            "recommendation": self._generate_recommendation(rag, ic, full_context_tokens),
        }

    def _generate_recommendation(self, rag: Dict, ic: Dict, full_context: int) -> str:
        # Below the 2000-token retrieval cap both approaches prefill the same
        # context, so paying the retrieval latency buys nothing.
        if full_context <= 2000:
            return "Use in-context learning - context is short enough for efficient processing"
        elif rag["total_ms"] < ic["total_ms"]:
            return "Use RAG - retrieval overhead is less than long context processing"
        else:
            return "Use in-context learning - context length is manageable"

if __name__ == "__main__":
    analyzer = RAGTradeoffAnalyzer()
    print("=== RAG vs In-Context Learning Tradeoff Analysis ===")
    print("\nScenario: Query with 50 tokens, generating 200 tokens")
    print("RAG retrieval time: 150ms, Generation rate: 80 tokens/sec\n")
    test_contexts = [1000, 2000, 4000, 8000, 16000, 32000]
    print(f"{'Context':<12} {'RAG (ms)':<12} {'In-Context (ms)':<18} {'Faster':<12}")
    for context in test_contexts:
        result = analyzer.compare_approaches(full_context_tokens=context)
        rag_ms = result["comparison"]["RAG"]["total_ms"]
        ic_ms = result["comparison"]["In-context"]["total_ms"]
        print(f"{context:<12} {rag_ms:<12.1f} {ic_ms:<18.1f} {result['faster_approach']:<12}")