my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""
#3	BEAM End-to-End Evaluation Pipeline
#4	===================================
#5	Evaluates Mnemosyne as a memory backend for LLMs using the official
#6	BEAM benchmark protocol:
#7	1. Download BEAM dataset from HuggingFace
#8	2. Ingest conversations into Mnemosyne
#9	3. For each probing question: retrieve memories -> LLM answers -> LLM-as-judge scores
#10	4. Report per-scale, per-ability scores comparable to published SOTA
#11
#12	Published SOTA (BEAM 10M):
#13	Hindsight: 64.1% Honcho: 40.6% LIGHT: 26.6% RAG: 24.9%
#14
#15	LLM: Nvidia API (deepseek-ai/deepseek-v4-pro) via OpenAI-compatible endpoint.
#16	Fast, cheap (~$2/M tokens), no local GPU needed.
#17
#18	Usage:
#19	cd /root/.hermes/projects/mnemosyne
#20	.venv/bin/python tools/evaluate_beam_end_to_end.py --sample 5 --scales 100K,500K,1M,10M
#21
#22	--sample N: conversations per scale (default 3, use 0 for all)
#23	--scales: comma-separated (default 100K,500K,1M,10M)
#24	--mode: retrieval|end_to_end (default end_to_end)
#25	--judge-model: LLM model for judging (default same as answer model)
#26	--resume: skip already-evaluated questions from results file
#27	"""
#28
#29	import argparse
#30	import ast
#31	import gc
#32	import json
#33	import math
#34	import os
#35	import sys
#36	import tempfile
#37	import time
#38	from collections import defaultdict
#39	from datetime import datetime, timezone
#40	from functools import partial
#41	from pathlib import Path
#42
#43	# Unbuffered output for real-time progress
#44	print = partial(print, flush=True)
#45
#46	# --- Setup ---
#47	PROJECT_ROOT = Path(__file__).resolve().parent.parent
#48	sys.path.insert(0, str(PROJECT_ROOT))
#49
#50	import urllib.request
#51	import urllib.error
#52	import numpy as np
#53
#54	from mnemosyne.core.beam import BeamMemory, init_beam, _embeddings, _vec_available, _vec_insert, _fts_search_working, _generate_id
#55
#56	# --- Config ---
#57	OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
#58	if not OPENROUTER_API_KEY:
#59	# Try to load from file — check opencode first, then openrouter
#60	for _kf in ["/tmp/opencode_key.txt", "/tmp/openrouter_key.txt"]:
#61	_key_file = Path(_kf)
#62	if _key_file.exists():
#63	with open(_key_file) as f:
#64	_content = f.read().strip()
#65	if "export" in _content:
#66	OPENROUTER_API_KEY = _content.split("=", 1)[1].strip().strip('"').strip("'")
#67	else:
#68	OPENROUTER_API_KEY = _content
#69	break
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "")
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
DEFAULT_MODEL = "deepseek-v4-pro"
FALLBACK_MODELS = []  # Disabled — fallback cascade burned $30 in credits
# NOTE: DEFAULT_TOP_K and MAX_MEMORY_CONTEXT_CHARS are assigned a second time
# further down this file (to 30 and 16000, next to ANSWER_SYSTEM_PROMPT).
# Code defined after that point sees the later values, not the ones below.
DEFAULT_TOP_K = 10  # Memories to retrieve per question
MAX_MEMORY_CONTEXT_CHARS = 8000  # Max chars of retrieved context to send to LLM
BENCHMARK_QUERIES_PER_CONV = 50  # Max probing questions per conversation
RESULTS_FILE = PROJECT_ROOT / "results" / "beam_e2e_results.json"
#78
# Memory abilities tested by BEAM (10 dimensions)
BEAM_ABILITIES = [
    "IE",   # Information Extraction
    "MR",   # Multi-session Reasoning (dataset key: multi_session_reasoning)
    "KU",   # Knowledge Update
    "TR",   # Temporal Reasoning
    "ABS",  # Abstention
    "CR",   # Contradiction Resolution
    "EO",   # Event Ordering
    "IF",   # Instruction Following
    "PF",   # Preference Following
    "SUM",  # Summarization
]

# Map dataset ability names to our abbreviations.
# Every value here is one of the codes listed in BEAM_ABILITIES.
ABILITY_MAP = {
    "information_extraction": "IE",
    "multi_session_reasoning": "MR",
    "knowledge_update": "KU",
    "temporal_reasoning": "TR",
    "abstention": "ABS",
    "contradiction_resolution": "CR",
    "event_ordering": "EO",
    "instruction_following": "IF",
    "preference_following": "PF",
    "summarization": "SUM",
    # Aliases
    "multi_session": "MR",
    "knowledge": "KU",
    "temporal": "TR",
    "information": "IE",
}
#111
#112
#113	# ============================================================
#114	# LLM Client
#115	# ============================================================
#116
class LLMClient:
    """OpenAI-compatible chat-completions client (OpenRouter by default).

    Failures never propagate out of :meth:`chat`; they are reported as a
    string starting with ``[LLM_ERROR``.
    """

    # Class-level rate-limit cooldown marker. Nothing in this class reads or
    # writes it; kept because code outside this class may reference it.
    _last_429_time = 0

    def __init__(self, model: str = DEFAULT_MODEL, api_key: str = None, base_url: str = None):
        """Store connection settings.

        Args:
            model: Model identifier sent in every request.
            api_key: Bearer token; falls back to ``OPENROUTER_API_KEY``.
            base_url: API root; falls back to ``OPENROUTER_BASE_URL``. A
                trailing slash is removed.
        """
        self.model = model
        self.api_key = api_key or OPENROUTER_API_KEY
        self.base_url = (base_url or OPENROUTER_BASE_URL).rstrip("/")
        self.fallback_models = FALLBACK_MODELS.copy()
        self.call_count = 0  # Number of requests that returned a parsed body

    @staticmethod
    def _is_rate_limited(exc: Exception) -> bool:
        """Return True when *exc* looks like an HTTP 429 / rate-limit failure."""
        import re as _re

        # requests.HTTPError carries the response; prefer the real status code.
        status = getattr(getattr(exc, "response", None), "status_code", None)
        if status == 429:
            return True
        text = str(exc).lower()
        # Word-start match so that e.g. "generate" does not count as "rate".
        return "429" in text or _re.search(r"\brate", text) is not None

    def chat(self, messages: list, temperature: float = 0.1, max_tokens: int = 1024) -> str:
        """Send a chat completion request, retrying on rate limits.

        Up to three attempts are made. Only rate-limit errors are retried,
        with a 15 s then 30 s pause between attempts; any other error stops
        immediately. No fallback models are used.

        Returns:
            The model's reply, or ``"[LLM_ERROR: ...]"`` describing the last
            failure.
        """
        max_attempts = 3
        last_error = None
        for attempt in range(max_attempts):
            try:
                return self._call_api(self.model, messages, temperature, max_tokens)
            except Exception as e:  # boundary: every failure becomes an error string
                last_error = str(e)
                if not self._is_rate_limited(e):
                    break  # Non-retryable error
                if attempt < max_attempts - 1:
                    # No pause after the final attempt: nothing is left to retry.
                    time.sleep(15 * (attempt + 1))

        return f"[LLM_ERROR: all models failed. Last: {last_error}]"

    def _call_api(self, model: str, messages: list, temperature: float, max_tokens: int) -> str:
        """Single API call via requests (urllib blocked by Cloudflare on some providers).

        Raises:
            requests.HTTPError: on a non-2xx response.
            KeyError / IndexError: when the body lacks ``choices[0].message.content``.
        """
        import requests as _requests

        url = f"{self.base_url}/chat/completions"
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://mnemosyne.site",
            "X-Title": "Mnemosyne Benchmark",
        }
        resp = _requests.post(url, json=payload, headers=headers, timeout=60)
        resp.raise_for_status()
        data = resp.json()
        self.call_count += 1
        content = data["choices"][0]["message"]["content"]
        # A JSON null here would break the declared ``str`` return type.
        return content if content is not None else ""

    def close(self):
        """No-op; no persistent connection is held."""
        pass
#172
#173
#174	# ============================================================
#175	# Data Loading (adapted from benchmark_beam_sota.py)
#176	# ============================================================
#177
def _parse_probing_questions(raw) -> list[dict]:
    """Flatten a sample's ``probing_questions`` value into a list of question dicts."""
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except Exception:
            # Unparseable question blobs are treated as "no questions".
            raw = {}
    if not isinstance(raw, dict):
        # Anything but a mapping of ability -> questions cannot be flattened.
        return []

    flat = []
    for ability, questions in raw.items():
        if not isinstance(questions, list):
            continue
        for q in questions:
            if not isinstance(q, dict):
                continue
            flat.append({
                "ability": ability,
                "question": q.get("question", ""),
                # The reference answer is stored under one of several keys.
                "ideal_answer": q.get("ideal_answer", q.get("ideal_response", q.get("answer", q.get("ideal_summary", "")))),
                "rubric": q.get("rubric", []),
            })
    return flat


def _append_chat_messages(messages: list, chat_blocks) -> None:
    """Append the messages found in *chat_blocks* to *messages* in order.

    A block is either a list of message dicts or a single message dict (flat
    layout). Each appended entry's ``index`` is its position in *messages*.
    """
    for block in chat_blocks or []:
        if isinstance(block, list):
            candidates = block
        elif isinstance(block, dict):
            # Flat format: chat is a list of dicts directly
            candidates = [block]
        else:
            continue
        for msg in candidates:
            if isinstance(msg, dict):
                messages.append({
                    "role": msg.get("role", "unknown"),
                    "content": msg.get("content", ""),
                    "index": len(messages),
                })


def load_beam_dataset(scales: list[str], max_conversations: int = None) -> dict:
    """Load BEAM dataset from HuggingFace. Returns dict[scale] -> list[conversation].

    Args:
        scales: Split names to load. ``"10M"`` comes from ``Mohammadta/BEAM-10M``;
            every other name is looked up as a split of ``Mohammadta/BEAM``.
        max_conversations: Per-scale cap; ``None`` or ``0`` loads everything.

    Returns:
        Mapping of scale to a list of conversations, each a dict with keys
        ``id``, ``messages``, ``questions`` and ``scale``. A scale that fails
        to load is reported on stdout and left out of the mapping.

    Exits the process with status 1 when the ``datasets`` package is missing.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("ERROR: 'datasets' package not installed. Run: pip install datasets")
        sys.exit(1)

    data = {}
    total_loaded = 0

    for scale in scales:
        print(f" Loading BEAM {scale}...")
        try:
            is_10m = scale == "10M"
            if is_10m:
                ds = load_dataset("Mohammadta/BEAM-10M", streaming=True)
                split_name = "10M" if "10M" in ds else list(ds.keys())[0]
            else:
                # 100K, 500K, 1M scales from the main dataset
                ds = load_dataset("Mohammadta/BEAM", streaming=True)
                if scale not in ds:
                    print(f" WARNING: split '{scale}' not found. Available: {list(ds.keys())}")
                    continue
                split_name = scale

            conversations = []
            for i, sample in enumerate(ds[split_name]):
                if max_conversations and i >= max_conversations:
                    break

                messages = []
                if is_10m:
                    # 10M samples nest their chat blocks inside "plans".
                    for plan in sample.get("plans") or []:
                        if isinstance(plan, dict):
                            _append_chat_messages(messages, plan.get("chat"))
                else:
                    _append_chat_messages(messages, sample.get("chat"))

                conversations.append({
                    "id": sample.get("conversation_id", str(i)),
                    "messages": messages,
                    "questions": _parse_probing_questions(sample.get("probing_questions", {})),
                    "scale": scale,
                })
                total_loaded += 1

            data[scale] = conversations
            if hasattr(ds, 'cleanup_cache_files'):
                ds.cleanup_cache_files()
            del ds
            gc.collect()
            print(f" Loaded {len(conversations)} conversations")

        except Exception as e:
            # One broken scale must not prevent the remaining scales from loading.
            print(f" ERROR loading {scale}: {e}")
            import traceback
            traceback.print_exc()

    print(f" Total: {total_loaded} conversations across {len(data)} scales")
    return data
#324
#325
#326	# ============================================================
#327	# Mnemosyne Ingestion
#328	# ============================================================
#329
#330	def _extract_facts(content: str, source: str = "unknown") -> list[dict]:
#331	"""Extract structured facts from a message for precision retrieval.
#332	These fact entries complement raw message storage by isolating
#333	specific data points (numbers, dates, versions, negations) that
#334	FTS5 keyword search can match more precisely than in long messages."""
#335	import re
#336	facts = []
#337
#338	# Pattern 1: Version numbers ("Flask 2.3.1", "v0.6.2", "Python 3.11")
#339	ver_matches = re.findall(r'([A-Z][a-zA-Z]+(?:\s[A-Z][a-zA-Z]+))\s+v?(\d+\.\d+(?:\.\d+)?)', content)
#340	for name, ver in ver_matches[:3]:
#341	facts.append({
#342	"content": f"FACT version: {name.strip()} {ver}",
#343	"importance": 0.7,
#344	})
#345
#346	# Pattern 2: Numbers with units ("250ms", "3 columns", "50 tasks", "5000 port")
#347	num_matches = re.findall(r'(\d+(?:[.,]\d+)?)\s*(ms\|sec\|seconds?\|minutes?\|hours?\|days?\|weeks?\|months?\|%\|KB\|MB\|GB\|columns?\|tasks?\|commits?\|users?\|ports?\|items?)', content, re.IGNORECASE)
#348	for num, unit in num_matches[:5]:
#349	facts.append({
#350	"content": f"FACT metric: {num}{unit}",
#351	"importance": 0.65,
#352	})
#353
#354	# Pattern 3: Dates
#355	date_patterns = [
#356	r'(January\|February\|March\|April\|May\|June\|July\|August\|September\|October\|November\|December)\s+\d{1,2},?\s*\d{4}',
#357	r'\d{4}-\d{2}-\d{2}',
#358	]
#359	for pat in date_patterns:
#360	for match in re.findall(pat, content, re.IGNORECASE):
#361	if isinstance(match, tuple):
#362	match = " ".join(match)
#363	facts.append({
#364	"content": f"FACT date: {match}",
#365	"importance": 0.7,
#366	})
#367
#368	# Pattern 4: Deadlines
#369	deadline_matches = re.findall(r'(deadline\|due by\|sprint ends?\|sprint \d+)\s[:\-]?\s([^.,;!?\n]{5,80})', content, re.IGNORECASE)
#370	for ctx, detail in deadline_matches[:3]:
#371	facts.append({
#372	"content": f"FACT deadline: {ctx} {detail.strip()}",
#373	"importance": 0.7,
#374	})
#375
#376	# Pattern 5: Negations ("I have never", "I have not") - critical for CR
#377	negations = re.findall(r'(I(?: have\|\'ve)?\s*(?:never\|not)\s+[^.,;!?\n]{15,120})', content, re.IGNORECASE)
#378	for neg in negations[:3]:
#379	facts.append({
#380	"content": f"FACT negation: {neg.strip()}",
#381	"importance": 0.75,
#382	})
#383
#384	# Pattern 6: Decisions / choices
#385	choices = re.findall(r'(?:decided to\|chose to\|opted for\|selected\|picked\|switching to)\s+([^.,;!?\n]{10,120})', content, re.IGNORECASE)
#386	for choice in choices[:3]:
#387	facts.append({
#388	"content": f"FACT decision: {choice.strip()}",
#389	"importance": 0.65,
#390	})
#391
#392	# Pattern 7: Ordinal sequence markers ("first", "then", "finally") for EO
#393	ordinals = re.findall(r'((?:first\|second\|third\|fourth\|fifth\|finally\|next\|then\|after that)[^.,;!?\n]{15,120})', content, re.IGNORECASE)
#394	for ord_text in ordinals[:5]:
#395	facts.append({
#396	"content": f"FACT sequence: {ord_text.strip()}",
#397	"importance": 0.6,
#398	})
#399
#400	# Pattern 8: Entity-action pairs ("transactions table" + "add") for MR
#401	entities = re.findall(r'(?:the\|my\|our)\s+([a-z_]+\s*(?:table\|model\|schema\|API\|endpoint\|function\|module\|route\|handler))\s+(?:needs?\|requires?\|should\|could\|would\|will\|has\|have)\s+([^.,;!?\n]{10,80})', content, re.IGNORECASE)
#402	for entity, action in entities[:5]:
#403	facts.append({
#404	"content": f"FACT entity: {entity.strip()} -> {action.strip()}",
#405	"importance": 0.65,
#406	})
#407
#408	return facts[:20] # Cap per message
#409
def ingest_conversation(beam: BeamMemory, messages: list[dict]) -> dict:
    """Ingest conversation messages into Mnemosyne BEAM tiers.

    Also builds an in-memory facts index for fact-boosted retrieval.

    Messages are processed in batches of 500. For each batch the non-empty
    messages are stored via ``beam.remember_batch``, every 10th message
    longer than 50 chars is written to the scratchpad, optional cloud fact
    extraction runs when ``beam.use_cloud`` is truthy, and the session's
    working-memory rows are consolidated into one episodic entry and then
    deleted.

    Args:
        beam: Memory backend; ``beam._context_facts`` is created if absent
            and extended in place.
        messages: Dicts with at least a ``content`` key (``role`` optional).

    Returns:
        Stats dict with ``wm_count``, ``ep_count``, ``sp_count``,
        ``total_chars`` and ``ingest_time_ms``; ``fact_count`` appears when
        cloud extraction stored facts and ``error_count`` appears when a
        best-effort step failed.
    """
    import hashlib
    import re as _re2

    start_time = time.perf_counter()
    stats = {"wm_count": 0, "ep_count": 0, "sp_count": 0, "total_chars": 0}

    def _note_failure() -> None:
        # Best-effort steps stay non-fatal, but failures are made visible.
        stats["error_count"] = stats.get("error_count", 0) + 1

    # In-memory context→value facts index for direct fact matching.
    # Format: {"context phrase": ["fact value", ...]} — maps question-like phrases to answers.
    # Example: "My first sprint ends on" → "March 29"
    # Built during ingestion, queried during answering for zero-LLM fact extraction.
    fact_value_re = _re2.compile(
        r'('
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(?:[, ]*\d{4})?\b|'  # dates
        r'\b\d{4}-\d{2}-\d{2}\b|'  # ISO dates
        # numbers+units; "%" is kept outside the trailing \b, which could
        # never match between "%" and whitespace
        r'\b\d+(?:[.,]\d+)?\s*(?:%|(?:ms|sec|mins?|hours?|days?|weeks?|months?|years?|KB|MB|GB|TB|'
        r'rows?|columns?|roles?|features?|bugs?|commits?|cards?|users?|items?|tests?|APIs?|'
        r'endpoints?|sprints?|tickets?)\b)|'
        r'\bv?\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?\b'  # versions
        r')'
    )
    # SKIP version numbers (e.g., "3.39", "2.3.1") — they pollute fact matching
    # and are never the answer to BEAM questions (which ask about dates, counts, names).
    # Compiled once here rather than once per message.
    version_re = _re2.compile(r'^\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?$')

    if not hasattr(beam, '_context_facts'):
        beam._context_facts = {}
    context_facts = beam._context_facts

    BATCH_SIZE = 500

    for batch_start in range(0, len(messages), BATCH_SIZE):
        batch_msgs = messages[batch_start:batch_start + BATCH_SIZE]

        batch_items = []
        for i, msg in enumerate(batch_msgs):
            content = msg.get("content", "")
            if not content.strip():
                continue
            position = batch_start + i
            batch_items.append({
                "content": content,
                "source": f"beam_{msg.get('role', 'unknown')}",
                # Importance cycles through 0.3 .. 0.7 by message position.
                "importance": 0.3 + (0.1 * (position % 5)),
            })
            stats["total_chars"] += len(content)

            # Extract context→value facts: words before AND after each fact value
            for match in fact_value_re.finditer(content):
                value = match.group()
                if version_re.match(value):
                    continue  # Skip bare version numbers
                # Extract context: up to 12 words before + 8 words after the fact
                before = content[:match.start()].split()[-12:]
                after = content[match.end():].split()[:8]
                context = ' '.join(before + after).lower().strip()
                if len(context) > 5:
                    context_facts.setdefault(context, []).append(value)

            # Scratchpad every 10 messages
            if position % 10 == 0 and len(content) > 50:
                try:
                    beam.scratchpad_write(f"[t={position}] {content[:300]}")
                    stats["sp_count"] += 1
                except Exception:
                    _note_failure()

        if not batch_items:
            continue

        beam.remember_batch(batch_items)
        stats["wm_count"] += len(batch_items)

        # Cloud fact extraction: extract facts from batch if enabled
        if getattr(beam, 'use_cloud', False):
            try:
                from mnemosyne.extraction import ExtractionClient
                if getattr(beam, '_extraction_client', None) is None:
                    beam._extraction_client = ExtractionClient()
                facts = beam._extraction_client.extract_facts(batch_msgs)
                if facts:
                    cursor = beam.conn.cursor()
                    for fact in facts:
                        # The id includes batch_start, so the same triple seen in
                        # different batches is stored once per batch.
                        fact_id = hashlib.sha256(
                            f"{fact.get('subject','')}:{fact.get('predicate','')}:{fact.get('object','')}:{batch_start}".encode()
                        ).hexdigest()[:24]
                        cursor.execute("""
                            INSERT OR IGNORE INTO facts
                            (fact_id, session_id, subject, predicate, object,
                            timestamp, source_msg_id, confidence)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                        """, (
                            fact_id,
                            beam.session_id,
                            fact.get("subject", ""),
                            fact.get("predicate", "stated"),
                            fact.get("object", ""),
                            fact.get("timestamp", ""),
                            fact.get("source_msg_id", ""),
                            fact.get("confidence", 0.7),
                        ))
                    beam.conn.commit()
                    stats["fact_count"] = stats.get("fact_count", 0) + len(facts)
            except Exception:
                _note_failure()  # Best-effort; don't fail ingestion

        # Episodic consolidation per batch
        try:
            cursor = beam.conn.cursor()
            # Get ALL working memory items for this session (oldest first)
            cursor.execute("""
                SELECT id, content FROM working_memory
                WHERE session_id = ?
                ORDER BY timestamp ASC
                LIMIT 1000
            """, (beam.session_id,))
            wm_rows = cursor.fetchall()

            if wm_rows:
                wm_ids = [row["id"] for row in wm_rows]
                recent_texts = [row["content"][:100] for row in wm_rows[:5]]
                summary = f"Batch {batch_start // BATCH_SIZE}: " + " | ".join(recent_texts[:3])
                if len(summary) > 500:
                    summary = summary[:497] + "..."

                beam.consolidate_to_episodic(
                    summary=summary,
                    source_wm_ids=wm_ids,
                    source="beam_consolidation",
                    importance=0.4,
                    scope="global",
                )
                stats["ep_count"] += 1

                # Delete consolidated items from working memory to prevent bloat
                placeholders = ",".join("?" * len(wm_ids))
                cursor.execute(f"DELETE FROM working_memory WHERE id IN ({placeholders})", wm_ids)
                stats["wm_count"] -= len(wm_ids)

            beam.conn.commit()
        except Exception:
            _note_failure()

    stats["ingest_time_ms"] = (time.perf_counter() - start_time) * 1000
    return stats
#556
#557
#558	# ============================================================
#559	# LLM Answering with Mnemosyne Memory
#560	# ============================================================
#561
# System prompt for the answering model: enforces a four-step reasoning
# layout plus per-ability formatting rules.
ANSWER_SYSTEM_PROMPT = """You are a precise memory assistant answering questions about past conversations. You receive conversation context that may contain the answer.

CRITICAL: Think step-by-step before answering. Follow this structure:

STEP 1 - RELEVANT FACTS: List all specific facts from the context that relate to the question (dates, numbers, names, events, statements).
STEP 2 - CONTRADICTIONS: If the context contains conflicting statements about the same topic, identify BOTH sides explicitly.
STEP 3 - TEMPORAL/CALCULATIONS: For date/time questions, extract all relevant dates and compute the answer.
STEP 4 - ANSWER: Provide a thorough final answer with all relevant details from the context.

RULES:
- For EVENT ORDERING: list items in chronological order as they appear.
- For CONTRADICTION: explicitly state "The conversation contains contradictory information: [A] vs [B]"
- For SUMMARIZATION: include all key details — project stages, features, timelines, security, database, challenges.
- NEVER say "I don't have enough information" unless absolutely nothing in the context mentions the topic.
- For "how many" questions, provide the specific count, not a range."""

# NOTE: DEFAULT_TOP_K and MAX_MEMORY_CONTEXT_CHARS are also assigned in the
# Config section near the top of this file (10 and 8000). The assignments
# below run later and therefore replace those values for all code after here.
DEFAULT_TOP_K = 30  # Memories to retrieve per question (increased for broader context)
RECENT_CONTEXT_COUNT = 12  # Last N messages to include as recent context
MAX_MEMORY_CONTEXT_CHARS = 16000  # More context for LLM to find contradictions
#581
#582
def _recall_safe(beam: BeamMemory, query: str, top_k: int, temporal_weight: float = 0.0) -> list:
    """Run ``beam.recall`` and fall back to an empty list on any error.

    Args:
        beam: Memory backend to query.
        query: Search text passed through unchanged.
        top_k: Forwarded as the ``top_k`` keyword.
        temporal_weight: Forwarded as the ``temporal_weight`` keyword.

    Returns:
        Whatever ``beam.recall`` returns, or ``[]`` if it raised.
    """
    hits = []
    try:
        hits = beam.recall(query, top_k=top_k, temporal_weight=temporal_weight)
    except Exception:
        # Retrieval is best-effort: a failed lookup counts as "nothing found".
        hits = []
    return hits
#589
#590
#591	def _extract_search_terms(question: str) -> list[str]:
#592	"""Extract diverse search terms from a question for multi-strategy retrieval."""
#593	import re
#594	terms = []
#595
#596	# Extract quoted phrases
#597	quoted = re.findall(r'"([^"]+)"', question)
#598	terms.extend(quoted)
#599
#600	# Extract numbers and units
#601	numbers = re.findall(r'\b\d+[.,]?\d\s(?:ms\|sec\|days?\|weeks?\|months?\|years?\|%\|KB\|MB\|GB\|hours?\|minutes?)\b', question, re.IGNORECASE)
#602	terms.extend(numbers[:5])
#603
#604	# Extract named entities (capitalized phrases)
#605	entities = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', question)
#606	terms.extend(entities[:5])
#607
#608	# Extract version strings
#609	versions = re.findall(r'\bv?\d+\.\d+(?:\.\d+)?\b', question)
#610	terms.extend(versions[:5])
#611
#612	# Extract key nouns (filter out question words)
#613	stop_words = {'have', 'did', 'do', 'does', 'can', 'will', 'would', 'should', 'is', 'are', 'was', 'were',
#614	'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'my', 'me', 'i', 'you',
#615	'how', 'what', 'when', 'where', 'which', 'who', 'why', 'many', 'much'}
#616	words = [w for w in re.findall(r'\b[a-zA-Z]{3,}\b', question) if w.lower() not in stop_words]
#617	terms.extend(words[:10])
#618
#619	# Deduplicate while preserving order
#620	seen = set()
#621	unique = []
#622	for t in terms:
#623	if t.lower() not in seen:
#624	seen.add(t.lower())
#625	unique.append(t)
#626
#627	return unique
#628
#629
def _multi_strategy_recall(beam: BeamMemory, question: str, top_k: int = DEFAULT_TOP_K, ability: str = None) -> list:
    """Multi-strategy retrieval: keyword, semantic, entity, negation, temporal.

    Several recall queries are issued, their results are merged (deduplicated
    on the first 80 characters of ``content``), sorted by ``score`` and cut
    to ``top_k``.

    Args:
        beam: Memory backend to query.
        question: The probing question.
        top_k: Maximum number of memories returned.
        ability: Optional ability code; ``"TR"`` and ``"EO"`` force temporal
            handling.

    Returns:
        Up to ``top_k`` memory dicts, highest score first.
    """
    import re

    all_memories = []
    seen_content_keys = set()

    def _add_unique(mems):
        for mem in mems:
            ck = (mem.get("content") or "")[:80]
            if ck not in seen_content_keys:
                seen_content_keys.add(ck)
                all_memories.append(mem)

    q_lower = question.lower()

    # Detect temporal questions by ability type or keywords.
    # Month names are matched as whole words so that "maybe" is not "may".
    months = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november', 'december']
    months_in_question = [m for m in months if re.search(rf'\b{m}\b', q_lower)]
    temporal_keywords = ['when', 'date', 'deadline', 'sprint', 'day', 'week', 'month',
                         'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                         'how many days', 'how long', 'timeline', 'schedule']
    is_temporal = (
        ability in ('TR', 'EO')
        or bool(months_in_question)
        or any(w in q_lower for w in temporal_keywords)
    )
    temporal_weight = 0.3 if is_temporal else 0.0

    # Strategy 1: Direct question search (mostly keyword via FTS5)
    _add_unique(_recall_safe(beam, question, top_k * 2, temporal_weight=temporal_weight))

    # Strategy 2: Negation search for contradiction detection
    if any(w in q_lower for w in ["have i", "did i", "do i", "am i", "has"]):
        negation_query = question
        # Use the first negation word the question does not already contain.
        for negation in ["never", "did not", "haven't"]:
            if negation not in q_lower:
                negation_query = re.sub(r'(?i)\b(have i|did i|do i|am i)\b', f'I {negation}', negation_query)
                break
        if negation_query != question:
            # An unchanged query would only repeat Strategy 1.
            _add_unique(_recall_safe(beam, negation_query, top_k, temporal_weight=temporal_weight))

    # Strategy 3: Key entity/term searches
    for term in _extract_search_terms(question)[:5]:
        if len(term) > 2:
            _add_unique(_recall_safe(beam, term, max(5, top_k // 3), temporal_weight=temporal_weight))

    # Strategy 4: Temporal search for date-related questions
    if is_temporal:
        # Stronger temporal boost for date-specific sub-queries
        date_temporal_weight = 0.5
        # Search for dates and timelines
        _add_unique(_recall_safe(beam, "deadline schedule timeline date", top_k, temporal_weight=date_temporal_weight))
        # Search for specific months mentioned in the question
        for month in months_in_question:
            _add_unique(_recall_safe(beam, month, max(1, top_k // 2), temporal_weight=date_temporal_weight))

    # Sort by score and return top-k (missing or null scores sort as 0)
    all_memories.sort(key=lambda x: x.get("score") or 0, reverse=True)
    return all_memories[:top_k]
#685
#686
#687	# ============================================================
#688	# Per-Ability Bypasses: TR (Temporal Reasoning) + CR (Contradiction)
#689	# ============================================================
#690
#691	def _extract_timeline_from_conversation(messages: list) -> list[dict]:
#692	"""Extract ALL dates from conversation messages with surrounding event context.
#693	Filters out dates in code snippets. Returns sorted list of {date_obj, date_str, event_text, msg_index}."""
#694	import re as _re
#695	from datetime import datetime as _dt
#696
#697	timeline = []
#698
#699	# Month name map
#700	MONTH_MAP = {
#701	'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
#702	'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
#703	'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5,
#704	'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10,
#705	'november': 11, 'december': 12,
#706	}
#707
#708	# Code indicators to filter out
#709	CODE_INDICATORS = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'CREATE TABLE',
#710	'def ', 'import ', 'print(', 'return ', '```', 'function(',
#711	'jsonify', 'datetime', 'params', 'cursor.']
#712
#713	def _is_code_context(text: str, match_start: int) -> bool:
#714	"""Check if a date match appears to be in a code snippet."""
#715	# Check ~200 chars around match for code indicators
#716	start = max(0, match_start - 100)
#717	end = min(len(text), match_start + 100)
#718	surrounding = text[start:end]
#719	# If backticks within 200 chars, it's code
#720	if '```' in surrounding or '`' in surrounding:
#721	return True
#722	# If multiple code indicators present
#723	code_count = sum(1 for ci in CODE_INDICATORS if ci in surrounding)
#724	if code_count >= 2:
#725	return True
#726	# ISO date alone (2024-01-15) in a line with code indicators = likely code
#727	if _re.search(r'\b\d{4}-\d{2}-\d{2}\b', surrounding):
#728	if any(ci in surrounding for ci in CODE_INDICATORS):
#729	return True
#730	return False
#731
#732	# Track the conversation year context
#733	year_mentions = []
#734	for msg in messages:
#735	years = _re.findall(r'\b(20\d{2})\b', msg.get("content", ""))
#736	year_mentions.extend(int(y) for y in years)
#737	# Use the most common year > 2020 as default
#738	default_year = 2024
#739	if year_mentions:
#740	from collections import Counter
#741	year_counts = Counter(y for y in year_mentions if 2020 <= y <= 2030)
#742	if year_counts:
#743	default_year = year_counts.most_common(1)[0][0]
#744
#745	for i, msg in enumerate(messages):
#746	content = msg.get("content", "")
#747	if not content:
#748	continue
#749
#750	# Pattern 1: "Month Day, Year" (e.g. "March 15, 2024")
#751	for m in _re.finditer(
#752	r'(?P<month>January\|February\|March\|April\|May\|June\|July\|August\|September\|October\|November\|December\|'
#753	r'Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)[a-z]*\s+'
#754	r'(?P<day>\d{1,2})(?:st\|nd\|rd\|th)?[,\s]+(?P<year>\d{4})',
#755	content, _re.IGNORECASE
#756	):
#757	if _is_code_context(content, m.start()):
#758	continue
#759	month_num = MONTH_MAP.get(m.group('month').lower()[:3])
#760	if month_num:
#761	try:
#762	dt = _dt(int(m.group('year')), month_num, int(m.group('day')))
#763	start = max(0, m.start() - 60)
#764	end = min(len(content), m.end() + 60)
#765	event_text = content[start:end].strip()
#766	timeline.append({
#767	'date_obj': dt, 'date_str': m.group(0),
#768	'event_text': event_text, 'msg_index': i,
#769	})
#770	except ValueError:
#771	pass
#772
#773	# Pattern 2: "Month Day" without year (e.g. "March 29")
#774	for m in _re.finditer(
#775	r'(?P<month>January\|February\|March\|April\|May\|June\|July\|August\|September\|October\|November\|December\|'
#776	r'Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)[a-z]*\s+'
#777	r'(?P<day>\d{1,2})(?:st\|nd\|rd\|th)?'
#778	r'(?![\d,\s]*\d{4})', # NOT followed by year
#779	content, _re.IGNORECASE
#780	):
#781	if _is_code_context(content, m.start()):
#782	continue
#783	month_num = MONTH_MAP.get(m.group('month').lower()[:3])
#784	if month_num:
#785	try:
#786	dt = _dt(default_year, month_num, int(m.group('day')))
#787	start = max(0, m.start() - 60)
#788	end = min(len(content), m.end() + 60)
#789	event_text = content[start:end].strip()
#790	timeline.append({
#791	'date_obj': dt, 'date_str': m.group(0),
#792	'event_text': event_text, 'msg_index': i,
#793	})
#794	except ValueError:
#795	pass
#796
#797	# Sort chronologically and deduplicate (same date, same event text)
#798	timeline.sort(key=lambda x: (x['date_obj'], x['event_text']))
#799	seen = set()
#800	deduped = []
#801	for t in timeline:
#802	key = (t['date_str'], t['event_text'][:40])
#803	if key not in seen:
#804	seen.add(key)
#805	deduped.append(t)
#806
#807	return deduped
#808
#809
#810	def _build_tr_timeline_prompt(timeline: list[dict]) -> str:
#811	"""Build a structured timeline prompt for TR questions."""
#812	if not timeline:
#813	return ""
#814
#815	lines = ["CRITICAL TIMELINE (all dates extracted from the conversation, use ONLY these dates):"]
#816	for t in timeline:
#817	lines.append(f" [{t['date_obj'].strftime('%Y-%m-%d')}] {t['date_str']}: ...{t['event_text'][:100]}...")
#818
#819	return "\n".join(lines)
#820
#821
#822	def _compute_tr_python(question: str, timeline: list[dict]) -> str \| None:
#823	"""Compute TR answer in pure Python (date math, no LLM). Returns answer string or None."""
#824	import re as _re
#825	from datetime import timedelta as _td
#826
#827	q_lower = question.lower()
#828
#829	# Detect question type: "how many days between X and Y"
#830	# Extract event keywords from question
#831	event_keywords = []
#832	# Look for "end of first sprint", "start of second sprint" type phrases
#833	phrases = _re.findall(r'(?:end\|start\|beginning\|completion\|finish\|launch\|release\|deploy\|merge\|push\|commit\|sprint\|milestone\|phase\|wave\|beta\|alpha\|MVP\|demo\|presentation\|meeting\|call\|review\|audit\|test\|benchmark)[a-z]\s+(?:of\s+)?(?:the\s+)?(?:my\s+)?(?:first\|second\|third\|\d+(?:st\|nd\|rd\|th)?)?\s[a-z]+(?:\s+[a-z]+){0,3}', q_lower)
#834	event_keywords.extend(phrases)
#835
#836	# Also try simpler: extract noun phrases from question
#837	q_words = q_lower.replace('?', '').split()
#838
#839	# Score each timeline entry against the question
#840	scored = []
#841	for t in timeline:
#842	event = t['event_text'].lower()
#843	score = 0
#844	# Direct substring match bonus
#845	for phrase in event_keywords:
#846	if phrase in event or any(w in event for w in phrase.split() if len(w) > 3):
#847	score += 3
#848	# Word overlap
#849	for w in q_words:
#850	if len(w) > 3 and w in event:
#851	score += 1
#852	# Date proximity bonus (prefer dates with event context)
#853	if len(t['event_text']) > 20:
#854	score += 2
#855	scored.append((score, t))
#856
#857	scored.sort(key=lambda x: x[0], reverse=True)
#858
#859	# Try to find two distinct events
#860	if len(scored) >= 2 and scored[0][0] > 0 and scored[1][0] > 0:
#861	t1 = scored[0][1]
#862	t2 = scored[1][1]
#863	d1 = t1['date_obj']
#864	d2 = t2['date_obj']
#865	diff = abs((d2 - d1).days)
#866
#867	# Determine which is earlier/later based on question
#868	if 'between' in q_lower:
#869	evt_a = t1['date_str'] if d1 <= d2 else t2['date_str']
#870	evt_b = t2['date_str'] if d1 <= d2 else t1['date_str']
#871	d_a = d1 if d1 <= d2 else d2
#872	d_b = d2 if d1 <= d2 else d1
#873	else:
#874	evt_a, evt_b = t1['date_str'], t2['date_str']
#875	d_a, d_b = d1, d2
#876	diff = abs((d_b - d_a).days)
#877
#878	answer = (
#879	f"Between {evt_a} ({d_a.strftime('%B %d, %Y')}) and "
#880	f"{evt_b} ({d_b.strftime('%B %d, %Y')}), "
#881	f"there are {diff} days."
#882	)
#883	return answer
#884
#885	# Fallback: just take the two most relevant dates by word overlap
#886	if len(scored) >= 2:
#887	best = [t for s, t in scored if s > 0][:2]
#888	if len(best) >= 2:
#889	d1, d2 = best[0]['date_obj'], best[1]['date_obj']
#890	diff = abs((d2 - d1).days)
#891	return (
#892	f"Based on the conversation timeline, the time between "
#893	f"{best[0]['date_str']} and {best[1]['date_str']} is {diff} days."
#894	)
#895
#896	return None # Can't compute, let LLM handle it
#897
#898
#899
#900	def _compute_tr_answer(question: str, timeline: list[dict]) -> str \| None:
#901	"""Compute temporal reasoning answer from conversation dates. Returns None if can't compute."""
#902	if not timeline or len(timeline) < 2:
#903	return None
#904
#905	# Build a prompt that presents the timeline and asks the LLM to compute
#906	# This is more robust than trying to match events ourselves
#907	timeline_prompt = _build_tr_timeline_prompt(timeline)
#908
#909	# Build the full prompt that we'll return - the caller will send this to LLM
#910	prompt = (
#911	f"{timeline_prompt}\n\n"
#912	f"QUESTION: {question}\n\n"
#913	f"INSTRUCTIONS:\n"
#914	f"1. Identify the two events mentioned in the question\n"
#915	f"2. Find the corresponding dates in the timeline above\n"
#916	f"3. Compute the time difference between them\n"
#917	f"4. State the answer clearly with both dates and the computed difference\n\n"
#918	f"ANSWER:"
#919	)
#920	return prompt # Return the prompt, not the answer - caller passes to LLM
#921
#922
#923	def _detect_contradictions(messages: list, question: str) -> str \| None:
#924	"""Scan conversation for contradictory statements about the question topic.
#925	Returns contradiction context string to inject into prompt, or None if none found."""
#926	import re as _re
#927
#928	# Extract the key topic from the question
#929	# "Have I worked with Flask routes?" -> key terms: "flask routes", "http requests"
#930	# "Have I integrated Flask-Login?" -> key terms: "flask-login", "session management"
#931
#932	# Strip question words to get the core topic
#933	q_clean = _re.sub(r'^(?:Have I\|Did I\|Do I\|Am I\|Has)\s+(?:ever\s+)?', '', question, flags=_re.IGNORECASE)
#934	q_clean = _re.sub(r'\s+(?:in this project\|across my sessions\|in my project)\s*\??$', '', q_clean, flags=_re.IGNORECASE)
#935	q_clean = q_clean.strip().rstrip('?').strip()
#936
#937	# Extract meaningful noun phrases
#938	words = _re.findall(r'\b[a-zA-Z][a-zA-Z\-]+\b', q_clean)
#939	# Filter to key content words (nouns, tech terms)
#940	key_terms = []
#941	for w in words:
#942	wl = w.lower()
#943	if len(wl) > 2 and wl not in ('the', 'and', 'for', 'with', 'any', 'this', 'that', 'have', 'has', 'been'):
#944	key_terms.append(wl)
#945
#946	if not key_terms:
#947	return None
#948
#949	# Scan all messages for mentions of ANY key term
#950	affirmatives = []
#951	negatives = []
#952
#953	NEGATION_WORDS = {'never', 'not', "n't", 'no', 'without', 'cannot', "can't", 'nothing', 'none'}
#954
#955	for i, msg in enumerate(messages):
#956	content = msg.get("content", "")
#957	content_lower = content.lower()
#958
#959	# Check if message mentions any key term
#960	matched_terms = [t for t in key_terms if t in content_lower]
#961	if not matched_terms:
#962	continue
#963
#964	# Check for negation in the SENTENCE containing the matched term
#965	# (BEAM contradictions embed negation near the topic mention)
#966	has_negation = False
#967	for term in matched_terms:
#968	# Find the sentence containing this term
#969	term_pos = content_lower.find(term)
#970	if term_pos < 0:
#971	continue
#972	# Extract sentence context (200 chars around term, or to sentence boundaries)
#973	start = max(0, term_pos - 150)
#974	end = min(len(content_lower), term_pos + 150)
#975	sentence = content_lower[start:end]
#976	# Check for negation words in this sentence
#977	for nw in NEGATION_WORDS:
#978	if nw in sentence:
#979	has_negation = True
#980	break
#981	if has_negation:
#982	break
#983
#984	snippet = content[:250].strip()
#985	if has_negation:
#986	negatives.append(f"[Msg {i}] {content[:250].strip()}")
#987	else:
#988	affirmatives.append(f"[Msg {i}] {content[:250].strip()}")
#989
#990	if affirmatives and negatives:
#991	ctx = "CRITICAL: CONTRADICTORY INFORMATION DETECTED\n\n"
#992	ctx += "The conversation contains BOTH affirmative AND negative statements about this topic:\n\n"
#993	ctx += "Statements suggesting this WAS done or worked on:\n"
#994	for a in affirmatives[:5]:
#995	ctx += f" - {a}\n"
#996	ctx += "\nStatements suggesting this was NOT done:\n"
#997	for n in negatives[:5]:
#998	ctx += f" - {n}\n"
#999	ctx += "\nYOU MUST explicitly identify the contradiction and present BOTH sides. "
#1000	ctx += "Do NOT answer with just one side. The correct response begins with "
#1001	ctx += "'I notice you've mentioned contradictory information about this.'"
#1002	return ctx
#1003
#1004	return None
#1005
#1006
# Abilities for which a regex-extracted fact may be returned directly:
# Information Extraction (IE) and Knowledge Understanding (KU). Multi-hop and
# reasoning-heavy abilities always go through the LLM.
_CONTEXT_FACT_ABILITIES = frozenset({'IE', 'KU'})

# Question words ignored when matching a question against context phrases.
_CONTEXT_FACT_STOPWORDS = frozenset({
    'when', 'does', 'do', 'did', 'what', 'how', 'where', 'which', 'who', 'why',
    'is', 'are', 'was', 'were', 'can', 'will', 'would', 'should', 'could', 'may',
    'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'my', 'me', 'i', 'you',
})


def _match_context_fact(beam: BeamMemory, question: str, ability: str):
    """Return the best directly-matching fact value from ``beam._context_facts``, or None.

    A context phrase qualifies when it shares at least two words with the
    filtered question; among qualifying phrases the one with the highest
    overlap / phrase-length ratio wins and its first value is returned.
    """
    facts = getattr(beam, '_context_facts', None)
    if ability not in _CONTEXT_FACT_ABILITIES or not facts:
        return None

    q_set = {
        w.lower() for w in question.split()
        if w.lower() not in _CONTEXT_FACT_STOPWORDS and len(w) > 1
    }
    best_match = None
    best_score = 0
    for context_phrase, values in facts.items():
        if not values:
            continue  # nothing to return for this phrase
        c_words = set(context_phrase.split())
        overlap = q_set & c_words
        if len(overlap) < 2:
            continue
        score = len(overlap) / max(len(c_words), 1)
        if score > best_score:
            best_score = score
            best_match = values[0]
    return best_match


def _build_answer_messages(context: str, question: str, cr_context) -> list:
    """Assemble the chat messages for the answering LLM, with optional CR prefix."""
    cr_prefix = f"\n\n{cr_context}\n\n" if cr_context else ""
    return [
        {"role": "system", "content": ANSWER_SYSTEM_PROMPT},
        {"role": "user", "content": f"{cr_prefix}{context}\n\nQUESTION: {question}\n\nANSWER:"},
    ]


def answer_with_memory(llm: LLMClient, beam: BeamMemory, question: str,
                       conversation_messages: list = None, top_k: int = DEFAULT_TOP_K,
                       ability: str = None) -> str:
    """Retrieve memories and have the LLM answer the question.

    Strategy, in order:
      1. TR questions: extract a date timeline from the raw conversation and
         ask the LLM to do the date math on it.
      2. CR questions: detect contradictory statements and prepend them to
         whichever prompt is used below.
      3. FULL_CONTEXT_MODE (env var): answer IE/KU questions from a direct
         context->value fact match if possible, otherwise send the (truncated)
         full conversation to the LLM.
      4. Default: multi-strategy recall, optional cloud fact recall,
         fact-density reranking, then an LLM call over recent + retrieved
         context (or a direct fact answer for IE/KU).

    Args:
        llm: Client used for answering.
        beam: Memory backend for the conversation.
        question: Probing question.
        conversation_messages: Raw conversation messages (dicts with ``role``
            and ``content``), if available.
        top_k: Base number of memories to retrieve (3x is fetched for reranking).
        ability: BEAM ability abbreviation (e.g. 'TR', 'CR', 'IE', 'KU').

    Returns:
        The answer text; whatever ``llm.chat`` returns when the LLM is used
        (callers treat None as an LLM failure).
    """
    # ---- PER-ABILITY BYPASSES (zero-LLM or augmented) ----

    # TR (Temporal Reasoning): compute answer from extracted dates
    if ability == 'TR' and conversation_messages:
        timeline = _extract_timeline_from_conversation(conversation_messages)
        print(f" [TR-bypass] extracted {len(timeline)} dates from {len(conversation_messages)} msgs")
        if timeline and len(timeline) >= 2:
            tr_prompt = _compute_tr_answer(question, timeline)
            if tr_prompt:
                messages = [
                    {"role": "system", "content": "You are a precise date calculator. Use ONLY the dates from the provided timeline. Output ONLY the answer, no explanation."},
                    {"role": "user", "content": tr_prompt},
                ]
                answer = llm.chat(messages, temperature=0.0, max_tokens=4096)
                # The answer may be None on LLM failure; don't crash while logging.
                print(f" [TR-bypass] LLM answer: {(answer or '')[:150]}")
                return answer
            print(" [TR-bypass] _compute_tr_answer returned None")
        else:
            print(" [TR-bypass] no timeline extracted or too few dates")

    # CR (Contradiction Resolution): detect contradictory statements
    _cr_context = None
    if ability == 'CR' and conversation_messages:
        _cr_context = _detect_contradictions(conversation_messages, question)
        if _cr_context:
            print(f" [CR-detect] FOUND contradictions, injecting context ({len(_cr_context)} chars)")
        else:
            print(" [CR-detect] no contradictions found")
    # ---- END PER-ABILITY BYPASSES ----

    # FULL-CONTEXT MODE: send the entire conversation to the LLM, bypassing
    # Mnemosyne retrieval. This establishes the LLM's reading-comprehension
    # ceiling. Controlled by the FULL_CONTEXT_MODE env var.
    _full_context = os.environ.get("FULL_CONTEXT_MODE", "").lower() in ("1", "true", "yes")
    # DEBUG
    if os.environ.get("FULL_CONTEXT_MODE"):
        print(f" [DEBUG full-context] env={_full_context}, msgs={bool(conversation_messages)}, count={len(conversation_messages) if conversation_messages else 0}")
    if _full_context and conversation_messages:
        # ---- Phase 1: context->value matching for factual questions (IE/KU) ----
        direct = _match_context_fact(beam, question, ability)
        if direct:
            return direct  # Direct fact answer, zero LLM cost

        # ---- Phase 2: Full-context LLM fallback ----
        full_parts = []
        total_chars = 0
        for msg in conversation_messages:
            role = msg.get("role", "unknown")
            content = msg.get("content", "")
            if content.strip():
                line = f"[{role}]: {content}"
                # Stop once the conversation exceeds twice the memory budget.
                if total_chars + len(line) > MAX_MEMORY_CONTEXT_CHARS * 2:
                    break
                full_parts.append(line)
                total_chars += len(line)

        context = "FULL CONVERSATION:\n" + "\n".join(full_parts)
        return llm.chat(_build_answer_messages(context, question, _cr_context),
                        temperature=0.1, max_tokens=2048)

    # ALWAYS use multi-strategy retrieval to test Mnemosyne's recall quality:
    # this benchmark measures MEMORY performance, not LLM reading comprehension.
    memories = _multi_strategy_recall(beam, question, top_k * 3, ability=ability)  # 3x for reranking

    # ---- Context->Value fact matching (direct regex-extracted facts, zero-LLM) ----
    context_answer = _match_context_fact(beam, question, ability)

    # If cloud extraction enabled, also search the facts table
    if getattr(beam, 'use_cloud', False):
        try:
            fact_memories = beam.fact_recall(question, top_k=top_k)
            # Convert fact dicts to same format as recall results
            for fact in fact_memories:
                memories.append({
                    "content": f"FACT: {fact['content']}",
                    "score": fact.get("score", 0.5) * 2.0,  # 2x weight for facts
                    "source": "fact_extraction",
                })
            memories.sort(key=lambda x: x.get("score", 0), reverse=True)
        except Exception as exc:
            # Fact recall is best-effort: report the failure and carry on.
            print(f" [fact-recall] skipped: {exc}")

    # A direct context->value match is returned as-is (zero LLM cost). Both
    # recall calls above still run first, as before; only the purely local
    # reranking and prompt building below are skipped.
    if context_answer:
        return context_answer

    # LLM RERANKING: DISABLED (rate-limit avoidance; see Reality Check 5.3).

    # ---- Fact-density reranking (algorithmic, zero-LLM) ----
    # BEAM distractors are generic dev-talk; answer messages carry specific data.
    # Boost messages with dates, numbers, proper nouns, versions, technical terms.
    import re as _re_facts
    _FACT_PATTERNS = [
        (r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(?:[, ]*\d{4})?\b', 2.0),  # dates
        (r'\b\d{4}-\d{2}-\d{2}\b', 2.5),  # ISO dates
        # Numbers with units. NOTE(review): the `\d*\s*` quantifiers were
        # restored so that "5 ms" and "120ms" match - confirm against upstream.
        (r'\b\d+[.,]?\d*\s*(?:ms|sec|mins?|hours?|days?|weeks?|months?|years?|%|KB|MB|GB|TB|rows?|columns?|roles?|features?|bugs?|commits?|cards?|users?|items?|tests?|APIs?|endpoints?|sprints?|tickets?)\b', 1.5),
        (r'\bv?\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?\b', 1.5),  # version strings
        (r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4}\b', 1.0),  # proper noun phrases
        (r'\b[A-Z]{2,8}\b', 0.8),  # acronyms
    ]
    for mem in memories:
        content = mem.get("content", "")
        fact_score = sum(
            len(_re_facts.findall(pattern, content)) * weight
            for pattern, weight in _FACT_PATTERNS
        )
        # Normalize by word count to get fact density
        density = fact_score / max(len(content.split()), 1)
        mem["fact_density"] = round(density, 4)
        # Blend the original score with the (capped) fact density: 60% / 40%
        orig = mem.get("score", mem.get("relevance", 0))
        mem["score"] = orig * 0.6 + min(density * 5.0, 1.0) * 0.4

    memories.sort(key=lambda m: m.get("score", 0), reverse=True)

    # Build recent context from last N messages
    recent_parts = []
    if conversation_messages:
        for msg in conversation_messages[-RECENT_CONTEXT_COUNT:]:
            role = msg.get("role", "unknown")
            content = msg.get("content", "")
            if content.strip():
                recent_parts.append(f"[{role}]: {content[:300]}")

    # Build retrieved memory context (deduplicated, relevance-sorted)
    seen_content = set()
    memory_parts = []
    total_chars = 0
    for mem in memories:
        content = mem.get("content", "")
        # Deduplicate on the first 100 characters
        content_key = content[:100]
        if content_key in seen_content:
            continue
        seen_content.add(content_key)

        score = mem.get("score", mem.get("relevance", 0))
        if isinstance(score, (int, float)) and score < 0.05:
            continue  # Skip very low relevance

        if total_chars + len(content) > MAX_MEMORY_CONTEXT_CHARS:
            # Budget exhausted: add a truncated tail if it is still useful, then stop.
            remaining = MAX_MEMORY_CONTEXT_CHARS - total_chars
            if remaining > 100:
                memory_parts.append(f"[Memory] {content[:remaining]}...")
            break
        memory_parts.append(f"[Memory] {content}")
        total_chars += len(content)

    context_blocks = []
    if recent_parts:
        context_blocks.append("RECENT CONVERSATION:\n" + "\n".join(recent_parts))
    if memory_parts:
        context_blocks.append("RETRIEVED MEMORIES:\n" + "\n\n".join(memory_parts))
    context = "\n\n".join(context_blocks) if context_blocks else "[No memories found]"

    return llm.chat(_build_answer_messages(context, question, _cr_context),
                    temperature=0.1, max_tokens=2048)
#1253
#1254
#1255	# ============================================================
#1256	# LLM-as-Judge: Nugget-Based Scoring (BEAM Protocol)
#1257	# ============================================================
#1258
#1259	JUDGE_SYSTEM_PROMPT = """You are an expert evaluator for a memory benchmark.
#1260	You will be given:
#1261	1. A question about a conversation
#1262	2. A list of RUBRIC ITEMS (expected facts the AI should mention)
#1263	3. The AI's ANSWER
#1264
#1265	For EACH rubric item, check if the AI's answer contains equivalent information:
#1266	- Score 1.0: correct info present, substantially matches the rubric item
#1267	- Score 0.5: partially correct, some key detail missing or slightly wrong
#1268	- Score 0.0: missing or wrong
#1269
#1270	Return ONLY this JSON:
#1271	{"scores":[1.0,0.5,0.0],"overall_score":0.X}
#1272
#1273	Where scores[i] corresponds to rubric[i], and overall_score is the average."""
#1274
#1275
def _coerce_unit_score(value):
    """Return ``value`` as a float if it is a real number in [0, 1], else None."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    value = float(value)
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not 0.0 <= value <= 1.0:
        return None
    return value


def judge_with_rubrics(llm: LLMClient, question: str, rubric: list, ai_answer: str) -> dict:
    """Judge an AI answer against pre-written BEAM rubric items.

    Args:
        llm: Client used for judging.
        question: The probing question.
        rubric: Expected facts the answer should contain.
        ai_answer: The answer to score.

    Returns:
        A dict that always has a numeric ``overall_score`` in [0, 1]. On
        success it is the judge's parsed JSON (``scores`` plus any other keys
        it returned). With no rubric, no response, or unusable JSON it
        contains ``scores``, ``overall_score`` and an ``assessment`` message.
    """
    if not rubric:
        return {"scores": [], "overall_score": 0.0, "assessment": "no rubric available"}

    rubric_text = "\n".join(f"{i+1}. {item}" for i, item in enumerate(rubric))

    user_prompt = f"""QUESTION: {question}

RUBRIC ITEMS:
{rubric_text}

AI's ANSWER: {ai_answer}

For each rubric item, score how well the AI's answer matches. Return JSON with scores array and overall_score (average)."""

    messages = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    response = llm.chat(messages, temperature=0.0, max_tokens=500)

    if response is None:
        return {
            "scores": [0.0] * len(rubric),
            "overall_score": 0.0,
            "assessment": "LLM judge returned None (timeout or error)",
        }

    # The judge may wrap the JSON in prose: parse from the first "{" to the last "}".
    parsed = None
    json_start = response.find("{")
    json_end = response.rfind("}") + 1
    if json_start >= 0 and json_end > json_start:
        try:
            parsed = json.loads(response[json_start:json_end])
        except (json.JSONDecodeError, ValueError):
            parsed = None

    if isinstance(parsed, dict):
        overall = _coerce_unit_score(parsed.get("overall_score"))
        if overall is None:
            # Missing / non-numeric / out-of-range overall score: fall back to
            # the mean of the valid per-item scores, if there are any.
            raw_items = parsed.get("scores")
            items = [_coerce_unit_score(s) for s in raw_items] if isinstance(raw_items, list) else []
            items = [s for s in items if s is not None]
            if items:
                overall = sum(items) / len(items)
        if overall is not None:
            parsed["overall_score"] = overall
            return parsed

    # Fallback: basic text matching
    return {
        "scores": [0.0] * len(rubric),
        "overall_score": basic_text_similarity(ai_answer, " ".join(rubric)),
        "assessment": "JSON parse failed; using fallback",
        "raw_response": response,
    }
#1324
#1325
def basic_text_similarity(text1: str, text2: str) -> float:
    """Token-overlap similarity used when the LLM judge's output is unusable.

    Both texts are lower-cased and split on whitespace; the result is the
    size of the shared token set divided by the size of the combined token
    set, or 0.0 when either text has no tokens.
    """
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())
    if not (tokens_a and tokens_b):
        return 0.0
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)
#1335
#1336
#1337	# ============================================================
#1338	# Evaluation Runner
#1339	# ============================================================
#1340
def evaluate_conversation(
    llm: LLMClient,
    judge_llm: LLMClient,
    beam: BeamMemory,
    conversation: dict,
    resume_ids: set = None,
) -> dict:
    """Evaluate all probing questions for one conversation.

    For each question (up to BENCHMARK_QUERIES_PER_CONV) the answering LLM is
    called via answer_with_memory and the result is scored by
    judge_with_rubrics. Questions whose id is in ``resume_ids`` and questions
    without text or ideal answer are skipped.

    Args:
        llm: Client used for answering.
        judge_llm: Client used for judging.
        beam: Memory backend already populated with the conversation.
        conversation: Dict with ``id``, ``scale``, ``questions`` and
            (optionally) ``messages``.
        resume_ids: Question ids ("<conv_id>:q<index>") to skip.

    Returns:
        Dict with ``conversation_id``, ``scale``, ``num_questions``,
        ``num_evaluated`` and the per-question ``results`` list.
    """
    conv_id = conversation["id"]
    questions = conversation["questions"][:BENCHMARK_QUERIES_PER_CONV]
    results = []

    print(f" Conversation {conv_id}: {len(questions)} questions")

    for i, q in enumerate(questions):
        qid = f"{conv_id}:q{i}"
        if resume_ids and qid in resume_ids:
            continue

        question = q["question"]
        ideal = q["ideal_answer"]
        rubric = q.get("rubric", [])
        # Map dataset ability names to BEAM abbreviations
        ability = q.get("ability", "unknown")
        ability = ABILITY_MAP.get(ability, ability)

        if not question or not ideal:
            continue

        # Step 1: LLM answers using Mnemosyne memories + conversation context
        t0 = time.perf_counter()
        ai_answer = answer_with_memory(llm, beam, question,
                                       conversation_messages=conversation.get("messages", []),
                                       ability=ability)
        answer_time = time.perf_counter() - t0

        # Handle None answer (LLM timeout/error)
        if ai_answer is None:
            ai_answer = "[LLM_ERROR: No response from answering model]"

        # Step 2: LLM-as-judge scores the answer
        t0 = time.perf_counter()
        judgment = judge_with_rubrics(judge_llm, question, rubric, ai_answer)
        judge_time = time.perf_counter() - t0
        if not isinstance(judgment, dict):
            judgment = {}

        # The judge's JSON comes from an LLM; never let a non-numeric score
        # break formatting or aggregation.
        raw_score = judgment.get("overall_score", 0.0)
        if isinstance(raw_score, (int, float)) and not isinstance(raw_score, bool):
            score = float(raw_score)
        else:
            score = 0.0

        result = {
            "qid": qid,
            "ability": ability,
            "question": question[:200],
            "ideal_answer": ideal[:200],
            "ai_answer": ai_answer[:500],
            "score": score,
            # judge_with_rubrics reports per-item "scores" and an "assessment";
            # the older key names are still honoured if present.
            "nuggets": judgment.get("nuggets", judgment.get("scores", [])),
            "assessment": judgment.get("assessment", judgment.get("brief_assessment", "")),
            "answer_time_ms": answer_time * 1000,
            "judge_time_ms": judge_time * 1000,
        }
        results.append(result)

        print(f" [{ability}] score={score:.2f} ans={answer_time * 1000:.0f}ms judge={judge_time * 1000:.0f}ms "
              f"Q: {question[:60]}...")

        # Rate-limit avoidance: long pause between questions (20s to avoid provider burst limits)
        time.sleep(20)

    return {
        "conversation_id": conv_id,
        "scale": conversation["scale"],
        "num_questions": len(questions),
        "num_evaluated": len(results),
        "results": results,
    }
#1415
#1416
def compute_ability_scores(all_results: list[dict]) -> dict:
    """Average question scores per scale and per ability.

    Args:
        all_results: Per-conversation result dicts, each with a ``scale`` and
            a ``results`` list of per-question dicts (``ability``, ``score``).

    Returns:
        ``{scale: {ability: {"avg_score", "count"}, ..., "OVERALL": {...}}}``
        where OVERALL pools every score of that scale. Scales without any
        scored question do not appear.
    """
    grouped = defaultdict(lambda: defaultdict(list))
    for conv in all_results:
        scale_key = conv["scale"]
        for item in conv["results"]:
            ability_key = item.get("ability", "unknown")
            grouped[scale_key][ability_key].append(item.get("score", 0.0))

    def _stats(values: list) -> dict:
        mean = sum(values) / len(values) if values else 0.0
        return {"avg_score": mean, "count": len(values)}

    summary = {}
    for scale_key, per_ability in grouped.items():
        table = {name: _stats(values) for name, values in per_ability.items()}
        # Overall average across all abilities of this scale
        pooled = [value for values in per_ability.values() for value in values]
        table["OVERALL"] = _stats(pooled)
        summary[scale_key] = table

    return summary
#1451
#1452
#1453	# ============================================================
#1454	# SOTA Comparison
#1455	# ============================================================
#1456
#1457	PUBLISHED_SOTA = {
#1458	"10M": {
#1459	"Hindsight": 64.1,
#1460	"Honcho": 40.6,
#1461	"LIGHT (Llama-4)": 26.6,
#1462	"RAG (Llama-4)": 24.9,
#1463	},
#1464	"1M": {
#1465	"Hindsight": 73.9,
#1466	"Honcho": 63.1,
#1467	"LIGHT (Llama-4)": 33.6,
#1468	"RAG (Llama-4)": 30.7,
#1469	},
#1470	"500K": {
#1471	"Hindsight": 71.1,
#1472	"Honcho": 64.9,
#1473	"LIGHT (Llama-4)": 35.9,
#1474	"RAG (Llama-4)": 33.0,
#1475	},
#1476	"100K": {
#1477	"Hindsight": 73.4,
#1478	"Honcho": 63.0,
#1479	"LIGHT (Llama-4)": 35.8,
#1480	"RAG (Llama-4)": 32.3,
#1481	},
#1482	}
#1483
#1484
def print_sota_report(ability_summary: dict, metadata: dict):
    """Print the per-ability score table and the comparison with published SOTA.

    Args:
        ability_summary: Output of compute_ability_scores
            (``{scale: {ability_or_OVERALL: {"avg_score", "count"}}}``).
        metadata: Run metadata; ``model`` and ``sample_size`` are shown.
    """
    rule = '=' * 80
    systems = ("Hindsight", "Honcho", "LIGHT (Llama-4)", "RAG (Llama-4)")
    scales = sorted(ability_summary.keys())

    def _avg(scale: str, key: str) -> float:
        # Missing scale/ability entries are reported as 0.0.
        return ability_summary[scale].get(key, {}).get("avg_score", 0.0)

    # Header block
    print("\n" + rule)
    print(" MNEMOSYNE BEAM END-TO-END SOTA REPORT")
    print(f" Date: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}")
    print(f" Model: {metadata.get('model', 'unknown')}")
    print(f" Conversations/scale: {metadata.get('sample_size', 'N/A')}")
    print(f" Top-K memories: {DEFAULT_TOP_K}")
    print(" Methodology: LLM answering + LLM-as-judge (nugget scoring, per BEAM protocol)")
    print(rule)

    # Per-ability table: one row per scale, one column per BEAM ability
    print("\n Per-Ability Scores:")
    print(f" {'Scale':<8} {'OVERALL':>8}" + "".join(f" {ab:>6}" for ab in BEAM_ABILITIES))
    for scale in scales:
        overall = _avg(scale, "OVERALL")
        cells = "".join(f" {_avg(scale, ab)*100:>5.1f}%" for ab in BEAM_ABILITIES)
        print(f" {scale:<8} {overall*100:>7.1f}%" + cells)

    # Overall comparison against published systems (0 where no number is known)
    print("\n SOTA Comparison (OVERALL):")
    print(f" {'Scale':<8} {'Mnemosyne':>12}" + "".join(f" {system:>18}" for system in systems))
    for scale in scales:
        our_score = _avg(scale, "OVERALL") * 100
        sota = PUBLISHED_SOTA.get(scale, {})
        cells = "".join(f" {sota.get(system, 0):>17.1f}%" for system in systems)
        print(f" {scale:<8} {our_score:>11.1f}%" + cells)

    print("\n Note: Published SOTA numbers from Hindsight blog (Apr 2026) and BEAM paper Table 3.")
    print(" Mnemosyne uses DeepSeek V4 Pro as answering + judging LLM.")
    print(" Direct comparison valid: identical BEAM dataset, identical LLM-as-judge protocol.")
    print(rule)
#1529
#1530
#1531	# ============================================================
#1532	# Main
#1533	# ============================================================
#1534
def main():
    """Command-line entry point: load BEAM data, evaluate, and report.

    Steps: parse arguments, load the dataset, optionally resume from the
    previous results file, then for every conversation ingest it into a fresh
    temporary Mnemosyne DB and evaluate its questions. Progress is written to
    RESULTS_FILE after each conversation; a summary report is printed and
    saved at the end. Exits with status 1 when no data could be loaded.
    """
    parser = argparse.ArgumentParser(description="BEAM End-to-End Evaluation")
    parser.add_argument("--scales", default="100K,500K,1M,10M",
                        help="Scales to evaluate (comma-separated)")
    parser.add_argument("--sample", type=int, default=3,
                        help="Conversations per scale (0=all)")
    parser.add_argument("--model", default=DEFAULT_MODEL,
                        help="LLM model for answering and judging")
    parser.add_argument("--judge-model", default=None,
                        help="Separate LLM for judging (default: same as --model)")
    parser.add_argument("--full-context", action="store_true",
                        help="Send full conversation to LLM (ceiling test, bypasses retrieval)")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from previous results file")
    parser.add_argument("--dry-run", action="store_true",
                        help="Download data and print stats, don't evaluate")
    parser.add_argument("--use-cloud", action="store_true",
                        help="Enable LLM fact extraction (cloud tier). Requires OPENROUTER_API_KEY.")
    args = parser.parse_args()

    scales = [s.strip() for s in args.scales.split(",")]
    sample_size = args.sample if args.sample > 0 else None  # 0 (or less) means "all"
    judge_model = args.judge_model or args.model

    print(f"{'='*80}")
    print(" BEAM End-to-End Evaluation Pipeline")
    print(f" Scales: {scales}")
    print(f" Sample: {sample_size or 'ALL'} conversations/scale")
    print(f" Model: {args.model}")
    print(f" Judge: {judge_model}")
    if args.full_context:
        # answer_with_memory reads this env var to switch modes.
        os.environ["FULL_CONTEXT_MODE"] = "1"
        print(" Mode: FULL-CONTEXT (bypassing retrieval)")
    print(f"{'='*80}")

    # Load data
    print("\n[1/4] Loading BEAM dataset...")
    data = load_beam_dataset(scales, max_conversations=sample_size)

    if not data:
        print("ERROR: No data loaded. Check HuggingFace token and dataset name.")
        sys.exit(1)

    # Print stats
    print("\n Dataset Summary:")
    for scale, convs in data.items():
        total_msgs = sum(len(c["messages"]) for c in convs)
        total_qs = sum(len(c["questions"]) for c in convs)
        print(f" {scale}: {len(convs)} convs, {total_msgs:,} msgs, {total_qs} questions")

    if args.dry_run:
        print("\n Dry run complete. Exiting.")
        return

    # Load previous results if resuming
    resume_ids = set()
    all_previous = []
    if args.resume and RESULTS_FILE.exists():
        print(f"\n Resuming from {RESULTS_FILE}...")
        with open(RESULTS_FILE, encoding="utf-8") as f:
            prev = json.load(f)
        all_previous = prev.get("results", [])
        for conv_result in all_previous:
            for r in conv_result.get("results", []):
                resume_ids.add(r["qid"])
        print(f" Already evaluated: {len(resume_ids)} questions")

    # Initialize LLM clients
    print("\n[2/4] Initializing LLM clients...")
    llm = LLMClient(model=args.model)
    judge_llm = LLMClient(model=judge_model)

    # Evaluate each conversation
    print(f"\n[3/4] Evaluating... ({len(data)} scales)")
    all_results = list(all_previous)  # empty unless resuming

    try:
        for scale in sorted(data.keys()):
            conversations = data[scale]
            print(f"\n --- Scale: {scale} ({len(conversations)} conversations) ---")

            for conv in conversations:
                # Create fresh Mnemosyne DB for each conversation
                with tempfile.TemporaryDirectory() as tmpdir:
                    db_path = Path(tmpdir) / f"beam_{scale}_{conv['id']}.db"
                    init_beam(db_path)
                    beam = BeamMemory(session_id=f"beam_{scale}_{conv['id']}",
                                      db_path=db_path, use_cloud=args.use_cloud)
                    try:
                        # Ingest
                        t0 = time.perf_counter()
                        ingest_conversation(beam, conv["messages"])
                        ingest_time = time.perf_counter() - t0
                        print(f" Ingested {len(conv['messages'])} msgs in {ingest_time:.1f}s "
                              f"(DB: {os.path.getsize(db_path)/1024:.0f}KB)")

                        # Evaluate
                        conv_result = evaluate_conversation(
                            llm, judge_llm, beam, conv, resume_ids
                        )
                        all_results.append(conv_result)
                    finally:
                        # Close the connection even when ingestion/evaluation
                        # fails, before the temporary directory is removed.
                        beam.conn.close()

                # Save progress after each conversation
                os.makedirs(RESULTS_FILE.parent, exist_ok=True)
                metadata = {
                    "date": datetime.now(timezone.utc).isoformat(),
                    "model": args.model,
                    "judge_model": judge_model,
                    "top_k": DEFAULT_TOP_K,
                    "sample_size": sample_size or "ALL",
                    "scales": scales,
                    "total_conversations": len(all_results),
                }
                with open(RESULTS_FILE, "w", encoding="utf-8") as f:
                    json.dump({"metadata": metadata, "results": all_results}, f, indent=2)
    finally:
        # Cleanup: release the LLM clients whether or not evaluation succeeded.
        llm.close()
        judge_llm.close()

    # Compute and print report
    print("\n[4/4] Computing SOTA report...")
    ability_summary = compute_ability_scores(all_results)

    metadata = {
        "model": args.model,
        "sample_size": sample_size or "ALL",
        "judge_model": judge_model,
    }
    print_sota_report(ability_summary, metadata)

    # Save summary (the directory may not exist yet if no conversation was saved)
    os.makedirs(RESULTS_FILE.parent, exist_ok=True)
    summary_file = RESULTS_FILE.parent / "beam_e2e_summary.json"
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump({
            "date": datetime.now(timezone.utc).isoformat(),
            "metadata": metadata,
            "ability_summary": {
                scale: {
                    ab: {"avg_score": v["avg_score"], "count": v["count"]}
                    for ab, v in abilities.items()
                }
                for scale, abilities in ability_summary.items()
            },
        }, f, indent=2)

    print(f"\n Results saved to: {RESULTS_FILE}")
    print(f" Summary saved to: {summary_file}")
#1682
#1683
# Script entry point: run the evaluation pipeline via main() only when this file is executed directly, not when it is imported as a module.
#1684	if __name__ == "__main__":
#1685	main()
#1686

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public