repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources15d ago| #1 | #!/usr/bin/env python3 |
| #2 | """ |
| #3 | BEAM End-to-End Evaluation Pipeline |
| #4 | =================================== |
| #5 | Evaluates Mnemosyne as a memory backend for LLMs using the official |
| #6 | BEAM benchmark protocol: |
| #7 | 1. Download BEAM dataset from HuggingFace |
| #8 | 2. Ingest conversations into Mnemosyne |
| #9 | 3. For each probing question: retrieve memories -> LLM answers -> LLM-as-judge scores |
| #10 | 4. Report per-scale, per-ability scores comparable to published SOTA |
| #11 | |
| #12 | Published SOTA (BEAM 10M): |
| #13 | Hindsight: 64.1% Honcho: 40.6% LIGHT: 26.6% RAG: 24.9% |
| #14 | |
| #15 | LLM: Nvidia API (deepseek-ai/deepseek-v4-pro) via OpenAI-compatible endpoint. |
| #16 | Fast, cheap (~$2/M tokens), no local GPU needed. |
| #17 | |
| #18 | Usage: |
| #19 | cd /root/.hermes/projects/mnemosyne |
| #20 | .venv/bin/python tools/evaluate_beam_end_to_end.py --sample 5 --scales 100K,500K,1M,10M |
| #21 | |
| #22 | --sample N: conversations per scale (default 3, use 0 for all) |
| #23 | --scales: comma-separated (default 100K,500K,1M,10M) |
| #24 | --mode: retrieval|end_to_end (default end_to_end) |
| #25 | --judge-model: LLM model for judging (default same as answer model) |
| #26 | --resume: skip already-evaluated questions from results file |
| #27 | """ |
| #28 | |
| #29 | import argparse |
| #30 | import ast |
| #31 | import gc |
| #32 | import json |
| #33 | import math |
| #34 | import os |
| #35 | import sys |
| #36 | import tempfile |
| #37 | import time |
| #38 | from collections import defaultdict |
| #39 | from datetime import datetime, timezone |
| #40 | from functools import partial |
| #41 | from pathlib import Path |
| #42 | |
| #43 | # Unbuffered output for real-time progress |
| #44 | print = partial(print, flush=True) |
| #45 | |
| #46 | # --- Setup --- |
| #47 | PROJECT_ROOT = Path(__file__).resolve().parent.parent |
| #48 | sys.path.insert(0, str(PROJECT_ROOT)) |
| #49 | |
| #50 | import urllib.request |
| #51 | import urllib.error |
| #52 | import numpy as np |
| #53 | |
| #54 | from mnemosyne.core.beam import BeamMemory, init_beam, _embeddings, _vec_available, _vec_insert, _fts_search_working, _generate_id |
| #55 | |
| #56 | # --- Config --- |
| #57 | OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "") |
| #58 | if not OPENROUTER_API_KEY: |
| #59 | # Try to load from file — check opencode first, then openrouter |
| #60 | for _kf in ["/tmp/opencode_key.txt", "/tmp/openrouter_key.txt"]: |
| #61 | _key_file = Path(_kf) |
| #62 | if _key_file.exists(): |
| #63 | with open(_key_file) as f: |
| #64 | _content = f.read().strip() |
| #65 | if "export" in _content: |
| #66 | OPENROUTER_API_KEY = _content.split("=", 1)[1].strip().strip('"').strip("'") |
| #67 | else: |
| #68 | OPENROUTER_API_KEY = _content |
| #69 | break |
# NOTE(review): NVIDIA_API_KEY is read here but is not referenced anywhere in
# the part of the file reviewed — confirm whether it is still needed.
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "")
OPENROUTER_BASE_URL = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
DEFAULT_MODEL = "deepseek-v4-pro"
FALLBACK_MODELS = []  # Disabled — fallback cascade burned $30 in credits
# DEFAULT_TOP_K and MAX_MEMORY_CONTEXT_CHARS are assigned a second time in the
# "LLM Answering with Mnemosyne Memory" section further down, and that later
# assignment (30 / 16000) is the one every function actually sees. The values
# here are kept equal to it so this header does not advertise settings that
# never take effect.
DEFAULT_TOP_K = 30  # Memories to retrieve per question
MAX_MEMORY_CONTEXT_CHARS = 16000  # Max chars of retrieved context to send to LLM
BENCHMARK_QUERIES_PER_CONV = 50  # Max probing questions per conversation
RESULTS_FILE = PROJECT_ROOT / "results" / "beam_e2e_results.json"
| #78 | |
# Memory abilities tested by BEAM (10 dimensions)
BEAM_ABILITIES = [
    "IE",   # Information Extraction
    "MR",   # Multi-session Reasoning (dataset key: "multi_session_reasoning")
    "KU",   # Knowledge Update
    "TR",   # Temporal Reasoning
    "ABS",  # Abstention
    "CR",   # Contradiction Resolution
    "EO",   # Event Ordering
    "IF",   # Instruction Following
    "PF",   # Preference Following
    "SUM",  # Summarization
]

# Map dataset ability names to our abbreviations
ABILITY_MAP = {
    "information_extraction": "IE",
    "multi_session_reasoning": "MR",
    "knowledge_update": "KU",
    "temporal_reasoning": "TR",
    "abstention": "ABS",
    "contradiction_resolution": "CR",
    "event_ordering": "EO",
    "instruction_following": "IF",
    "preference_following": "PF",
    "summarization": "SUM",
    # Aliases (shortened spellings of the keys above)
    "multi_session": "MR",
    "knowledge": "KU",
    "temporal": "TR",
    "information": "IE",
}
| #111 | |
| #112 | |
| #113 | # ============================================================ |
| #114 | # LLM Client |
| #115 | # ============================================================ |
| #116 | |
class LLMClient:
    """Minimal client for an OpenAI-compatible ``/chat/completions`` endpoint.

    The endpoint and key default to the module-level ``OPENROUTER_BASE_URL``
    and ``OPENROUTER_API_KEY``. ``call_count`` counts successful API calls.
    """

    _last_429_time = 0  # Class-level rate-limit cooldown (not read by this class)

    def __init__(self, model: str = DEFAULT_MODEL, api_key: str = None, base_url: str = None):
        """Store connection settings.

        Args:
            model: Model identifier sent with every request.
            api_key: Bearer token; falls back to ``OPENROUTER_API_KEY``.
            base_url: API root; falls back to ``OPENROUTER_BASE_URL``.
                A trailing slash is removed.
        """
        self.model = model
        self.api_key = api_key or OPENROUTER_API_KEY
        self.base_url = (base_url or OPENROUTER_BASE_URL).rstrip("/")
        self.fallback_models = FALLBACK_MODELS.copy()
        self.call_count = 0

    def chat(self, messages: list, temperature: float = 0.1, max_tokens: int = 1024) -> str:
        """Send a chat completion request, retrying on rate limits.

        Up to three attempts are made against ``self.model`` only (no fallback
        models). An error whose text contains "429" or "rate" is treated as a
        rate limit and retried after 15 s, then 30 s; any other error ends the
        loop immediately.

        Args:
            messages: Chat messages in OpenAI format.
            temperature: Sampling temperature.
            max_tokens: Completion token limit.

        Returns:
            The completion text, or a string starting with ``[LLM_ERROR:``
            when every attempt failed. This method never raises for API errors.
        """
        max_attempts = 3
        last_error = None
        for attempt in range(max_attempts):
            try:
                return self._call_api(self.model, messages, temperature, max_tokens)
            except Exception as e:  # boundary: callers expect an error string, not an exception
                last_error = str(e)
                rate_limited = "429" in last_error or "rate" in last_error.lower()
                if not rate_limited:
                    break  # Non-retryable error
                # Back off only when another attempt will follow; sleeping
                # after the final failure would just delay the error.
                if attempt < max_attempts - 1:
                    time.sleep(15 * (attempt + 1))

        return f"[LLM_ERROR: all models failed. Last: {last_error}]"

    def _call_api(self, model: str, messages: list, temperature: float, max_tokens: int) -> str:
        """Perform one API call via ``requests``.

        (urllib is blocked by Cloudflare on some providers.)

        Returns:
            The first choice's message content; "" when the API returns a
            null content.

        Raises:
            requests.HTTPError: On a non-2xx response (``raise_for_status``).
            KeyError / IndexError: If the response lacks the expected fields.
        """
        import requests as _requests

        url = f"{self.base_url}/chat/completions"
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://mnemosyne.site",
            "X-Title": "Mnemosyne Benchmark",
        }
        resp = _requests.post(url, json=payload, headers=headers, timeout=60)
        resp.raise_for_status()
        data = resp.json()
        self.call_count += 1
        content = data["choices"][0]["message"]["content"]
        # Honour the declared ``-> str`` contract even when content is null.
        return content if content is not None else ""

    def close(self):
        """No-op; nothing is held open between calls."""
        pass
| #172 | |
| #173 | |
| #174 | # ============================================================ |
| #175 | # Data Loading (adapted from benchmark_beam_sota.py) |
| #176 | # ============================================================ |
| #177 | |
| #178 | def load_beam_dataset(scales: list[str], max_conversations: int = None) -> dict: |
| #179 | """Load BEAM dataset from HuggingFace. Returns dict[scale] -> list[conversation].""" |
| #180 | try: |
| #181 | from datasets import load_dataset |
| #182 | except ImportError: |
| #183 | print("ERROR: 'datasets' package not installed. Run: pip install datasets") |
| #184 | sys.exit(1) |
| #185 | |
| #186 | data = {} |
| #187 | total_loaded = 0 |
| #188 | |
| #189 | for scale in scales: |
| #190 | print(f" Loading BEAM {scale}...") |
| #191 | try: |
| #192 | if scale == "10M": |
| #193 | ds = load_dataset("Mohammadta/BEAM-10M", streaming=True) |
| #194 | split_name = "10M" if "10M" in ds else list(ds.keys())[0] |
| #195 | |
| #196 | conversations = [] |
| #197 | for i, sample in enumerate(ds[split_name]): |
| #198 | if max_conversations and i >= max_conversations: |
| #199 | break |
| #200 | |
| #201 | probing_raw = sample.get("probing_questions", {}) |
| #202 | if isinstance(probing_raw, str): |
| #203 | try: |
| #204 | probing = ast.literal_eval(probing_raw) |
| #205 | except Exception: |
| #206 | probing = {} |
| #207 | else: |
| #208 | probing = probing_raw |
| #209 | |
| #210 | all_questions = [] |
| #211 | for ability, questions in probing.items(): |
| #212 | if isinstance(questions, list): |
| #213 | for q in questions: |
| #214 | if isinstance(q, dict): |
| #215 | all_questions.append({ |
| #216 | "ability": ability, |
| #217 | "question": q.get("question", ""), |
| #218 | "ideal_answer": q.get("ideal_answer", q.get("ideal_response", q.get("answer", q.get("ideal_summary", "")))), |
| #219 | "rubric": q.get("rubric", []), |
| #220 | }) |
| #221 | |
| #222 | # Extract messages from plans |
| #223 | plans = sample.get("plans", []) |
| #224 | all_messages = [] |
| #225 | for plan in plans: |
| #226 | chat_blocks = plan.get("chat", []) if isinstance(plan, dict) else [] |
| #227 | for block in chat_blocks: |
| #228 | if isinstance(block, list): |
| #229 | for msg in block: |
| #230 | if isinstance(msg, dict): |
| #231 | all_messages.append({ |
| #232 | "role": msg.get("role", "unknown"), |
| #233 | "content": msg.get("content", ""), |
| #234 | "index": len(all_messages), |
| #235 | }) |
| #236 | |
| #237 | conversations.append({ |
| #238 | "id": sample.get("conversation_id", str(i)), |
| #239 | "messages": all_messages, |
| #240 | "questions": all_questions, |
| #241 | "scale": "10M", |
| #242 | }) |
| #243 | total_loaded += 1 |
| #244 | |
| #245 | data[scale] = conversations |
| #246 | ds.cleanup_cache_files() if hasattr(ds, 'cleanup_cache_files') else None |
| #247 | del ds |
| #248 | gc.collect() |
| #249 | print(f" Loaded {len(conversations)} conversations") |
| #250 | |
| #251 | else: |
| #252 | # 100K, 500K, 1M scales from the main dataset |
| #253 | ds = load_dataset("Mohammadta/BEAM", streaming=True) |
| #254 | if scale not in ds: |
| #255 | print(f" WARNING: split '{scale}' not found. Available: {list(ds.keys())}") |
| #256 | continue |
| #257 | |
| #258 | conversations = [] |
| #259 | for i, sample in enumerate(ds[scale]): |
| #260 | if max_conversations and i >= max_conversations: |
| #261 | break |
| #262 | |
| #263 | pq_raw = sample.get("probing_questions", "{}") |
| #264 | if isinstance(pq_raw, str): |
| #265 | try: |
| #266 | probing = ast.literal_eval(pq_raw) |
| #267 | except Exception: |
| #268 | probing = {} |
| #269 | else: |
| #270 | probing = pq_raw |
| #271 | |
| #272 | flat_questions = [] |
| #273 | for ability, questions in probing.items(): |
| #274 | if isinstance(questions, list): |
| #275 | for q in questions: |
| #276 | if isinstance(q, dict): |
| #277 | flat_questions.append({ |
| #278 | "ability": ability, |
| #279 | "question": q.get("question", ""), |
| #280 | "ideal_answer": q.get("ideal_answer", q.get("ideal_response", q.get("answer", q.get("ideal_summary", "")))), |
| #281 | "rubric": q.get("rubric", []), |
| #282 | }) |
| #283 | |
| #284 | chat_blocks = sample.get("chat", []) |
| #285 | messages = [] |
| #286 | for block in chat_blocks: |
| #287 | if isinstance(block, list): |
| #288 | for msg in block: |
| #289 | if isinstance(msg, dict): |
| #290 | messages.append({ |
| #291 | "role": msg.get("role", "unknown"), |
| #292 | "content": msg.get("content", ""), |
| #293 | "index": len(messages), |
| #294 | }) |
| #295 | elif isinstance(block, dict): |
| #296 | # Flat format: chat is a list of dicts directly |
| #297 | messages.append({ |
| #298 | "role": block.get("role", "unknown"), |
| #299 | "content": block.get("content", ""), |
| #300 | "index": len(messages), |
| #301 | }) |
| #302 | |
| #303 | conversations.append({ |
| #304 | "id": sample.get("conversation_id", str(i)), |
| #305 | "messages": messages, |
| #306 | "questions": flat_questions, |
| #307 | "scale": scale, |
| #308 | }) |
| #309 | total_loaded += 1 |
| #310 | |
| #311 | data[scale] = conversations |
| #312 | ds.cleanup_cache_files() if hasattr(ds, 'cleanup_cache_files') else None |
| #313 | del ds |
| #314 | gc.collect() |
| #315 | print(f" Loaded {len(conversations)} conversations") |
| #316 | |
| #317 | except Exception as e: |
| #318 | print(f" ERROR loading {scale}: {e}") |
| #319 | import traceback |
| #320 | traceback.print_exc() |
| #321 | |
| #322 | print(f" Total: {total_loaded} conversations across {len(data)} scales") |
| #323 | return data |
| #324 | |
| #325 | |
| #326 | # ============================================================ |
| #327 | # Mnemosyne Ingestion |
| #328 | # ============================================================ |
| #329 | |
| #330 | def _extract_facts(content: str, source: str = "unknown") -> list[dict]: |
| #331 | """Extract structured facts from a message for precision retrieval. |
| #332 | These fact entries complement raw message storage by isolating |
| #333 | specific data points (numbers, dates, versions, negations) that |
| #334 | FTS5 keyword search can match more precisely than in long messages.""" |
| #335 | import re |
| #336 | facts = [] |
| #337 | |
| #338 | # Pattern 1: Version numbers ("Flask 2.3.1", "v0.6.2", "Python 3.11") |
| #339 | ver_matches = re.findall(r'([A-Z][a-zA-Z]+(?:\s*[A-Z][a-zA-Z]+)*)\s+v?(\d+\.\d+(?:\.\d+)?)', content) |
| #340 | for name, ver in ver_matches[:3]: |
| #341 | facts.append({ |
| #342 | "content": f"FACT version: {name.strip()} {ver}", |
| #343 | "importance": 0.7, |
| #344 | }) |
| #345 | |
| #346 | # Pattern 2: Numbers with units ("250ms", "3 columns", "50 tasks", "5000 port") |
| #347 | num_matches = re.findall(r'(\d+(?:[.,]\d+)?)\s*(ms|sec|seconds?|minutes?|hours?|days?|weeks?|months?|%|KB|MB|GB|columns?|tasks?|commits?|users?|ports?|items?)', content, re.IGNORECASE) |
| #348 | for num, unit in num_matches[:5]: |
| #349 | facts.append({ |
| #350 | "content": f"FACT metric: {num}{unit}", |
| #351 | "importance": 0.65, |
| #352 | }) |
| #353 | |
| #354 | # Pattern 3: Dates |
| #355 | date_patterns = [ |
| #356 | r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}', |
| #357 | r'\d{4}-\d{2}-\d{2}', |
| #358 | ] |
| #359 | for pat in date_patterns: |
| #360 | for match in re.findall(pat, content, re.IGNORECASE): |
| #361 | if isinstance(match, tuple): |
| #362 | match = " ".join(match) |
| #363 | facts.append({ |
| #364 | "content": f"FACT date: {match}", |
| #365 | "importance": 0.7, |
| #366 | }) |
| #367 | |
| #368 | # Pattern 4: Deadlines |
| #369 | deadline_matches = re.findall(r'(deadline|due by|sprint ends?|sprint \d+)\s*[:\-]?\s*([^.,;!?\n]{5,80})', content, re.IGNORECASE) |
| #370 | for ctx, detail in deadline_matches[:3]: |
| #371 | facts.append({ |
| #372 | "content": f"FACT deadline: {ctx} {detail.strip()}", |
| #373 | "importance": 0.7, |
| #374 | }) |
| #375 | |
| #376 | # Pattern 5: Negations ("I have never", "I have not") - critical for CR |
| #377 | negations = re.findall(r'(I(?: have|\'ve)?\s*(?:never|not)\s+[^.,;!?\n]{15,120})', content, re.IGNORECASE) |
| #378 | for neg in negations[:3]: |
| #379 | facts.append({ |
| #380 | "content": f"FACT negation: {neg.strip()}", |
| #381 | "importance": 0.75, |
| #382 | }) |
| #383 | |
| #384 | # Pattern 6: Decisions / choices |
| #385 | choices = re.findall(r'(?:decided to|chose to|opted for|selected|picked|switching to)\s+([^.,;!?\n]{10,120})', content, re.IGNORECASE) |
| #386 | for choice in choices[:3]: |
| #387 | facts.append({ |
| #388 | "content": f"FACT decision: {choice.strip()}", |
| #389 | "importance": 0.65, |
| #390 | }) |
| #391 | |
| #392 | # Pattern 7: Ordinal sequence markers ("first", "then", "finally") for EO |
| #393 | ordinals = re.findall(r'((?:first|second|third|fourth|fifth|finally|next|then|after that)[^.,;!?\n]{15,120})', content, re.IGNORECASE) |
| #394 | for ord_text in ordinals[:5]: |
| #395 | facts.append({ |
| #396 | "content": f"FACT sequence: {ord_text.strip()}", |
| #397 | "importance": 0.6, |
| #398 | }) |
| #399 | |
| #400 | # Pattern 8: Entity-action pairs ("transactions table" + "add") for MR |
| #401 | entities = re.findall(r'(?:the|my|our)\s+([a-z_]+\s*(?:table|model|schema|API|endpoint|function|module|route|handler))\s+(?:needs?|requires?|should|could|would|will|has|have)\s+([^.,;!?\n]{10,80})', content, re.IGNORECASE) |
| #402 | for entity, action in entities[:5]: |
| #403 | facts.append({ |
| #404 | "content": f"FACT entity: {entity.strip()} -> {action.strip()}", |
| #405 | "importance": 0.65, |
| #406 | }) |
| #407 | |
| #408 | return facts[:20] # Cap per message |
| #409 | |
def ingest_conversation(beam: BeamMemory, messages: list[dict]) -> dict:
    """Ingest conversation messages into Mnemosyne BEAM tiers.

    Messages are processed in batches of 500. For every batch this:
      1. stores each non-empty message through ``beam.remember_batch``;
      2. records context -> value facts in ``beam._context_facts``, an
         in-memory index built here for fact-boosted retrieval;
      3. writes every 10th message (when longer than 50 chars) to the scratchpad;
      4. if ``beam.use_cloud`` is truthy, runs cloud fact extraction and
         inserts the results into the ``facts`` table;
      5. consolidates the session's working memory into one episodic entry
         and deletes the consolidated rows.

    Steps 3-5 are best-effort: failures never abort ingestion. Failures in
    steps 4 and 5 are reported on stdout.

    Args:
        beam: Target memory backend.
        messages: Message dicts; the ``content`` and ``role`` keys are read.

    Returns:
        Stats dict with ``wm_count``, ``ep_count``, ``sp_count``,
        ``total_chars``, ``ingest_time_ms`` and, when cloud extraction
        produced facts, ``fact_count``.
    """
    import hashlib
    import re as _re2

    start_time = time.perf_counter()
    stats = {"wm_count": 0, "ep_count": 0, "sp_count": 0, "total_chars": 0}

    # In-memory context→value facts index for direct fact matching.
    # Format: {"context phrase": ["fact value", ...]} — maps question-like phrases to answers.
    # Example: "My first sprint ends on" → "March 29"
    # Built during ingestion, queried during answering for zero-LLM fact extraction.
    _FACT_VALUE_RE = _re2.compile(
        r'('
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(?:[, ]*\d{4})?\b|'  # dates
        r'\b\d{4}-\d{2}-\d{2}\b|'  # ISO dates
        r'\b\d+[.,]?\d*\s*(?:ms|sec|mins?|hours?|days?|weeks?|months?|years?|%|KB|MB|GB|TB|rows?|columns?|roles?|features?|bugs?|commits?|cards?|users?|items?|tests?|APIs?|endpoints?|sprints?|tickets?)\b|'  # numbers+units
        r'\bv?\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?\b'  # versions
        r')'
    )
    # Version numbers (e.g., "3.39", "2.3.1") are skipped — they pollute fact matching
    # and are never the answer to BEAM questions (which ask about dates, counts, names).
    # Compiled once here rather than once per message.
    # NOTE(review): a "v"-prefixed version such as "v2.3.1" does not match this
    # pattern and is therefore still indexed — confirm whether that is intended.
    _VERSION_RE = _re2.compile(r'^\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?$')

    if not hasattr(beam, '_context_facts'):
        beam._context_facts = {}
    context_facts = beam._context_facts

    BATCH_SIZE = 500

    for batch_start in range(0, len(messages), BATCH_SIZE):
        batch_msgs = messages[batch_start:batch_start + BATCH_SIZE]

        batch_items = []
        for i, msg in enumerate(batch_msgs):
            content = msg.get("content") or ""  # tolerate an explicit None
            if not content.strip():
                continue
            msg_index = batch_start + i
            batch_items.append({
                "content": content,
                "source": f"beam_{msg.get('role', 'unknown')}",
                "importance": 0.3 + (0.1 * (msg_index % 5)),
            })
            stats["total_chars"] += len(content)

            # Extract context→value facts: words before AND after each fact value
            for match in _FACT_VALUE_RE.finditer(content):
                value = match.group()
                if _VERSION_RE.match(value):
                    continue  # Skip bare version numbers
                # Extract context: up to 12 words before + 8 words after the fact
                before = content[:match.start()].split()[-12:]
                after = content[match.end():].split()[:8]
                context = ' '.join(before + after).lower().strip()
                if context and len(context) > 5:
                    context_facts.setdefault(context, []).append(value)

            # Scratchpad every 10 messages
            if msg_index % 10 == 0 and len(content) > 50:
                try:
                    beam.scratchpad_write(f"[t={msg_index}] {content[:300]}")
                    stats["sp_count"] += 1
                except Exception:
                    pass  # Best-effort; a scratchpad failure must not stop ingestion

        if not batch_items:
            continue

        beam.remember_batch(batch_items)
        stats["wm_count"] += len(batch_items)

        # Cloud fact extraction: extract facts from batch if enabled
        if getattr(beam, 'use_cloud', False):
            try:
                from mnemosyne.extraction import ExtractionClient
                if getattr(beam, '_extraction_client', None) is None:
                    beam._extraction_client = ExtractionClient()
                facts = beam._extraction_client.extract_facts(batch_msgs)
                if facts:
                    cursor = beam.conn.cursor()
                    for fact in facts:
                        fact_id = hashlib.sha256(
                            f"{fact.get('subject','')}:{fact.get('predicate','')}:{fact.get('object','')}:{batch_start}".encode()
                        ).hexdigest()[:24]
                        cursor.execute("""
                            INSERT OR IGNORE INTO facts
                            (fact_id, session_id, subject, predicate, object,
                            timestamp, source_msg_id, confidence)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                        """, (
                            fact_id,
                            beam.session_id,
                            fact.get("subject", ""),
                            fact.get("predicate", "stated"),
                            fact.get("object", ""),
                            fact.get("timestamp", ""),
                            fact.get("source_msg_id", ""),
                            fact.get("confidence", 0.7),
                        ))
                    beam.conn.commit()
                    stats["fact_count"] = stats.get("fact_count", 0) + len(facts)
            except Exception as e:
                # Best-effort; don't fail ingestion — but make the failure visible.
                print(f" WARNING: cloud fact extraction failed for batch at {batch_start}: {e}")

        # Episodic consolidation per batch
        try:
            cursor = beam.conn.cursor()
            # Get working memory items for this session (oldest first, capped at 1000)
            cursor.execute("""
                SELECT id, content FROM working_memory
                WHERE session_id = ?
                ORDER BY timestamp ASC
                LIMIT 1000
            """, (beam.session_id,))
            wm_rows = cursor.fetchall()

            if wm_rows:
                wm_ids = [row["id"] for row in wm_rows]
                # The summary is built from the first three rows, 100 chars each.
                leading_texts = [row["content"][:100] for row in wm_rows[:3]]
                summary = f"Batch {batch_start // BATCH_SIZE}: " + " | ".join(leading_texts)
                if len(summary) > 500:
                    summary = summary[:497] + "..."

                beam.consolidate_to_episodic(
                    summary=summary,
                    source_wm_ids=wm_ids,
                    source="beam_consolidation",
                    importance=0.4,
                    scope="global",
                )
                stats["ep_count"] += 1

                # Delete consolidated items from working memory to prevent bloat
                placeholders = ",".join("?" * len(wm_ids))
                cursor.execute(f"DELETE FROM working_memory WHERE id IN ({placeholders})", wm_ids)
                stats["wm_count"] -= len(wm_ids)

            beam.conn.commit()
        except Exception as e:
            # Best-effort, but report it: a silent failure here leaves working
            # memory unconsolidated with no trace in the output.
            print(f" WARNING: episodic consolidation failed for batch at {batch_start}: {e}")

    stats["ingest_time_ms"] = (time.perf_counter() - start_time) * 1000
    return stats
| #556 | |
| #557 | |
| #558 | # ============================================================ |
| #559 | # LLM Answering with Mnemosyne Memory |
| #560 | # ============================================================ |
| #561 | |
# System prompt for the answering LLM: forces a fixed four-step reasoning
# layout and per-ability output rules.
ANSWER_SYSTEM_PROMPT = """You are a precise memory assistant answering questions about past conversations. You receive conversation context that may contain the answer.

CRITICAL: Think step-by-step before answering. Follow this structure:

STEP 1 - RELEVANT FACTS: List all specific facts from the context that relate to the question (dates, numbers, names, events, statements).
STEP 2 - CONTRADICTIONS: If the context contains conflicting statements about the same topic, identify BOTH sides explicitly.
STEP 3 - TEMPORAL/CALCULATIONS: For date/time questions, extract all relevant dates and compute the answer.
STEP 4 - ANSWER: Provide a thorough final answer with all relevant details from the context.

RULES:
- For EVENT ORDERING: list items in chronological order as they appear.
- For CONTRADICTION: explicitly state "The conversation contains contradictory information: [A] vs [B]"
- For SUMMARIZATION: include all key details — project stages, features, timelines, security, database, challenges.
- NEVER say "I don't have enough information" unless absolutely nothing in the context mentions the topic.
- For "how many" questions, provide the specific count, not a range."""

# NOTE: DEFAULT_TOP_K and MAX_MEMORY_CONTEXT_CHARS are also assigned in the
# Config section near the top of this file. The assignments below run later,
# so these are the values that take effect — including as the default
# ``top_k`` of functions defined after this point.
DEFAULT_TOP_K = 30  # Memories to retrieve per question (increased for broader context)
RECENT_CONTEXT_COUNT = 12  # Last N messages to include as recent context
MAX_MEMORY_CONTEXT_CHARS = 16000  # More context for LLM to find contradictions
| #581 | |
| #582 | |
def _recall_safe(beam: BeamMemory, query: str, top_k: int, temporal_weight: float = 0.0) -> list:
    """Call ``beam.recall`` and return [] instead of raising.

    A failed recall is reported on stdout. Returning an empty list without
    any trace would make a broken retriever look like a retriever that
    found nothing, which silently lowers benchmark scores.

    Args:
        beam: Memory backend to query.
        query: Search text.
        top_k: Number of memories requested.
        temporal_weight: Passed through to ``beam.recall``.

    Returns:
        Whatever ``beam.recall`` returns, or [] on any exception.
    """
    try:
        return beam.recall(query, top_k=top_k, temporal_weight=temporal_weight)
    except Exception as e:  # deliberate boundary: retrieval must not abort the run
        print(f" WARNING: recall failed for query {query[:60]!r}: {e}")
        return []
| #589 | |
| #590 | |
| #591 | def _extract_search_terms(question: str) -> list[str]: |
| #592 | """Extract diverse search terms from a question for multi-strategy retrieval.""" |
| #593 | import re |
| #594 | terms = [] |
| #595 | |
| #596 | # Extract quoted phrases |
| #597 | quoted = re.findall(r'"([^"]+)"', question) |
| #598 | terms.extend(quoted) |
| #599 | |
| #600 | # Extract numbers and units |
| #601 | numbers = re.findall(r'\b\d+[.,]?\d*\s*(?:ms|sec|days?|weeks?|months?|years?|%|KB|MB|GB|hours?|minutes?)\b', question, re.IGNORECASE) |
| #602 | terms.extend(numbers[:5]) |
| #603 | |
| #604 | # Extract named entities (capitalized phrases) |
| #605 | entities = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', question) |
| #606 | terms.extend(entities[:5]) |
| #607 | |
| #608 | # Extract version strings |
| #609 | versions = re.findall(r'\bv?\d+\.\d+(?:\.\d+)?\b', question) |
| #610 | terms.extend(versions[:5]) |
| #611 | |
| #612 | # Extract key nouns (filter out question words) |
| #613 | stop_words = {'have', 'did', 'do', 'does', 'can', 'will', 'would', 'should', 'is', 'are', 'was', 'were', |
| #614 | 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'my', 'me', 'i', 'you', |
| #615 | 'how', 'what', 'when', 'where', 'which', 'who', 'why', 'many', 'much'} |
| #616 | words = [w for w in re.findall(r'\b[a-zA-Z]{3,}\b', question) if w.lower() not in stop_words] |
| #617 | terms.extend(words[:10]) |
| #618 | |
| #619 | # Deduplicate while preserving order |
| #620 | seen = set() |
| #621 | unique = [] |
| #622 | for t in terms: |
| #623 | if t.lower() not in seen: |
| #624 | seen.add(t.lower()) |
| #625 | unique.append(t) |
| #626 | |
| #627 | return unique |
| #628 | |
| #629 | |
def _multi_strategy_recall(beam: BeamMemory, question: str, top_k: int = DEFAULT_TOP_K, ability: str = None) -> list:
    """Multi-strategy retrieval: keyword, semantic, entity, negation, temporal.

    Runs several recalls against ``beam`` and merges them, de-duplicating on
    the first 80 characters of each memory's content:
      1. the question itself (``top_k * 2`` results);
      2. a negated rewrite of the question, for contradiction detection;
      3. up to five extracted search terms;
      4. for temporal questions, a generic date query plus every month name
         that appears in the question.

    Args:
        beam: Memory backend to query.
        question: The probing question.
        top_k: Number of memories to return.
        ability: Ability code; "TR" and "EO" force temporal handling.

    Returns:
        Up to ``top_k`` memories sorted by ``score`` (descending; a missing
        score counts as 0).
    """
    import re

    question_lc = question.lower()
    all_memories = []
    seen_content_keys = set()

    def _add_unique(mems):
        for mem in mems:
            ck = mem.get("content", "")[:80]
            if ck not in seen_content_keys:
                seen_content_keys.add(ck)
                all_memories.append(mem)

    # Detect temporal questions by ability type or keywords (substring match)
    temporal_keywords = ['when', 'date', 'deadline', 'sprint', 'day', 'week', 'month',
                         'april', 'march', 'february', 'january', 'may', 'june', 'july',
                         'august', 'september', 'october', 'november', 'december',
                         'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                         'how many days', 'how long', 'timeline', 'schedule']
    is_temporal = ability in ('TR', 'EO') or any(w in question_lc for w in temporal_keywords)
    temporal_weight = 0.3 if is_temporal else 0.0

    # Strategy 1: Direct question search (mostly keyword via FTS5)
    _add_unique(_recall_safe(beam, question, top_k * 2, temporal_weight=temporal_weight))

    # Strategy 2: Negation search for contradiction detection
    if any(w in question_lc for w in ["have i", "did i", "do i", "am i", "has"]):
        # Use the first negation word not already present in the question.
        negation = next((n for n in ("never", "did not", "haven't") if n not in question_lc), None)
        negation_query = question
        if negation is not None:
            negation_query = re.sub(r'(?i)(have i|did i|am i)', f'I {negation}', question)
        # The rewrite only covers "have i" / "did i" / "am i". When nothing was
        # substituted the query equals Strategy 1's, so the recall is skipped.
        if negation_query != question:
            _add_unique(_recall_safe(beam, negation_query, top_k, temporal_weight=temporal_weight))

    # Strategy 3: Key entity/term searches
    for term in _extract_search_terms(question)[:5]:
        if len(term) > 2:
            _add_unique(_recall_safe(beam, term, max(5, top_k // 3), temporal_weight=temporal_weight))

    # Strategy 4: Temporal search for date-related questions
    if is_temporal:
        # Stronger temporal boost for date-specific sub-queries
        date_temporal_weight = 0.5
        # Search for dates and timelines
        _add_unique(_recall_safe(beam, "deadline schedule timeline date", top_k, temporal_weight=date_temporal_weight))
        # Search for specific months mentioned in the question
        for month in ('january', 'february', 'march', 'april', 'may', 'june',
                      'july', 'august', 'september', 'october', 'november', 'december'):
            if month in question_lc:
                # max(1, ...) keeps the request non-empty when top_k is 1.
                _add_unique(_recall_safe(beam, month, max(1, top_k // 2), temporal_weight=date_temporal_weight))

    # Sort by score and return top-k
    all_memories.sort(key=lambda x: x.get("score", 0), reverse=True)
    return all_memories[:top_k]
| #685 | |
| #686 | |
| #687 | # ============================================================ |
| #688 | # Per-Ability Bypasses: TR (Temporal Reasoning) + CR (Contradiction) |
| #689 | # ============================================================ |
| #690 | |
def _extract_timeline_from_conversation(messages: list) -> list[dict]:
    """Extract dated events from conversation messages, oldest first.

    Scans every message for "Month Day[, Year]" mentions (full or abbreviated
    month names, optional ordinal suffix on the day), skips mentions that
    appear to sit inside a code snippet, and returns one entry per mention.

    Args:
        messages: Conversation messages; each is a dict whose "content" value
            is the message text. Missing, empty, ``None`` or non-string
            content is skipped.

    Returns:
        Chronologically sorted, de-duplicated list of dicts with keys
        ``date_obj`` (naive ``datetime``), ``date_str`` (the matched text),
        ``event_text`` (up to 60 characters of context on either side of the
        match) and ``msg_index`` (position of the message in ``messages``).
        Dates written without a year get the most frequently mentioned year
        in 2020-2030, or 2024 when the conversation mentions none.
    """
    import re as _re
    from collections import Counter as _Counter
    from datetime import datetime as _dt

    # Keyed by the first three letters of the month, which is how matches are looked up.
    MONTH_MAP = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }

    # Code indicators to filter out
    CODE_INDICATORS = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'CREATE TABLE',
                       'def ', 'import ', 'print(', 'return ', '```', 'function(',
                       'jsonify', 'datetime', 'params', 'cursor.']

    iso_date_re = _re.compile(r'\b\d{4}-\d{2}-\d{2}\b')
    year_re = _re.compile(r'\b(20\d{2})\b')
    # One pattern with an optional year replaces the former with-year /
    # without-year pair. The word boundaries around the month stop matches
    # inside ordinary words ("summary 5", "maybe 3"), and the (?!\d) guards
    # stop a longer number from being cut into a day ("March 2024" -> day 20).
    date_re = _re.compile(
        r'\b(?P<month>Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|'
        r'Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b\.?\s+'
        r'(?P<day>\d{1,2})(?:st|nd|rd|th)?(?!\d)'
        r'(?:[,\s]+(?P<year>\d{4})(?!\d))?',
        _re.IGNORECASE,
    )

    def _text_of(msg) -> str:
        """Return the message text, or "" when it is absent or not a string."""
        text = msg.get("content")
        return text if isinstance(text, str) else ""

    def _is_code_context(text: str, match_start: int) -> bool:
        """Check if a date match appears to be in a code snippet."""
        # Inspect 100 characters on each side of the match start.
        window = text[max(0, match_start - 100):min(len(text), match_start + 100)]
        # Any backtick nearby is treated as inline or fenced code.
        if '`' in window:
            return True
        hits = sum(1 for ci in CODE_INDICATORS if ci in window)
        if hits >= 2:
            return True
        # A single indicator is only enough when an ISO date (2024-01-15) is also nearby.
        return hits >= 1 and iso_date_re.search(window) is not None

    # Pick the default year for year-less dates from the years the conversation mentions.
    year_counts = _Counter()
    for msg in messages:
        for found in year_re.findall(_text_of(msg)):
            if 2020 <= int(found) <= 2030:
                year_counts[int(found)] += 1
    default_year = year_counts.most_common(1)[0][0] if year_counts else 2024

    timeline = []
    for i, msg in enumerate(messages):
        content = _text_of(msg)
        if not content:
            continue
        for m in date_re.finditer(content):
            if _is_code_context(content, m.start()):
                continue
            month_num = MONTH_MAP[m.group('month').lower()[:3]]
            year = int(m.group('year')) if m.group('year') else default_year
            try:
                dt = _dt(year, month_num, int(m.group('day')))
            except ValueError:
                continue  # not a real calendar date (e.g. "Feb 31")
            start = max(0, m.start() - 60)
            end = min(len(content), m.end() + 60)
            timeline.append({
                'date_obj': dt, 'date_str': m.group(0),
                'event_text': content[start:end].strip(), 'msg_index': i,
            })

    # Sort chronologically and deduplicate (same matched text, same leading context)
    timeline.sort(key=lambda x: (x['date_obj'], x['event_text']))
    seen = set()
    deduped = []
    for t in timeline:
        key = (t['date_str'], t['event_text'][:40])
        if key not in seen:
            seen.add(key)
            deduped.append(t)

    return deduped
| #808 | |
| #809 | |
def _build_tr_timeline_prompt(timeline: list[dict]) -> str:
    """Render extracted timeline entries as a prompt section for TR questions.

    Each entry becomes one line showing its ISO date, the date text as
    matched and the first 100 characters of its surrounding event text.
    An empty timeline yields an empty string.
    """
    if not timeline:
        return ""

    header = "CRITICAL TIMELINE (all dates extracted from the conversation, use ONLY these dates):"
    rows = [
        f" [{entry['date_obj'].strftime('%Y-%m-%d')}] {entry['date_str']}: ...{entry['event_text'][:100]}..."
        for entry in timeline
    ]
    return "\n".join([header, *rows])
| #820 | |
| #821 | |
| #822 | def _compute_tr_python(question: str, timeline: list[dict]) -> str | None: |
| #823 | """Compute TR answer in pure Python (date math, no LLM). Returns answer string or None.""" |
| #824 | import re as _re |
| #825 | from datetime import timedelta as _td |
| #826 | |
| #827 | q_lower = question.lower() |
| #828 | |
| #829 | # Detect question type: "how many days between X and Y" |
| #830 | # Extract event keywords from question |
| #831 | event_keywords = [] |
| #832 | # Look for "end of first sprint", "start of second sprint" type phrases |
| #833 | phrases = _re.findall(r'(?:end|start|beginning|completion|finish|launch|release|deploy|merge|push|commit|sprint|milestone|phase|wave|beta|alpha|MVP|demo|presentation|meeting|call|review|audit|test|benchmark)[a-z]*\s+(?:of\s+)?(?:the\s+)?(?:my\s+)?(?:first|second|third|\d+(?:st|nd|rd|th)?)?\s*[a-z]+(?:\s+[a-z]+){0,3}', q_lower) |
| #834 | event_keywords.extend(phrases) |
| #835 | |
| #836 | # Also try simpler: extract noun phrases from question |
| #837 | q_words = q_lower.replace('?', '').split() |
| #838 | |
| #839 | # Score each timeline entry against the question |
| #840 | scored = [] |
| #841 | for t in timeline: |
| #842 | event = t['event_text'].lower() |
| #843 | score = 0 |
| #844 | # Direct substring match bonus |
| #845 | for phrase in event_keywords: |
| #846 | if phrase in event or any(w in event for w in phrase.split() if len(w) > 3): |
| #847 | score += 3 |
| #848 | # Word overlap |
| #849 | for w in q_words: |
| #850 | if len(w) > 3 and w in event: |
| #851 | score += 1 |
| #852 | # Date proximity bonus (prefer dates with event context) |
| #853 | if len(t['event_text']) > 20: |
| #854 | score += 2 |
| #855 | scored.append((score, t)) |
| #856 | |
| #857 | scored.sort(key=lambda x: x[0], reverse=True) |
| #858 | |
| #859 | # Try to find two distinct events |
| #860 | if len(scored) >= 2 and scored[0][0] > 0 and scored[1][0] > 0: |
| #861 | t1 = scored[0][1] |
| #862 | t2 = scored[1][1] |
| #863 | d1 = t1['date_obj'] |
| #864 | d2 = t2['date_obj'] |
| #865 | diff = abs((d2 - d1).days) |
| #866 | |
| #867 | # Determine which is earlier/later based on question |
| #868 | if 'between' in q_lower: |
| #869 | evt_a = t1['date_str'] if d1 <= d2 else t2['date_str'] |
| #870 | evt_b = t2['date_str'] if d1 <= d2 else t1['date_str'] |
| #871 | d_a = d1 if d1 <= d2 else d2 |
| #872 | d_b = d2 if d1 <= d2 else d1 |
| #873 | else: |
| #874 | evt_a, evt_b = t1['date_str'], t2['date_str'] |
| #875 | d_a, d_b = d1, d2 |
| #876 | diff = abs((d_b - d_a).days) |
| #877 | |
| #878 | answer = ( |
| #879 | f"Between {evt_a} ({d_a.strftime('%B %d, %Y')}) and " |
| #880 | f"{evt_b} ({d_b.strftime('%B %d, %Y')}), " |
| #881 | f"there are {diff} days." |
| #882 | ) |
| #883 | return answer |
| #884 | |
| #885 | # Fallback: just take the two most relevant dates by word overlap |
| #886 | if len(scored) >= 2: |
| #887 | best = [t for s, t in scored if s > 0][:2] |
| #888 | if len(best) >= 2: |
| #889 | d1, d2 = best[0]['date_obj'], best[1]['date_obj'] |
| #890 | diff = abs((d2 - d1).days) |
| #891 | return ( |
| #892 | f"Based on the conversation timeline, the time between " |
| #893 | f"{best[0]['date_str']} and {best[1]['date_str']} is {diff} days." |
| #894 | ) |
| #895 | |
| #896 | return None # Can't compute, let LLM handle it |
| #897 | |
| #898 | |
| #899 | |
| #900 | def _compute_tr_answer(question: str, timeline: list[dict]) -> str | None: |
| #901 | """Compute temporal reasoning answer from conversation dates. Returns None if can't compute.""" |
| #902 | if not timeline or len(timeline) < 2: |
| #903 | return None |
| #904 | |
| #905 | # Build a prompt that presents the timeline and asks the LLM to compute |
| #906 | # This is more robust than trying to match events ourselves |
| #907 | timeline_prompt = _build_tr_timeline_prompt(timeline) |
| #908 | |
| #909 | # Build the full prompt that we'll return - the caller will send this to LLM |
| #910 | prompt = ( |
| #911 | f"{timeline_prompt}\n\n" |
| #912 | f"QUESTION: {question}\n\n" |
| #913 | f"INSTRUCTIONS:\n" |
| #914 | f"1. Identify the two events mentioned in the question\n" |
| #915 | f"2. Find the corresponding dates in the timeline above\n" |
| #916 | f"3. Compute the time difference between them\n" |
| #917 | f"4. State the answer clearly with both dates and the computed difference\n\n" |
| #918 | f"ANSWER:" |
| #919 | ) |
| #920 | return prompt # Return the prompt, not the answer - caller passes to LLM |
| #921 | |
| #922 | |
| #923 | def _detect_contradictions(messages: list, question: str) -> str | None: |
| #924 | """Scan conversation for contradictory statements about the question topic. |
| #925 | Returns contradiction context string to inject into prompt, or None if none found.""" |
| #926 | import re as _re |
| #927 | |
| #928 | # Extract the key topic from the question |
| #929 | # "Have I worked with Flask routes?" -> key terms: "flask routes", "http requests" |
| #930 | # "Have I integrated Flask-Login?" -> key terms: "flask-login", "session management" |
| #931 | |
| #932 | # Strip question words to get the core topic |
| #933 | q_clean = _re.sub(r'^(?:Have I|Did I|Do I|Am I|Has)\s+(?:ever\s+)?', '', question, flags=_re.IGNORECASE) |
| #934 | q_clean = _re.sub(r'\s+(?:in this project|across my sessions|in my project)\s*\??$', '', q_clean, flags=_re.IGNORECASE) |
| #935 | q_clean = q_clean.strip().rstrip('?').strip() |
| #936 | |
| #937 | # Extract meaningful noun phrases |
| #938 | words = _re.findall(r'\b[a-zA-Z][a-zA-Z\-]+\b', q_clean) |
| #939 | # Filter to key content words (nouns, tech terms) |
| #940 | key_terms = [] |
| #941 | for w in words: |
| #942 | wl = w.lower() |
| #943 | if len(wl) > 2 and wl not in ('the', 'and', 'for', 'with', 'any', 'this', 'that', 'have', 'has', 'been'): |
| #944 | key_terms.append(wl) |
| #945 | |
| #946 | if not key_terms: |
| #947 | return None |
| #948 | |
| #949 | # Scan all messages for mentions of ANY key term |
| #950 | affirmatives = [] |
| #951 | negatives = [] |
| #952 | |
| #953 | NEGATION_WORDS = {'never', 'not', "n't", 'no', 'without', 'cannot', "can't", 'nothing', 'none'} |
| #954 | |
| #955 | for i, msg in enumerate(messages): |
| #956 | content = msg.get("content", "") |
| #957 | content_lower = content.lower() |
| #958 | |
| #959 | # Check if message mentions any key term |
| #960 | matched_terms = [t for t in key_terms if t in content_lower] |
| #961 | if not matched_terms: |
| #962 | continue |
| #963 | |
| #964 | # Check for negation in the SENTENCE containing the matched term |
| #965 | # (BEAM contradictions embed negation near the topic mention) |
| #966 | has_negation = False |
| #967 | for term in matched_terms: |
| #968 | # Find the sentence containing this term |
| #969 | term_pos = content_lower.find(term) |
| #970 | if term_pos < 0: |
| #971 | continue |
| #972 | # Extract sentence context (200 chars around term, or to sentence boundaries) |
| #973 | start = max(0, term_pos - 150) |
| #974 | end = min(len(content_lower), term_pos + 150) |
| #975 | sentence = content_lower[start:end] |
| #976 | # Check for negation words in this sentence |
| #977 | for nw in NEGATION_WORDS: |
| #978 | if nw in sentence: |
| #979 | has_negation = True |
| #980 | break |
| #981 | if has_negation: |
| #982 | break |
| #983 | |
| #984 | snippet = content[:250].strip() |
| #985 | if has_negation: |
| #986 | negatives.append(f"[Msg {i}] {content[:250].strip()}") |
| #987 | else: |
| #988 | affirmatives.append(f"[Msg {i}] {content[:250].strip()}") |
| #989 | |
| #990 | if affirmatives and negatives: |
| #991 | ctx = "CRITICAL: CONTRADICTORY INFORMATION DETECTED\n\n" |
| #992 | ctx += "The conversation contains BOTH affirmative AND negative statements about this topic:\n\n" |
| #993 | ctx += "Statements suggesting this WAS done or worked on:\n" |
| #994 | for a in affirmatives[:5]: |
| #995 | ctx += f" - {a}\n" |
| #996 | ctx += "\nStatements suggesting this was NOT done:\n" |
| #997 | for n in negatives[:5]: |
| #998 | ctx += f" - {n}\n" |
| #999 | ctx += "\nYOU MUST explicitly identify the contradiction and present BOTH sides. " |
| #1000 | ctx += "Do NOT answer with just one side. The correct response begins with " |
| #1001 | ctx += "'I notice you've mentioned contradictory information about this.'" |
| #1002 | return ctx |
| #1003 | |
| #1004 | return None |
| #1005 | |
| #1006 | |
def _match_context_fact(beam, question: str):
    """Return the stored fact value whose context phrase best overlaps the question, or None.

    Reads ``beam._context_facts`` (a mapping of context phrase -> list of
    values). A phrase qualifies when it shares at least two words with the
    question after stop-word removal; candidates are ranked by the share of
    the phrase's words that overlap, and the first value of the best phrase
    is returned.
    """
    facts = getattr(beam, '_context_facts', None)
    if not facts:
        return None

    q_stop = {'when', 'does', 'do', 'did', 'what', 'how', 'where', 'which', 'who', 'why',
              'is', 'are', 'was', 'were', 'can', 'will', 'would', 'should', 'could', 'may',
              'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'my', 'me', 'i', 'you'}
    q_set = {w.lower() for w in question.split() if w.lower() not in q_stop and len(w) > 1}

    best_match = None
    best_score = 0
    for context_phrase, values in facts.items():
        c_words = set(context_phrase.split())
        overlap = q_set & c_words
        if len(overlap) < 2:
            continue
        score = len(overlap) / max(len(c_words), 1)
        if score > best_score:
            best_score = score
            best_match = values[0]
    return best_match or None


def answer_with_memory(llm: LLMClient, beam: BeamMemory, question: str,
                       conversation_messages: list = None, top_k: int = DEFAULT_TOP_K,
                       ability: str = None) -> str:
    """Answer a probing question, choosing a context strategy per ability and mode.

    Order of strategies:
      1. TR: extract a date timeline from the conversation and have the LLM
         compute the answer from it (returns immediately when a prompt
         could be built).
      2. CR: detect contradictory statements; the resulting context block is
         prepended to whichever prompt is built below.
      3. FULL_CONTEXT_MODE env var set to 1/true/yes: for IE/KU try a direct
         context->value fact match, otherwise send the conversation (capped
         at twice MAX_MEMORY_CONTEXT_CHARS) to the LLM.
      4. Default: multi-strategy retrieval from ``beam``, optional fact
         recall, fact-density reranking, then an LLM answer over recent
         messages plus retrieved memories. For IE/KU a direct fact match,
         when found, is returned instead of calling the LLM.

    Args:
        llm: Client whose ``chat(messages, temperature=..., max_tokens=...)``
            produces the answer.
        beam: Memory backend queried for retrieval.
        question: The probing question.
        conversation_messages: Raw conversation messages (dicts with "role"
            and "content"), or None.
        top_k: Base number of memories to retrieve (3x is fetched for reranking).
        ability: BEAM ability abbreviation (e.g. 'TR', 'CR', 'IE', 'KU'), or None.

    Returns:
        The answer text. May be ``None`` when ``llm.chat`` returns ``None``;
        the evaluation loop in this file treats that as an LLM error.
    """
    fact_abilities = {'IE', 'KU'}

    # ---- PER-ABILITY BYPASSES (zero-LLM or augmented) ----

    # TR (Temporal Reasoning): compute answer from extracted dates
    if ability == 'TR' and conversation_messages:
        timeline = _extract_timeline_from_conversation(conversation_messages)
        print(f" [TR-bypass] extracted {len(timeline)} dates from {len(conversation_messages)} msgs")
        if timeline and len(timeline) >= 2:
            tr_prompt = _compute_tr_answer(question, timeline)
            if tr_prompt:
                messages = [
                    {"role": "system", "content": "You are a precise date calculator. Use ONLY the dates from the provided timeline. Output ONLY the answer, no explanation."},
                    {"role": "user", "content": tr_prompt},
                ]
                answer = llm.chat(messages, temperature=0.0, max_tokens=4096)
                # chat() may yield None on timeout/error; only slice real text for the log line.
                preview = answer[:150] if isinstance(answer, str) else answer
                print(f" [TR-bypass] LLM answer: {preview}")
                return answer
            print(" [TR-bypass] _compute_tr_answer returned None")
        else:
            print(" [TR-bypass] no timeline extracted or too few dates")

    # CR (Contradiction Resolution): detect contradictory statements
    _cr_context = None
    if ability == 'CR' and conversation_messages:
        _cr_context = _detect_contradictions(conversation_messages, question)
        if _cr_context:
            print(f" [CR-detect] FOUND contradictions, injecting context ({len(_cr_context)} chars)")
        else:
            print(" [CR-detect] no contradictions found")
    # ---- END PER-ABILITY BYPASSES ----

    # Prefix shared by both prompt paths below (empty unless a contradiction was detected).
    cr_prefix = f"\n\n{_cr_context}\n\n" if _cr_context else ""

    # FULL-CONTEXT MODE: send the entire conversation to the LLM, bypassing Mnemosyne retrieval.
    # This tests the LLM's reading comprehension ceiling — useful for establishing the upper bound.
    # Controlled by FULL_CONTEXT_MODE env var.
    _full_context = os.environ.get("FULL_CONTEXT_MODE", "").lower() in ("1", "true", "yes")
    # DEBUG
    if os.environ.get("FULL_CONTEXT_MODE"):
        print(f" [DEBUG full-context] env={_full_context}, msgs={bool(conversation_messages)}, count={len(conversation_messages) if conversation_messages else 0}")
    if _full_context and conversation_messages:
        # ---- Phase 1: context→value matching, factual abilities (IE, KU) only ----
        # MR (Multi-hop) requires reasoning across multiple messages; let full-context handle it.
        if ability in fact_abilities:
            direct_answer = _match_context_fact(beam, question)
            if direct_answer:
                return direct_answer  # Direct fact answer, zero LLM cost

        # ---- Phase 2: Full-context LLM fallback ----
        full_parts = []
        total_chars = 0
        for msg in conversation_messages:
            role = msg.get("role", "unknown")
            content = msg.get("content") or ""
            if content.strip():
                line = f"[{role}]: {content}"
                if total_chars + len(line) > MAX_MEMORY_CONTEXT_CHARS * 2:
                    break
                full_parts.append(line)
                total_chars += len(line)

        context = "FULL CONVERSATION:\n" + "\n".join(full_parts)
        messages = [
            {"role": "system", "content": ANSWER_SYSTEM_PROMPT},
            {"role": "user", "content": f"{cr_prefix}{context}\n\nQUESTION: {question}\n\nANSWER:"},
        ]
        return llm.chat(messages, temperature=0.1, max_tokens=2048)

    # ALWAYS use multi-strategy retrieval to test Mnemosyne's recall quality.
    # This benchmark exists to measure MEMORY performance, not LLM reading comprehension.
    memories = _multi_strategy_recall(beam, question, top_k * 3, ability=ability)  # Get 3x more for reranking

    # ---- Context→Value fact matching (direct regex-extracted facts, zero-LLM) ----
    # Only for Information Extraction (IE) and Knowledge Understanding (KU);
    # MR/CR/TR/EO/SUM/ABS need LLM reasoning.
    context_answer = _match_context_fact(beam, question) if ability in fact_abilities else None

    # If cloud extraction enabled, also search the facts table
    if getattr(beam, 'use_cloud', False):
        try:
            fact_memories = beam.fact_recall(question, top_k=top_k)
            # Convert fact dicts to same format as recall results
            for f in fact_memories:
                memories.append({
                    "content": f"FACT: {f['content']}",
                    "score": f.get("score", 0.5) * 2.0,  # 2x weight for facts
                    "source": "fact_extraction",
                })
            # Re-sort by score
            memories.sort(key=lambda x: x.get("score", 0), reverse=True)
        except Exception as exc:
            # Fact recall is best-effort: report the failure and continue without facts.
            print(f" [fact-recall] skipped: {exc!r}")

    # ---- Fact-density reranking (algorithmic, zero-LLM) ----
    # BEAM distractors are generic dev-talk; answer messages carry specific data.
    # Boost messages with dates, numbers, proper nouns, versions, technical terms.
    import re as _re_facts
    _FACT_PATTERNS = [
        (r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(?:[, ]*\d{4})?\b', 2.0),  # dates
        (r'\b\d{4}-\d{2}-\d{2}\b', 2.5),  # ISO dates
        (r'\b\d+[.,]?\d*\s*(?:ms|sec|mins?|hours?|days?|weeks?|months?|years?|%|KB|MB|GB|TB|rows?|columns?|roles?|features?|bugs?|commits?|cards?|users?|items?|tests?|APIs?|endpoints?|sprints?|tickets?)\b', 1.5),  # numbers with units
        (r'\bv?\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?\b', 1.5),  # version strings
        (r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4}\b', 1.0),  # proper noun phrases
        (r'\b[A-Z]{2,8}\b', 0.8),  # acronyms
    ]
    for mem in memories:
        content = mem.get("content") or ""
        fact_score = sum(
            len(_re_facts.findall(pattern, content)) * weight
            for pattern, weight in _FACT_PATTERNS
        )
        # Normalize by content length to get fact density
        density = fact_score / max(len(content.split()), 1)
        mem["fact_density"] = round(density, 4)
        # Boost score: blend original with fact density (40% fact boost)
        orig = mem.get("score", mem.get("relevance", 0))
        mem["score"] = orig * 0.6 + min(density * 5.0, 1.0) * 0.4

    # Re-sort by boosted score
    memories.sort(key=lambda m: m.get("score", 0), reverse=True)

    # Build recent context from last N messages
    recent_parts = []
    if conversation_messages:
        for msg in conversation_messages[-RECENT_CONTEXT_COUNT:]:
            role = msg.get("role", "unknown")
            content = msg.get("content") or ""
            if content.strip():
                recent_parts.append(f"[{role}]: {content[:300]}")

    # Build retrieved memory context (deduplicated, relevance-sorted)
    seen_content = set()
    memory_parts = []
    total_chars = 0
    for mem in memories:
        content = mem.get("content") or ""
        # Deduplicate on the first 100 characters
        content_key = content[:100]
        if content_key in seen_content:
            continue
        seen_content.add(content_key)

        score = mem.get("score", mem.get("relevance", 0))
        if isinstance(score, (int, float)) and score < 0.05:
            continue  # Skip very low relevance

        if total_chars + len(content) > MAX_MEMORY_CONTEXT_CHARS:
            # Budget exhausted: keep a truncated tail only if it is still meaningful.
            remaining = MAX_MEMORY_CONTEXT_CHARS - total_chars
            if remaining > 100:
                memory_parts.append(f"[Memory] {content[:remaining]}...")
            break
        memory_parts.append(f"[Memory] {content}")
        total_chars += len(content)

    context_blocks = []
    if recent_parts:
        context_blocks.append("RECENT CONVERSATION:\n" + "\n".join(recent_parts))
    if memory_parts:
        context_blocks.append("RETRIEVED MEMORIES:\n" + "\n\n".join(memory_parts))
    context = "\n\n".join(context_blocks) if context_blocks else "[No memories found]"

    # If we found a direct context→value match, return it immediately (zero LLM cost)
    if context_answer:
        return context_answer

    messages = [
        {"role": "system", "content": ANSWER_SYSTEM_PROMPT},
        {"role": "user", "content": f"{cr_prefix}{context}\n\nQUESTION: {question}\n\nANSWER:"},
    ]

    return llm.chat(messages, temperature=0.1, max_tokens=2048)
| #1253 | |
| #1254 | |
| #1255 | # ============================================================ |
| #1256 | # LLM-as-Judge: Nugget-Based Scoring (BEAM Protocol) |
| #1257 | # ============================================================ |
| #1258 | |
# System prompt for the rubric-based judge: one score per rubric item plus an overall average.
JUDGE_SYSTEM_PROMPT = """You are an expert evaluator for a memory benchmark.
You will be given:
1. A question about a conversation
2. A list of RUBRIC ITEMS (expected facts the AI should mention)
3. The AI's ANSWER

For EACH rubric item, check if the AI's answer contains equivalent information:
- Score 1.0: correct info present, substantially matches the rubric item
- Score 0.5: partially correct, some key detail missing or slightly wrong
- Score 0.0: missing or wrong

Return ONLY this JSON:
{"scores":[1.0,0.5,0.0],"overall_score":0.X}

Where scores[i] corresponds to rubric[i], and overall_score is the average."""


def _coerce_judge_score(value):
    """Return ``value`` as a finite float, or None when it is not a usable number."""
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def judge_with_rubrics(llm: LLMClient, question: str, rubric: list, ai_answer: str) -> dict:
    """Judge an AI answer against pre-written BEAM rubric items.

    Args:
        llm: Judge client; ``chat(messages, temperature=..., max_tokens=...)``
            returns the reply text or ``None``.
        question: The probing question.
        rubric: Expected facts, one string per rubric item.
        ai_answer: The answer under evaluation.

    Returns:
        A dict that always carries a float ``overall_score``:
          * no rubric -> empty ``scores``, score 0.0, assessment "no rubric available";
          * judge returned ``None`` -> all-zero ``scores`` and score 0.0;
          * parseable reply -> the judge's JSON object, with ``overall_score``
            coerced to float, or averaged from ``scores`` when the judge
            omitted it or gave a non-numeric value;
          * anything else -> token-overlap fallback score plus ``raw_response``.
    """
    if not rubric:
        return {"scores": [], "overall_score": 0.0, "assessment": "no rubric available"}

    rubric_text = "\n".join(f"{i+1}. {item}" for i, item in enumerate(rubric))

    user_prompt = f"""QUESTION: {question}

RUBRIC ITEMS:
{rubric_text}

AI's ANSWER: {ai_answer}

For each rubric item, score how well the AI's answer matches. Return JSON with scores array and overall_score (average)."""

    messages = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    response = llm.chat(messages, temperature=0.0, max_tokens=500)

    if response is None:
        return {
            "scores": [0.0] * len(rubric),
            "overall_score": 0.0,
            "assessment": "LLM judge returned None (timeout or error)",
        }

    # The reply may wrap the JSON in prose; parse from the first "{" to the last "}".
    result = None
    json_start = response.find("{")
    json_end = response.rfind("}") + 1
    if 0 <= json_start < json_end:
        try:
            result = json.loads(response[json_start:json_end])
        except ValueError:  # includes json.JSONDecodeError
            result = None

    if isinstance(result, dict):
        # Callers format the score numerically, so hand back a real float.
        overall = _coerce_judge_score(result.get("overall_score"))
        if overall is None:
            # The judge left the average out (or garbled it): derive it from the item scores.
            raw_scores = result.get("scores")
            if isinstance(raw_scores, list):
                usable = [s for s in map(_coerce_judge_score, raw_scores) if s is not None]
                if usable:
                    overall = sum(usable) / len(usable)
        if overall is not None:
            result["overall_score"] = overall
            return result

    # Fallback: basic text matching
    return {
        "scores": [0.0] * len(rubric),
        "overall_score": basic_text_similarity(ai_answer, " ".join(rubric)),
        "assessment": "JSON parse failed; using fallback",
        "raw_response": response,
    }
| #1324 | |
| #1325 | |
def basic_text_similarity(text1: str, text2: str) -> float:
    """Jaccard overlap of lower-cased, whitespace-split tokens.

    Used as a fallback score when the LLM judge fails. Returns 0.0 when
    either text has no tokens.
    """
    tokens_a, tokens_b = (set(text.lower().split()) for text in (text1, text2))
    if not (tokens_a and tokens_b):
        return 0.0
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)
| #1335 | |
| #1336 | |
| #1337 | # ============================================================ |
| #1338 | # Evaluation Runner |
| #1339 | # ============================================================ |
| #1340 | |
def evaluate_conversation(
    llm: LLMClient,
    judge_llm: LLMClient,
    beam: BeamMemory,
    conversation: dict,
    resume_ids: set = None,
    inter_question_delay: float = 20.0,
) -> dict:
    """Evaluate all probing questions for one conversation.

    For each of the first BENCHMARK_QUERIES_PER_CONV questions: answer it via
    ``answer_with_memory``, score the answer via ``judge_with_rubrics``, and
    record the outcome. Questions whose id is in ``resume_ids`` or that lack
    a question text or ideal answer are skipped.

    Args:
        llm: Client used to answer questions.
        judge_llm: Client used to judge answers.
        beam: Memory backend holding the ingested conversation.
        conversation: Dict with "id", "scale", "questions" and optionally "messages".
        resume_ids: Question ids ("<conv_id>:q<index>") already evaluated.
        inter_question_delay: Seconds to pause after each evaluated question
            to stay under provider rate limits; 0 or less disables the pause.

    Returns:
        Dict with ``conversation_id``, ``scale``, ``num_questions``,
        ``num_evaluated`` and the per-question ``results`` list.
    """
    conv_id = conversation["id"]
    questions = conversation["questions"][:BENCHMARK_QUERIES_PER_CONV]
    results = []

    print(f" Conversation {conv_id}: {len(questions)} questions")

    for i, q in enumerate(questions):
        qid = f"{conv_id}:q{i}"
        if resume_ids and qid in resume_ids:
            continue

        # .get(): an entry missing either field is skipped below instead of raising KeyError.
        question = q.get("question")
        ideal = q.get("ideal_answer")
        rubric = q.get("rubric", [])
        # Map dataset ability names to BEAM abbreviations
        raw_ability = q.get("ability", "unknown")
        ability = ABILITY_MAP.get(raw_ability, raw_ability)

        if not question or not ideal:
            continue

        # Step 1: LLM answers using Mnemosyne memories + conversation context
        t0 = time.perf_counter()
        ai_answer = answer_with_memory(llm, beam, question,
                                       conversation_messages=conversation.get("messages", []),
                                       ability=ability)
        answer_time = time.perf_counter() - t0

        # Handle None answer (LLM timeout/error)
        if ai_answer is None:
            ai_answer = "[LLM_ERROR: No response from answering model]"

        # Step 2: LLM-as-judge scores the answer
        t0 = time.perf_counter()
        judgment = judge_with_rubrics(judge_llm, question, rubric, ai_answer)
        judge_time = time.perf_counter() - t0

        # The judge's JSON is model-generated; make sure the score is numeric
        # before it is formatted and aggregated.
        try:
            score = float(judgment.get("overall_score", 0.0))
        except (TypeError, ValueError):
            score = 0.0

        result = {
            "qid": qid,
            "ability": ability,
            "question": question[:200],
            "ideal_answer": ideal[:200],
            "ai_answer": ai_answer[:500],
            "score": score,
            "nuggets": judgment.get("nuggets", []),
            # Per-rubric-item scores as returned by the judge.
            "rubric_scores": judgment.get("scores", []),
            # The judge in this file reports under "assessment"; "brief_assessment" is still honored.
            "assessment": judgment.get("brief_assessment") or judgment.get("assessment", ""),
            "answer_time_ms": answer_time * 1000,
            "judge_time_ms": judge_time * 1000,
        }
        results.append(result)

        print(f" [{ability}] score={score:.2f} ans={answer_time*1000:.0f}ms judge={judge_time*1000:.0f}ms "
              f"Q: {question[:60]}...")

        # Rate-limit avoidance: long pause between questions (default 20s to avoid provider burst limits)
        if inter_question_delay > 0:
            time.sleep(inter_question_delay)

    return {
        "conversation_id": conv_id,
        "scale": conversation["scale"],
        "num_questions": len(questions),
        "num_evaluated": len(results),
        "results": results,
    }
| #1415 | |
| #1416 | |
def compute_ability_scores(all_results: list[dict]) -> dict:
    """Summarize question scores per scale and per ability.

    Args:
        all_results: Conversation-level records, each with a "scale" key and
            a "results" list of per-question dicts. A question's "ability"
            defaults to "unknown" and its "score" to 0.0 when absent.

    Returns:
        A dict mapping each scale that has at least one question result to
        ``{ability: {"avg_score": mean, "count": n}}``, plus an "OVERALL"
        entry pooling every score recorded for that scale.
    """

    def _stats(values):
        # Mean and sample count; an empty list yields a mean of 0.0.
        mean = sum(values) / len(values) if values else 0.0
        return {"avg_score": mean, "count": len(values)}

    # scale -> ability -> list of raw scores, in encounter order.
    grouped = {}
    for conversation in all_results:
        scale_label = conversation["scale"]
        for item in conversation["results"]:
            per_ability = grouped.setdefault(scale_label, {})
            ability_name = item.get("ability", "unknown")
            per_ability.setdefault(ability_name, []).append(item.get("score", 0.0))

    report = {}
    for scale_label, per_ability in grouped.items():
        entry = {name: _stats(values) for name, values in per_ability.items()}
        # The overall figure is question-weighted: every score counts once.
        pooled = [value for values in per_ability.values() for value in values]
        entry["OVERALL"] = _stats(pooled)
        report[scale_label] = entry

    return report
| #1451 | |
| #1452 | |
| #1453 | # ============================================================ |
| #1454 | # SOTA Comparison |
| #1455 | # ============================================================ |
| #1456 | |
# Published overall scores (in percent) of reference systems, keyed by BEAM
# scale and then by system name. The report footer attributes these figures
# to the Hindsight blog and BEAM paper Table 3.
PUBLISHED_SOTA = {
    scale: dict(
        zip(
            ("Hindsight", "Honcho", "LIGHT (Llama-4)", "RAG (Llama-4)"),
            percentages,
        )
    )
    for scale, percentages in (
        # Column order matches the system names above.
        ("10M", (64.1, 40.6, 26.6, 24.9)),
        ("1M", (73.9, 63.1, 33.6, 30.7)),
        ("500K", (71.1, 64.9, 35.9, 33.0)),
        ("100K", (73.4, 63.0, 35.8, 32.3)),
    )
}
| #1483 | |
| #1484 | |
def _scale_sort_key(scale):
    """Order scale labels such as "100K" or "10M" by size; unparseable labels sort last."""
    label = str(scale).strip().upper()
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    try:
        if label[-1:] in multipliers:
            return (0, float(label[:-1]) * multipliers[label[-1]], label)
        return (0, float(label), label)
    except ValueError:
        return (1, 0.0, label)


def print_sota_report(ability_summary: dict, metadata: dict):
    """Print the per-ability score table and the published-SOTA comparison.

    Args:
        ability_summary: Mapping of scale -> ability -> {"avg_score", "count"}
            including an "OVERALL" entry per scale; avg_score is a fraction
            and is printed multiplied by 100.
        metadata: Run information. Reads "model", "judge_model" (falls back
            to "model") and "sample_size".

    Also reads the module-level DEFAULT_TOP_K, BEAM_ABILITIES and
    PUBLISHED_SOTA. Output goes to stdout; nothing is returned.
    """
    systems = ("Hindsight", "Honcho", "LIGHT (Llama-4)", "RAG (Llama-4)")
    # Sort by size (100K < 500K < 1M < 10M); plain string sorting would
    # yield 100K, 10M, 1M, 500K.
    scales = sorted(ability_summary, key=_scale_sort_key)
    model = metadata.get("model", "unknown")
    judge_model = metadata.get("judge_model", model)
    rule = "=" * 80

    print(f"\n{rule}")
    print(" MNEMOSYNE BEAM END-TO-END SOTA REPORT")
    print(f" Date: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}")
    print(f" Model: {model}")
    print(f" Conversations/scale: {metadata.get('sample_size', 'N/A')}")
    print(f" Top-K memories: {DEFAULT_TOP_K}")
    print(" Methodology: LLM answering + LLM-as-judge (nugget scoring, per BEAM protocol)")
    print(rule)

    print("\n Per-Ability Scores:")
    print(f" {'Scale':<8} {'OVERALL':>8}", end="")
    for ab in BEAM_ABILITIES:
        print(f" {ab:>6}", end="")
    print()

    for scale in scales:
        scores = ability_summary[scale]
        overall = scores.get("OVERALL", {}).get("avg_score", 0.0)
        print(f" {scale:<8} {overall*100:>7.1f}%", end="")
        for ab in BEAM_ABILITIES:
            avg = scores.get(ab, {}).get("avg_score")
            if avg is None:
                # No questions of this ability were evaluated; a literal
                # 0.0% would read as a genuine zero score.
                print(f" {'n/a':>6}", end="")
            else:
                print(f" {avg*100:>5.1f}%", end="")
        print()

    print("\n SOTA Comparison (OVERALL):")
    print(f" {'Scale':<8} {'Mnemosyne':>12}", end="")
    for system in systems:
        print(f" {system:>18}", end="")
    print()

    for scale in scales:
        our_score = ability_summary[scale].get("OVERALL", {}).get("avg_score", 0.0) * 100
        sota = PUBLISHED_SOTA.get(scale, {})
        print(f" {scale:<8} {our_score:>11.1f}%", end="")
        for system in systems:
            published = sota.get(system)
            if published is None:
                # No published figure for this scale/system pair.
                print(f" {'n/a':>18}", end="")
            else:
                print(f" {published:>17.1f}%", end="")
        print()

    print("\n Note: Published SOTA numbers from Hindsight blog (Apr 2026) and BEAM paper Table 3.")
    # Report the models actually used; both are configurable on the CLI.
    print(f" Mnemosyne answering LLM: {model}; judging LLM: {judge_model}.")
    print(" Direct comparison valid: identical BEAM dataset, identical LLM-as-judge protocol.")
    print(rule)
| #1529 | |
| #1530 | |
| #1531 | # ============================================================ |
| #1532 | # Main |
| #1533 | # ============================================================ |
| #1534 | |
def _write_json_atomic(path, payload):
    """Write *payload* as indented JSON to *path* via a temp file and rename."""
    path = Path(path)
    os.makedirs(path.parent, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    # Swap in the finished file so an interrupted run never leaves a
    # truncated results file behind for --resume to choke on.
    os.replace(tmp_path, path)


def main():
    """Run the BEAM end-to-end evaluation from the command line.

    Parses CLI options, loads the dataset, optionally reloads earlier
    results (--resume), ingests each conversation into a fresh temporary
    database, evaluates it, checkpoints results after every conversation,
    then prints the SOTA report and writes a summary JSON next to
    RESULTS_FILE. Exits with status 1 when no data could be loaded.
    """
    parser = argparse.ArgumentParser(description="BEAM End-to-End Evaluation")
    parser.add_argument("--scales", default="100K,500K,1M,10M",
                        help="Scales to evaluate (comma-separated)")
    parser.add_argument("--sample", type=int, default=3,
                        help="Conversations per scale (0=all)")
    parser.add_argument("--model", default=DEFAULT_MODEL,
                        help="LLM model for answering and judging")
    parser.add_argument("--judge-model", default=None,
                        help="Separate LLM for judging (default: same as --model)")
    parser.add_argument("--full-context", action="store_true",
                        help="Send full conversation to LLM (ceiling test, bypasses retrieval)")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from previous results file")
    parser.add_argument("--dry-run", action="store_true",
                        help="Download data and print stats, don't evaluate")
    parser.add_argument("--use-cloud", action="store_true",
                        help="Enable LLM fact extraction (cloud tier). Requires OPENROUTER_API_KEY.")
    args = parser.parse_args()

    # Drop blank tokens so a trailing comma does not become an empty scale.
    scales = [s.strip() for s in args.scales.split(",") if s.strip()]
    sample_size = args.sample if args.sample > 0 else None
    judge_model = args.judge_model or args.model

    print(f"{'='*80}")
    print(" BEAM End-to-End Evaluation Pipeline")
    print(f" Scales: {scales}")
    print(f" Sample: {sample_size or 'ALL'} conversations/scale")
    print(f" Model: {args.model}")
    print(f" Judge: {judge_model}")
    if args.full_context:
        os.environ["FULL_CONTEXT_MODE"] = "1"
        print(" Mode: FULL-CONTEXT (bypassing retrieval)")
    print(f"{'='*80}")

    # Load data
    print("\n[1/4] Loading BEAM dataset...")
    data = load_beam_dataset(scales, max_conversations=sample_size)

    if not data:
        print("ERROR: No data loaded. Check HuggingFace token and dataset name.")
        sys.exit(1)

    # Print stats
    print("\n Dataset Summary:")
    for scale, convs in data.items():
        total_msgs = sum(len(c["messages"]) for c in convs)
        total_qs = sum(len(c["questions"]) for c in convs)
        print(f" {scale}: {len(convs)} convs, {total_msgs:,} msgs, {total_qs} questions")

    if args.dry_run:
        print("\n Dry run complete. Exiting.")
        return

    # Load previous results if resuming
    resume_ids = set()
    all_previous = []
    if args.resume and RESULTS_FILE.exists():
        print(f"\n Resuming from {RESULTS_FILE}...")
        with open(RESULTS_FILE, encoding="utf-8") as f:
            prev = json.load(f)
        all_previous = prev.get("results", [])
        for conv_result in all_previous:
            for r in conv_result.get("results", []):
                resume_ids.add(r["qid"])
        print(f" Already evaluated: {len(resume_ids)} questions")

    # Initialize LLM clients
    print("\n[2/4] Initializing LLM clients...")
    llm = LLMClient(model=args.model)
    judge_llm = LLMClient(model=judge_model)

    # Evaluate each conversation
    print(f"\n[3/4] Evaluating... ({len(data)} scales)")
    all_results = list(all_previous)
    # Index resumed records so new question results are merged into the
    # existing conversation entry rather than appended as a duplicate.
    results_by_conv = {
        (entry.get("scale"), entry.get("conversation_id")): entry
        for entry in all_results
    }

    try:
        for scale in sorted(data.keys()):
            conversations = data[scale]
            print(f"\n --- Scale: {scale} ({len(conversations)} conversations) ---")

            for conv in conversations:
                # Create fresh Mnemosyne DB for each conversation
                with tempfile.TemporaryDirectory() as tmpdir:
                    db_path = Path(tmpdir) / f"beam_{scale}_{conv['id']}.db"
                    init_beam(db_path)
                    beam = BeamMemory(session_id=f"beam_{scale}_{conv['id']}",
                                      db_path=db_path, use_cloud=args.use_cloud)
                    try:
                        # Ingest
                        t0 = time.perf_counter()
                        ingest_conversation(beam, conv["messages"])
                        ingest_time = time.perf_counter() - t0
                        print(f" Ingested {len(conv['messages'])} msgs in {ingest_time:.1f}s "
                              f"(DB: {os.path.getsize(db_path)/1024:.0f}KB)")

                        # Evaluate
                        conv_result = evaluate_conversation(
                            llm, judge_llm, beam, conv, resume_ids
                        )
                    finally:
                        # Close before the temp directory is removed, even
                        # when ingestion or evaluation raised.
                        beam.conn.close()

                key = (conv_result["scale"], conv_result["conversation_id"])
                existing = results_by_conv.get(key)
                if existing is None:
                    all_results.append(conv_result)
                    results_by_conv[key] = conv_result
                else:
                    existing.setdefault("results", []).extend(conv_result["results"])
                    existing["num_questions"] = conv_result["num_questions"]
                    existing["num_evaluated"] = len(existing["results"])

                # Save progress after each conversation
                metadata = {
                    "date": datetime.now(timezone.utc).isoformat(),
                    "model": args.model,
                    "judge_model": judge_model,
                    "top_k": DEFAULT_TOP_K,
                    "sample_size": sample_size or "ALL",
                    "scales": scales,
                    "total_conversations": len(all_results),
                }
                _write_json_atomic(
                    RESULTS_FILE, {"metadata": metadata, "results": all_results}
                )
    finally:
        # Cleanup
        llm.close()
        judge_llm.close()

    # Compute and print report
    print("\n[4/4] Computing SOTA report...")
    ability_summary = compute_ability_scores(all_results)

    metadata = {
        "model": args.model,
        "sample_size": sample_size or "ALL",
        "judge_model": judge_model,
    }
    print_sota_report(ability_summary, metadata)

    # Save summary
    summary_file = RESULTS_FILE.parent / "beam_e2e_summary.json"
    _write_json_atomic(summary_file, {
        "date": datetime.now(timezone.utc).isoformat(),
        "metadata": metadata,
        "ability_summary": {
            scale: {
                ab: {"avg_score": v["avg_score"], "count": v["count"]}
                for ab, v in abilities.items()
            }
            for scale, abilities in ability_summary.items()
        },
    })

    print(f"\n Results saved to: {RESULTS_FILE}")
    print(f" Summary saved to: {summary_file}")


if __name__ == "__main__":
    main()
| #1686 |