repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 15d ago
| #1 | #!/usr/bin/env python3 |
| #2 | """ |
| #3 | Self-Healing Quality Pipeline for Mnemosyne Episodic Memory |
| #4 | ============================================================ |
| #5 | |
| #6 | Detects degraded entries (bullet-format under 150 chars, or at most two words and under 100 chars) and repairs them |
| #7 | via a 4-stage LLM-as-Judge loop: Extract → Generate → Judge → Repair. |
| #8 | |
| #9 | Designed for upstream inclusion: model-agnostic, no mnemosyne internals |
| #10 | required beyond the existing BeamMemory API. |
| #11 | |
| #12 | Usage: |
| #13 | python scripts/heal_quality.py [--detect-only] [--entry-id ID] [--dry-run] |
| #14 | hermes mnemosyne heal-quality [--detect-only] [--dry-run] |
| #15 | """ |
| #16 | |
| #17 | from __future__ import annotations |
| #18 | |
| #19 | import argparse |
| #20 | import json |
| #21 | import os |
| #22 | import sys |
| #23 | import time |
| #24 | from datetime import datetime, timezone |
| #25 | from pathlib import Path |
| #26 | from typing import Any |
| #27 | |
| #28 | import sqlite3 |
| #29 | |
# --- Config knobs (also overridable via env) ---

def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Returns *default* when the variable is unset or blank. When it is set to
    something ``int()`` cannot parse, a warning is written to stderr and
    *default* is used, so a typo in the environment does not crash the module
    at import time.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        print(
            f"Warning: ignoring invalid {name}={raw!r}; using {default}",
            file=sys.stderr,
        )
        return default


# NOTE(review): JUDGE_THRESHOLD is not read by any function shown in this
# script -- confirm whether the repair loop is meant to gate on it.
JUDGE_THRESHOLD = _env_int("MNEMOSYNE_HEAL_JUDGE_THRESHOLD", 75)
# Default upper bound on repair iterations (see should_retry).
MAX_RETRIES = _env_int("MNEMOSYNE_HEAL_MAX_RETRIES", 3)
# Minimum character length for a summary to be accepted as-is.
MIN_SUMMARY_LEN = _env_int("MNEMOSYNE_HEAL_MIN_LEN", 300)
# Default estimated-token budget per memory unit (see extract_memory_units).
MEMORY_UNIT_BUDGET = _env_int("MNEMOSYNE_HEAL_BUDGET", 4000)
# From this retry count onward should_retry rewrites the fault to "escalate".
FORCE_M2_AFTER_RETRIES = _env_int("MNEMOSYNE_HEAL_ESCALATE_AFTER", 2)
# Model name recorded in the metadata written by update_entry.
JUDGE_MODEL = os.environ.get("MNEMOSYNE_HEAL_JUDGE_MODEL", "MiniMax-M2.7")
| #37 | |
| #38 | # --- LLM backends ------------------------------------------------------------ |
| #39 | |
def _call_mmx(prompt: str) -> str | None:
    """Call MiniMax M2.7 via mmx-cli. Returns text or None on failure.

    The executable is looked up at ``~/.npm-global/bin/mmx`` first and then on
    ``PATH``. ``None`` is returned when the CLI is not installed, cannot be
    started, times out (60 s), exits non-zero, or prints something that is not
    the expected ``{"content": [{"text": "..."}]}`` JSON shape.
    """
    import shutil
    import subprocess

    # Keep the historical install location as the first choice; PATH lookup
    # covers installs made with a different npm prefix.
    mmx_path = Path.home() / ".npm-global" / "bin" / "mmx"
    if mmx_path.exists():
        executable = str(mmx_path)
    else:
        executable = shutil.which("mmx")
        if executable is None:
            return None

    try:
        result = subprocess.run(
            [executable, "text", "chat", "--message", prompt],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: cannot exec; SubprocessError: timeout;
        # ValueError: e.g. an embedded NUL byte in the prompt argument.
        return None
    if result.returncode != 0:
        return None

    try:
        data = json.loads(result.stdout)
    except ValueError:
        return None

    # mmx returns {content: [{text: "..."}]}
    content = data.get("content") if isinstance(data, dict) else None
    if not isinstance(content, list) or not content:
        return None
    first = content[0]
    if not isinstance(first, dict):
        return None
    text = first.get("text")
    return text if isinstance(text, str) else None
| #65 | |
| #66 | |
def _call_mnemosyne_llm(memories: list[str], source: str) -> str | None:
    """Summarize *memories* with mnemosyne's own summarizer.

    Best-effort: returns None if ``mnemosyne.core.local_llm`` cannot be
    imported or if the summarizer raises; otherwise returns whatever
    ``summarize_memories(memories, source=source)`` returns.
    """
    try:
        from mnemosyne.core.local_llm import summarize_memories as mnemo_summarize
    except Exception:
        return None

    try:
        summary = mnemo_summarize(memories, source=source)
    except Exception:
        return None
    return summary
| #74 | |
| #75 | |
| #76 | # --- Core pipeline ------------------------------------------------------------ |
| #77 | |
def _estimate_tokens(text: str) -> int:
    """Estimate the token count of *text* at ~4 characters per token (English).

    The estimate is never lower than 1, even for an empty string.
    """
    approx = len(text) // 4
    return approx if approx > 1 else 1
| #81 | |
| #82 | |
def extract_memory_units(
    session_path: Path,
    budget: int = MEMORY_UNIT_BUDGET,
    max_messages: int = 80,
) -> list[str]:
    """Parse session JSON and split into token-budgeted memory units.

    Args:
        session_path: Session file; expected to hold a JSON object with a
            ``"messages"`` list of ``{"role": ..., "content": ...}`` objects.
        budget: Estimated-token ceiling per unit. A single message larger
            than the budget still becomes (part of) one unit on its own.
        max_messages: Only the last this-many messages are considered.

    Returns:
        A list of newline-joined ``"[ROLE] content"`` chunks, or ``[]`` when
        the file cannot be read or parsed or has no usable messages.
    """
    try:
        data = json.loads(session_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Unreadable file, bad encoding, or invalid JSON.
        return []

    messages = data.get("messages") if isinstance(data, dict) else None
    if not isinstance(messages, list) or max_messages <= 0:
        return []

    units: list[str] = []
    current: list[str] = []
    tokens = 0

    for msg in messages[-max_messages:]:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role") or ""

        # Skip tool messages — their raw output pollutes summarization context.
        # Only user and assistant turns carry conversational signal.
        if role == "tool":
            continue

        raw_content = msg.get("content")
        if raw_content is None:
            # A null content must not become the literal text "None".
            continue
        content = str(raw_content).strip()
        if not content:
            continue

        t = _estimate_tokens(content)
        # Close the current unit before it would overflow the budget.
        if tokens + t > budget and current:
            units.append("\n".join(current))
            current, tokens = [], 0
        current.append(f"[{str(role).upper()}] {content}")
        tokens += t

    if current:
        units.append("\n".join(current))
    return units
| #116 | |
| #117 | |
_JUDGE_SCORE_KEYS = ("factual_density", "format_compliance", "length_sufficiency", "grounding")


def _normalize_verdict(raw: Any) -> dict[str, Any] | None:
    """Coerce a parsed judge reply into a complete verdict, or None if unusable."""
    if not isinstance(raw, dict) or not any(key in raw for key in _JUDGE_SCORE_KEYS):
        return None

    verdict = dict(raw)
    for key in _JUDGE_SCORE_KEYS:
        try:
            value = float(raw.get(key, 0))
        except (TypeError, ValueError):
            value = 0.0
        if value != value:  # NaN never compares equal to itself
            value = 0.0
        value = min(100.0, max(0.0, value))
        verdict[key] = int(value) if value.is_integer() else value
    verdict["fault"] = str(raw.get("fault") or "none")
    verdict["diagnosis"] = str(raw.get("diagnosis") or "")
    verdict["retry_needed"] = bool(raw.get("retry_needed", False))
    return verdict


def judge_summary(summary: str, memory_units: list[str], source: str) -> dict[str, Any]:
    """LLM-as-Judge: score summary across 4 dimensions, return structured verdict.

    Args:
        summary: The candidate summary to audit.
        memory_units: Ground-truth chunks; only the first five are sent to the judge.
        source: Accepted for signature symmetry with the other stages; not used here.

    Returns:
        A dict that always contains the four numeric scores (0-100)
        ``factual_density``, ``format_compliance``, ``length_sufficiency`` and
        ``grounding`` plus ``fault``, ``diagnosis`` and ``retry_needed``.
        If the judge call fails, a length-based heuristic verdict is returned;
        if its reply cannot be parsed, a neutral all-50 verdict is returned.
        Both fallbacks report ``fault == "none"``.
    """
    if not summary or not memory_units:
        return {
            "factual_density": 0, "format_compliance": 0,
            "length_sufficiency": 0, "grounding": 0,
            "fault": "none", "diagnosis": "Empty summary or no source memories.",
            "retry_needed": False,
        }

    prompt = f'''You are a memory quality auditor. Evaluate this summary against the source memories.

SCORING DIMENSIONS (score each 0-100):
1. FACTUAL_DENSITY — Does it contain specific facts (names, paths, versions, numbers)?
2. FORMAT_COMPLIANCE — Is it plain prose (no bullets, no dashes)?
3. LENGTH_SUFFICIENCY — Is it detailed enough (>300 chars for complex sessions)?
4. GROUNDING — Does it match what actually happened in the memories?

SUMMARY TO AUDIT:
{summary}

MEMORY UNITS (ground truth, first 5):
{chr(10).join(memory_units[:5])}

Respond with JSON only:
{{"factual_density": N, "format_compliance": N, "length_sufficiency": N, "grounding": N, "fault": "none|truncated|generic|missing_facts|wrong_format", "diagnosis": "one-sentence explanation", "retry_needed": true|false}}'''

    text = _call_mmx(prompt)
    if not text:
        # Fallback: heuristic scoring when judge call fails
        long_enough = len(summary) >= MIN_SUMMARY_LEN
        score = 100 if long_enough else max(0, len(summary) // 3)
        return {
            "factual_density": score, "format_compliance": score,
            "length_sufficiency": score if long_enough else 0,
            "grounding": score,
            "fault": "none", "diagnosis": "Judge call failed, used heuristic fallback.",
            "retry_needed": False,
        }

    # Strip markdown code fences if present
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.split("```")[1]
        if cleaned.startswith("json"):
            cleaned = cleaned[4:]
    # Tolerate prose around the JSON object by keeping only the outermost braces.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]

    try:
        parsed = json.loads(cleaned.strip())
    except ValueError:
        parsed = None

    # Validate so callers can index the four score keys without a KeyError.
    verdict = _normalize_verdict(parsed)
    if verdict is not None:
        return verdict
    return {
        "factual_density": 50, "format_compliance": 50,
        "length_sufficiency": 50, "grounding": 50,
        "fault": "none", "diagnosis": "Judge JSON parse failed.",
        "retry_needed": False,
    }
| #172 | |
| #173 | |
def generate_summary(memory_units: list[str], source: str, *, force_m2: bool = False) -> str | None:
    """Generate a new summary, using M2.7 when forced or local LLM otherwise.

    Order of attempts:
      * ``force_m2=True``: mmx first, then the local summarizer.
      * ``force_m2=False``: the local summarizer first, then mmx if the local
        output is missing or shorter than ``MIN_SUMMARY_LEN``.

    A local result shorter than ``MIN_SUMMARY_LEN`` is never returned.

    Returns:
        The summary text, or None when *memory_units* is empty or no backend
        produced an acceptable result.
    """
    if not memory_units:
        return None

    joined = "\n---\n".join(memory_units)
    prompt = f"""Summarize into 1-3 plain prose sentences.
Preserve: names, file paths, tool names, project names, numbers, versions, decisions.
Discard: fluff, filler, meta-commentary.
Source: {source}

---MEMORY UNITS---
{joined}
---END---

Summary:"""

    # Prefer M2.7 when forced; fall back to local GGUF if mmx is unavailable
    if force_m2:
        remote = _call_mmx(prompt)
        if remote:
            return remote
        # mmx unavailable — fall through to local LLM

    local = _call_mnemosyne_llm(memory_units, source)
    if local and len(local) >= MIN_SUMMARY_LEN:
        return local

    if force_m2:
        # mmx already failed on this exact prompt above; calling it again
        # would only repeat the failure (and possibly its 60 s timeout).
        return None

    # Escalate to M2.7 if local model produced thin output
    return _call_mmx(prompt) or None
| #205 | |
| #206 | |
def repair_summary(summary: str, memory_units: list[str], fault: str, source: str) -> str | None:
    """Apply fault-specific repair strategy.

    Strategies by *fault*:
      * ``"generic"``: ask mmx to rewrite with specifics, using up to the
        first 1000 characters of the first memory unit as context.
      * ``"missing_facts"``: extract paths/versions/URLs from the memory
        units and ask mmx to weave up to 20 of them into the summary; if no
        facts are found, regenerate from scratch.
      * ``"wrong_format"``: ask mmx to convert the summary to plain prose.
      * anything else (including ``"truncated"`` and ``"escalate"``):
        regenerate from the memory units with ``force_m2=True``.

    Returns:
        The repaired summary, or None if the backend call produced nothing.
    """
    if fault == "generic":
        first_unit = memory_units[0][:1000] if memory_units else ''
        prompt = f"""Rewrite this summary to be SPECIFIC. Every noun must be named.
File paths must be real paths. Tools must be named. Numbers must be included.

Current summary: {summary}

Source context (first unit):
{first_unit}

Specific rewrite:"""
        return _call_mmx(prompt)

    if fault == "missing_facts":
        # Extract and re-inject key facts; de-duplicate across units so the
        # 20-fact cap is not spent on repeats.
        facts: list[str] = []
        for unit in memory_units:
            facts.extend(_extract_facts_from_text(unit))
        facts = list(dict.fromkeys(facts))
        if facts:
            fact_list = ', '.join(facts[:20])
            prompt = f"""Augment this summary by incorporating these specific facts:
{fact_list}

Current: {summary}

Augmented (preserve prose, add facts inline):"""
            return _call_mmx(prompt)
        return generate_summary(memory_units, source, force_m2=True)

    if fault == "wrong_format":
        prompt = f"""Rewrite in PLAIN PROSE ONLY. No bullets, no dashes, no lists.
Convert: {summary}
Plain prose:"""
        return _call_mmx(prompt)

    # "truncated" and unknown faults — full regenerate with M2.7.
    # The units are passed once: repeating the same list (as `units * 2`
    # did) only duplicates text in the prompt and adds no new context.
    return generate_summary(memory_units, source, force_m2=True)
| #244 | |
| #245 | |
def _extract_facts_from_text(text: str) -> list[str]:
    """Simple heuristic fact extractor: paths, URLs, version strings.

    Returns up to 5 path-like tokens, up to 5 dotted numbers (prefixed with
    ``v``) and up to 3 http(s) URLs, in that order, de-duplicated while
    preserving first-seen order. URLs are masked out before path matching so
    that the ``//host/path`` tail of a URL is not also reported as a path.
    """
    import re

    url_pattern = r'https?://\S+'
    urls = re.findall(url_pattern, text)

    facts: list[str] = []
    # File paths (searched in the text with URLs blanked out)
    text_without_urls = re.sub(url_pattern, ' ', text)
    paths = re.findall(
        r'(?:/[\w\-./]+(?:\.\w+)?|\b[\w\-]+\/[\w\-./]+(?:\.\w+)?)',
        text_without_urls,
    )
    facts.extend(paths[:5])
    # Version strings
    versions = re.findall(r'\b\d+\.\d+(?:\.\d+)?\b', text)
    facts.extend(f"v{v}" for v in versions[:5])
    # URLs
    facts.extend(urls[:3])
    return list(dict.fromkeys(facts))  # deduplicate, preserve order
| #260 | |
| #261 | |
def should_retry(retry_count: int, verdict: dict, max_retries: int = MAX_RETRIES) -> bool:
    """Decide whether the repair loop should run another iteration.

    Returns False once *retry_count* has reached *max_retries* or when the
    verdict's fault is ``"none"``. Otherwise returns True, and, as a side
    effect, rewrites ``verdict["fault"]`` to ``"escalate"`` once
    *retry_count* has reached ``FORCE_M2_AFTER_RETRIES``.
    """
    out_of_attempts = retry_count >= max_retries
    if out_of_attempts or verdict.get("fault") == "none":
        return False

    escalate_now = retry_count >= FORCE_M2_AFTER_RETRIES
    if escalate_now:
        # Forces M2.7 on next generate
        verdict["fault"] = "escalate"
    return True
| #271 | |
| #272 | |
| #273 | # --- Database ---------------------------------------------------------------- |
| #274 | |
def get_db_path() -> Path:
    """Return the database path.

    ``MNEMOSYNE_DB_PATH`` wins when it is set in the environment; otherwise
    the path is ``~/.hermes/mnemosyne/data/mnemosyne.db``.
    """
    fallback = Path.home().joinpath(".hermes", "mnemosyne", "data", "mnemosyne.db")
    configured = os.environ.get("MNEMOSYNE_DB_PATH")
    return Path(fallback if configured is None else configured)
| #278 | |
| #279 | |
def get_session_path(session_id: str) -> Path | None:
    """Map mnemosyne session_id (hermes_<ts>_<hash>) to session JSON file.

    Looks in ``~/.hermes/sessions``. For an id such as
    ``hermes_20260505_165757_<hash>`` the file
    ``session_20260505_165757_<hash>.json`` is tried first; if it does not
    exist, the first ``session_*.json`` (in sorted order) whose stem contains
    the id without its ``hermes_`` prefix is returned.

    Returns:
        The matching path, or None when *session_id* is empty, carries no
        information beyond the prefix, or no file matches.
    """
    if not session_id:
        return None

    sessions_dir = Path.home() / ".hermes" / "sessions"
    prefix = "hermes_"
    has_prefix = session_id.startswith(prefix)
    key = session_id[len(prefix):] if has_prefix else session_id
    if not key:
        # An empty key is a substring of every file name; refuse to guess.
        return None

    # session_id format: hermes_20260505_165757_<hash>
    # file format:       session_20260505_165757_<hash>.json
    if has_prefix:
        direct = sessions_dir / f"session_{key}.json"
        if direct.exists():
            return direct

    # Fallback: search (sorted so the result is deterministic)
    for candidate in sorted(sessions_dir.glob("session_*.json")):
        if key in candidate.stem:
            return candidate
    return None
| #297 | |
| #298 | |
def detect_degraded_entries(conn: sqlite3.Connection) -> list[tuple]:
    """List degraded rows of ``episodic_memory``, shortest content first.

    A row counts as degraded when its content starts with ``"- "`` and is
    under 150 characters, or is under 100 characters and contains fewer than
    two spaces. Each result tuple is
    ``(id, content, session_id, metadata_json, len)``.
    """
    degraded_query = """
        SELECT id, content, session_id, metadata_json, LENGTH(content) as len
        FROM episodic_memory
        WHERE (content LIKE '- %' AND LENGTH(content) < 150)
        OR (LENGTH(content) < 100 AND content NOT LIKE '% % %')
        ORDER BY len ASC
    """
    return list(conn.execute(degraded_query))
| #309 | |
| #310 | |
def update_entry(
    conn: sqlite3.Connection,
    entry_id: str,
    new_content: str,
    quality_score: float,
    verdict: dict,
    retry_count: int,
    degraded_at_was: str | None,
):
    """Update episodic_memory row with repaired content and quality metadata.

    Writes *new_content*, clears the ``degraded_at`` column, and stores the
    quality metadata in ``metadata_json``. Keys already present in the row's
    metadata are kept; the quality keys written here override same-named
    ones. The change is committed before returning.

    Args:
        conn: Open connection to the mnemosyne database.
        entry_id: ``id`` of the row to update.
        new_content: Repaired summary text.
        quality_score: Averaged judge score; below 60 flags the entry with
            ``needs_human_review``.
        verdict: Judge verdict; its ``fault`` is recorded as
            ``fault_before_repair``.
        retry_count: Number of repair iterations that were run.
        degraded_at_was: Previous ``degraded_at`` value, preserved inside the
            metadata when truthy.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Start from the row's current metadata so unrelated keys are not lost.
    existing: dict = {}
    row = conn.execute(
        "SELECT metadata_json FROM episodic_memory WHERE id = ?",
        (entry_id,),
    ).fetchone()
    if row and row[0]:
        try:
            parsed = json.loads(row[0])
        except (TypeError, ValueError):
            parsed = None  # unreadable metadata is replaced, not propagated
        if isinstance(parsed, dict):
            existing = parsed

    meta = {
        **existing,
        "quality_score": quality_score,
        "judge_model": JUDGE_MODEL,
        "consolidated_at": now,
        "fault_before_repair": verdict.get("fault"),
        "retry_loop_count": retry_count,
        "needs_human_review": quality_score < 60,
    }
    if degraded_at_was:
        meta["degraded_at"] = degraded_at_was

    conn.execute("""
        UPDATE episodic_memory
        SET content = ?,
        metadata_json = ?,
        degraded_at = NULL
        WHERE id = ?
    """, (new_content, json.dumps(meta), entry_id))
    conn.commit()
| #341 | |
| #342 | |
def get_episodic_stats(conn: sqlite3.Connection) -> dict:
    """Return aggregate counts over ``episodic_memory``.

    Keys: ``total`` (all rows), ``high_quality`` (content longer than 300
    characters), ``degraded`` (rows matching the short-bullet / near-empty
    rule) and ``marked_degraded`` (rows with a non-NULL ``degraded_at``).
    NULL aggregates, as produced on an empty table, are reported as 0.
    """
    stats_query = """
        SELECT
        COUNT(*) as total,
        SUM(CASE WHEN LENGTH(content) > 300 THEN 1 ELSE 0 END) as high_quality,
        SUM(CASE WHEN (content LIKE '- %' AND LENGTH(content) < 150)
        OR (LENGTH(content) < 100 AND content NOT LIKE '% % %')
        THEN 1 ELSE 0 END) as degraded,
        SUM(CASE WHEN degraded_at IS NOT NULL THEN 1 ELSE 0 END) as marked_degraded
        FROM episodic_memory
    """
    counts = conn.execute(stats_query).fetchone()
    labels = ("total", "high_quality", "degraded", "marked_degraded")
    return {label: (value or 0) for label, value in zip(labels, counts)}
| #361 | |
| #362 | |
| #363 | # --- Main pipeline orchestrator ---------------------------------------------- |
| #364 | |
def _verdict_score(verdict: dict[str, Any]) -> float:
    """Average the four judge dimensions, counting missing or non-numeric ones as 0."""
    total = 0.0
    for key in ("factual_density", "format_compliance", "length_sufficiency", "grounding"):
        value = verdict.get(key, 0)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            total += value
    return total / 4


def heal_entry(
    entry_id: str,
    conn: sqlite3.Connection,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Run the full 4-stage pipeline on one entry. Returns result dict.

    Stages: extract memory units from the session file (falling back to the
    entry's current content when no usable session data exists), generate a
    summary, judge it, then repair and re-judge while ``should_retry`` allows.

    Args:
        entry_id: ``id`` of the ``episodic_memory`` row to heal.
        conn: Open connection to the mnemosyne database.
        dry_run: When True, nothing is written to the database.

    Returns:
        A dict with ``entry_id`` and ``status``: ``"not_found"``,
        ``"generate_failed"``, ``"dry_run"`` or ``"repaired"``. The last two
        also carry ``old_len``, ``new_len``, ``quality_score``, ``verdict``
        and ``retry_count``.
    """
    cursor = conn.execute(
        "SELECT id, content, session_id, source, metadata_json, degraded_at FROM episodic_memory WHERE id = ?",
        (entry_id,)
    )
    row = cursor.fetchone()
    if not row:
        return {"entry_id": entry_id, "status": "not_found"}

    # metadata_json is selected but not needed here.
    eid, old_content, session_id, source, _meta_json, degraded_at = row
    old_content = old_content or ""  # NULL content must not break len()
    source = source or f"session:{session_id}"

    # Stage 0: Extract memory units from session file
    session_path = get_session_path(session_id) if session_id else None
    memory_units = extract_memory_units(session_path) if session_path else []
    if not memory_units and old_content:
        # No usable session data — use the existing content as the memory unit
        memory_units = [old_content]

    # Stage 1: Generate initial summary (M2.7 for repair pipeline)
    retry_count = 0
    summary = generate_summary(memory_units, source, force_m2=True)
    if not summary:
        return {"entry_id": entry_id, "status": "generate_failed", "fault": "none"}

    # Stage 2: Judge
    verdict = judge_summary(summary, memory_units, source)
    score = _verdict_score(verdict)

    # Closed loop: repair if needed. should_retry stops the loop once the
    # fault is "none" or the retry budget is spent.
    while should_retry(retry_count, verdict):
        retry_count += 1
        repaired = repair_summary(summary, memory_units, verdict.get("fault", ""), source)
        if not repaired:
            break
        summary = repaired
        verdict = judge_summary(summary, memory_units, source)
        score = _verdict_score(verdict)

    outcome = {
        "entry_id": entry_id,
        "status": "dry_run" if dry_run else "repaired",
        "old_len": len(old_content),
        "new_len": len(summary),
        "quality_score": score,
        "verdict": verdict,
        "retry_count": retry_count,
    }
    if not dry_run:
        update_entry(conn, eid, summary, score, verdict, retry_count, degraded_at)
    return outcome
| #438 | |
| #439 | |
def run_heal_pipeline(detect_only: bool = False, entry_id: str | None = None, dry_run: bool = False):
    """Main entry point.

    With *entry_id*, heals (or dry-runs) that single entry and prints the
    result as JSON. Without it, prints overall stats and the list of degraded
    entries; unless *detect_only* or *dry_run* is set, it then heals every
    degraded entry, printing one status line per entry followed by the final
    stats and per-entry results as JSON.

    Exits with status 1 (message on stderr) when the database file does not
    exist. The connection is closed on every exit path, including errors.
    """
    db_path = get_db_path()
    if not db_path.exists():
        # stdout carries the JSON report, so diagnostics go to stderr.
        print(f"Error: database not found at {db_path}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(db_path)
    try:
        if entry_id:
            result = heal_entry(entry_id, conn, dry_run=dry_run)
            print(json.dumps(result, indent=2))
            return

        # Detect degraded
        degraded = detect_degraded_entries(conn)
        stats = get_episodic_stats(conn)
        print(json.dumps({"stats": stats, "degraded_count": len(degraded), "entries": [
            {"id": e[0], "len": e[4], "session_id": e[2]} for e in degraded
        ]}, indent=2))

        if detect_only or dry_run:
            return

        # Heal all degraded
        repaired = []
        for e in degraded:
            eid = e[0]
            result = heal_entry(eid, conn, dry_run=False)
            repaired.append(result)
            mark = '✓' if result['status'] == 'repaired' else '✗'
            print(f" {mark} {eid}: {result.get('status')}")

        final_stats = get_episodic_stats(conn)
        print(json.dumps({"final_stats": final_stats, "repaired": repaired}, indent=2))
    finally:
        conn.close()
| #477 | |
| #478 | |
| #479 | # --- CLI ------------------------------------------------------------------------- |
| #480 | |
def main():
    """Parse the command-line flags and hand them to ``run_heal_pipeline``.

    Flags: ``--detect-only``, ``--entry-id ID`` and ``--dry-run``.
    """
    cli = argparse.ArgumentParser(description="Mnemosyne self-healing quality pipeline")
    cli.add_argument("--detect-only", help="Only detect degraded entries", action="store_true")
    cli.add_argument("--entry-id", help="Heal a specific entry by ID", type=str)
    cli.add_argument("--dry-run", help="Report what would change without writing", action="store_true")
    options = cli.parse_args()

    # The parsed namespace holds exactly detect_only, entry_id and dry_run,
    # which are the keyword arguments the pipeline accepts.
    run_heal_pipeline(**vars(options))


if __name__ == "__main__":
    main()