my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""
#3	Self-Healing Quality Pipeline for Mnemosyne Episodic Memory
#4	============================================================
#5
#6	Detects degraded entries (bullet-format, <300 chars) and repairs them
#7	via a 4-stage LLM-as-Judge loop: Extract → Generate → Judge → Repair.
#8
#9	Designed for upstream inclusion: model-agnostic, no mnemosyne internals
#10	required beyond the existing BeamMemory API.
#11
#12	Usage:
#13	python scripts/heal_quality.py [--detect-only] [--entry-id ID] [--dry-run]
#14	hermes mnemosyne heal-quality [--detect-only] [--dry-run]
#15	"""
#16
#17	from __future__ import annotations
#18
#19	import argparse
#20	import json
#21	import os
#22	import sys
#23	import time
#24	from datetime import datetime, timezone
#25	from pathlib import Path
#26	from typing import Any
#27
#28	import sqlite3
#29
# --- Config knobs (each can be overridden through the environment) ---

def _int_knob(env_name: str, fallback: str) -> int:
    """Read *env_name* from the environment as an int, using *fallback* when unset."""
    return int(os.environ.get(env_name, fallback))


# Judge score threshold (the judge prompt scores each dimension 0-100).
JUDGE_THRESHOLD = _int_knob("MNEMOSYNE_HEAL_JUDGE_THRESHOLD", "75")
# Default cap on repair-loop iterations (see should_retry).
MAX_RETRIES = _int_knob("MNEMOSYNE_HEAL_MAX_RETRIES", "3")
# Character length below which a summary is considered thin.
MIN_SUMMARY_LEN = _int_knob("MNEMOSYNE_HEAL_MIN_LEN", "300")
# Default estimated-token budget of one memory unit (see extract_memory_units).
MEMORY_UNIT_BUDGET = _int_knob("MNEMOSYNE_HEAL_BUDGET", "4000")
# Retry count from which should_retry marks the verdict as "escalate".
FORCE_M2_AFTER_RETRIES = _int_knob("MNEMOSYNE_HEAL_ESCALATE_AFTER", "2")
# Judge model name recorded in the metadata of repaired entries.
JUDGE_MODEL = os.environ.get("MNEMOSYNE_HEAL_JUDGE_MODEL", "MiniMax-M2.7")
#37
#38	# --- LLM backends ------------------------------------------------------------
#39
def _call_mmx(prompt: str) -> str | None:
    """Call MiniMax M2.7 via mmx-cli and return the reply text.

    Runs ``~/.npm-global/bin/mmx text chat --message <prompt>`` with a 60 s
    timeout and reads the ``text`` field of the first ``content`` item from
    the JSON the command prints.

    Args:
        prompt: Full prompt text, passed as a single command-line argument.

    Returns:
        The reply text, or ``None`` when the binary is missing, the command
        fails or times out, or its output does not have the expected shape.
    """
    import subprocess

    mmx_path = Path.home() / ".npm-global" / "bin" / "mmx"
    if not mmx_path.exists():
        return None

    try:
        result = subprocess.run(
            [str(mmx_path), "text", "chat", "--message", prompt],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # Launch failure, timeout, undecodable output, or an argument the OS
        # rejects: this backend is best-effort, so report "no answer".
        return None
    if result.returncode != 0:
        return None

    try:
        data = json.loads(result.stdout)
    except ValueError:
        return None

    # mmx returns {content: [{text: "..."}]}; check each level explicitly
    # instead of relying on a blanket except to absorb shape errors.
    if not isinstance(data, dict):
        return None
    content = data.get("content")
    if not isinstance(content, list) or not content:
        return None
    first = content[0]
    if not isinstance(first, dict):
        return None
    text = first.get("text")
    return text if isinstance(text, str) else None
#65
#66
def _call_mnemosyne_llm(memories: list[str], source: str) -> str | None:
    """Summarize *memories* with mnemosyne's own backend (local GGUF or configured remote).

    The import happens inside the function, so a missing mnemosyne package
    is treated like any other failure: the function returns ``None``.
    """
    summary = None
    try:
        from mnemosyne.core import local_llm

        summary = local_llm.summarize_memories(memories, source=source)
    except Exception:
        # Best-effort backend: import and runtime errors both mean "no summary".
        summary = None
    return summary
#74
#75
#76	# --- Core pipeline ------------------------------------------------------------
#77
def _estimate_tokens(text: str) -> int:
    """Rough token count for *text*: one token per four characters, never below 1."""
    approx = len(text) // 4
    return approx if approx >= 1 else 1
#81
#82
def extract_memory_units(session_path: Path, budget: int = MEMORY_UNIT_BUDGET) -> list[str]:
    """Parse a session JSON file and split it into token-budgeted memory units.

    Only the last 80 entries of the top-level ``messages`` list are used.
    Each kept message becomes a ``[ROLE] content`` line; lines are grouped
    into units whose estimated token count stays within *budget* (a single
    message larger than the budget still forms its own unit).

    Args:
        session_path: Path to the session JSON file.
        budget: Maximum estimated tokens per unit.

    Returns:
        The memory units in conversation order, or ``[]`` when the file
        cannot be read or parsed or holds no usable messages.
    """
    try:
        data = json.loads(session_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Missing/unreadable file, bad encoding, or invalid JSON.
        return []

    messages = data.get("messages") if isinstance(data, dict) else None
    if not isinstance(messages, list):
        return []

    units: list[str] = []
    current: list[str] = []
    tokens = 0

    for msg in messages[-80:]:  # last 80 messages
        if not isinstance(msg, dict):
            continue
        role = str(msg.get("role") or "")

        # Skip tool messages — their raw output pollutes summarization context.
        # Only user and assistant turns carry conversational signal.
        if role == "tool":
            continue

        raw_content = msg.get("content")
        if raw_content is None:
            # A null content must not turn into the literal text "None".
            continue
        content = str(raw_content).strip()
        if not content:
            continue

        cost = _estimate_tokens(content)
        if tokens + cost > budget and current:
            # Adding this message would overflow the unit: close it first.
            units.append("\n".join(current))
            current, tokens = [], 0
        current.append(f"[{role.upper()}] {content}")
        tokens += cost

    if current:
        units.append("\n".join(current))
    return units
#116
#117
# The four score dimensions the judge prompt asks for.
_JUDGE_SCORE_KEYS = ("factual_density", "format_compliance", "length_sufficiency", "grounding")


def _uniform_verdict(score: int, diagnosis: str) -> dict[str, Any]:
    """Build a verdict with the same score in every dimension and no fault."""
    verdict: dict[str, Any] = dict.fromkeys(_JUDGE_SCORE_KEYS, score)
    verdict.update(fault="none", diagnosis=diagnosis, retry_needed=False)
    return verdict


def _normalize_judge_verdict(raw: Any) -> dict[str, Any] | None:
    """Validate a parsed judge reply; return a clean verdict or None if unusable."""
    if not isinstance(raw, dict):
        return None
    verdict = dict(raw)
    for key in _JUDGE_SCORE_KEYS:
        value = raw.get(key)
        if isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        if number != number:  # NaN
            return None
        number = min(100.0, max(0.0, number))
        verdict[key] = int(number) if number.is_integer() else number
    fault = raw.get("fault")
    # Callers compare the fault against lowercase literals such as "none".
    verdict["fault"] = fault.strip().lower() if isinstance(fault, str) and fault.strip() else "none"
    verdict["diagnosis"] = str(raw.get("diagnosis", ""))
    verdict["retry_needed"] = bool(raw.get("retry_needed", False))
    return verdict


def judge_summary(summary: str, memory_units: list[str], source: str) -> dict[str, Any]:
    """LLM-as-Judge: score a summary across 4 dimensions and return a structured verdict.

    Args:
        summary: Summary text to audit.
        memory_units: Source memories; the first five are shown to the judge.
        source: Source label (currently not included in the judge prompt).

    Returns:
        A dict that always contains numeric ``factual_density``,
        ``format_compliance``, ``length_sufficiency`` and ``grounding``
        scores (0-100), a string ``fault``, a ``diagnosis`` and a boolean
        ``retry_needed``. When the judge call fails a length heuristic is
        used; when its reply cannot be parsed or is missing scores, every
        score is 50.
    """
    if not summary or not memory_units:
        return _uniform_verdict(0, "Empty summary or no source memories.")

    prompt = f'''You are a memory quality auditor. Evaluate this summary against the source memories.

SCORING DIMENSIONS (score each 0-100):
1. FACTUAL_DENSITY — Does it contain specific facts (names, paths, versions, numbers)?
2. FORMAT_COMPLIANCE — Is it plain prose (no bullets, no dashes)?
3. LENGTH_SUFFICIENCY — Is it detailed enough (>300 chars for complex sessions)?
4. GROUNDING — Does it match what actually happened in the memories?

SUMMARY TO AUDIT:
{summary}

MEMORY UNITS (ground truth, first 5):
{chr(10).join(memory_units[:5])}

Respond with JSON only:
{{"factual_density": N, "format_compliance": N, "length_sufficiency": N, "grounding": N, "fault": "none|truncated|generic|missing_facts|wrong_format", "diagnosis": "one-sentence explanation", "retry_needed": true|false}}'''

    text = _call_mmx(prompt)
    if not text:
        # Fallback: heuristic scoring when judge call fails
        long_enough = len(summary) >= MIN_SUMMARY_LEN
        score = 100 if long_enough else max(0, len(summary) // 3)
        verdict = _uniform_verdict(score, "Judge call failed, used heuristic fallback.")
        verdict["length_sufficiency"] = score if long_enough else 0
        return verdict

    # Keep only the outermost {...} span: this drops markdown code fences and
    # any prose the model put around the JSON object.
    start, end = text.find("{"), text.rfind("}")
    candidate = text[start:end + 1] if start != -1 and end > start else text.strip()
    try:
        parsed = json.loads(candidate)
    except ValueError:
        parsed = None

    verdict = _normalize_judge_verdict(parsed)
    if verdict is None:
        return _uniform_verdict(50, "Judge JSON parse failed.")
    return verdict
#172
#173
def generate_summary(memory_units: list[str], source: str, *, force_m2: bool = False) -> str | None:
    """Generate a new summary, using M2.7 when forced or the local LLM otherwise.

    Order of attempts:
      1. M2.7 via mmx, only when *force_m2* is set.
      2. mnemosyne's own LLM; accepted when at least ``MIN_SUMMARY_LEN`` chars.
      3. M2.7 via mmx as escalation, unless step 1 already tried it.

    Args:
        memory_units: Source text units to summarize.
        source: Source label included in the prompt.
        force_m2: Try M2.7 first instead of the local model.

    Returns:
        The summary text, or ``None`` when there is nothing to summarize or
        no backend produced an acceptable result.
    """
    if not memory_units:
        return None

    joined = "\n---\n".join(memory_units)
    prompt = f"""Summarize into 1-3 plain prose sentences.
Preserve: names, file paths, tool names, project names, numbers, versions, decisions.
Discard: fluff, filler, meta-commentary.
Source: {source}

---MEMORY UNITS---
{joined}
---END---

Summary:"""

    # Prefer M2.7 when forced; fall back to local GGUF if mmx is unavailable
    mmx_already_tried = False
    if force_m2:
        mmx_already_tried = True
        result = _call_mmx(prompt)
        if result:
            return result
        # mmx unavailable — fall through to local LLM

    result = _call_mnemosyne_llm(memory_units, source)
    if result and len(result) >= MIN_SUMMARY_LEN:
        return result

    # Escalate to M2.7 if local model produced thin output. Skip this when the
    # same prompt already failed above: repeating it only risks a second
    # timeout on a backend that just proved unavailable.
    if mmx_already_tried:
        return None
    return _call_mmx(prompt) or None
#205
#206
def repair_summary(summary: str, memory_units: list[str], fault: str, source: str) -> str | None:
    """Apply a fault-specific repair strategy to *summary*.

    Strategies by *fault*:
      * ``"generic"``: ask M2.7 for a more specific rewrite, using the first
        1000 chars of the first memory unit as context.
      * ``"missing_facts"``: extract facts from the units and ask M2.7 to
        weave up to 20 of them in; regenerate if no facts are found.
      * ``"wrong_format"``: ask M2.7 to rewrite as plain prose.
      * anything else (including ``"truncated"``): regenerate from the memory
        units with M2.7 forced.

    Args:
        summary: Current summary text.
        memory_units: Source text units.
        fault: Fault label from the judge verdict.
        source: Source label passed to regeneration.

    Returns:
        The repaired summary, or ``None`` when the backend gave no result.
    """
    if fault == "generic":
        prompt = f"""Rewrite this summary to be SPECIFIC. Every noun must be named.
File paths must be real paths. Tools must be named. Numbers must be included.

Current summary: {summary}

Source context (first unit):
{memory_units[0][:1000] if memory_units else ''}

Specific rewrite:"""
        return _call_mmx(prompt)

    if fault == "missing_facts":
        # Extract and re-inject key facts; dedupe across units so repeated
        # paths do not use up the 20-fact budget.
        facts = list(dict.fromkeys(
            fact for unit in memory_units for fact in _extract_facts_from_text(unit)
        ))
        if facts:
            prompt = f"""Augment this summary by incorporating these specific facts:
{', '.join(facts[:20])}

Current: {summary}

Augmented (preserve prose, add facts inline):"""
            return _call_mmx(prompt)
        return generate_summary(memory_units, source, force_m2=True)

    if fault == "wrong_format":
        prompt = f"""Rewrite in PLAIN PROSE ONLY. No bullets, no dashes, no lists.
Convert: {summary}
Plain prose:"""
        return _call_mmx(prompt)

    # "truncated" and unknown faults — full regenerate with M2.7.
    # The units are passed once: repeating the same list would double the
    # prompt size without giving the model any new context.
    return generate_summary(memory_units, source, force_m2=True)
#244
#245
def _extract_facts_from_text(text: str) -> list[str]:
    """Heuristic fact extractor: file paths, version strings, and URLs.

    URLs are removed from the text before searching for paths and versions,
    so that a URL is reported once as a URL and not additionally as a
    pseudo-path (``//host/page``) or a pseudo-version (an IP address).

    Args:
        text: Free-form text to scan.

    Returns:
        Up to 5 paths, then up to 5 versions (prefixed with ``v``), then up
        to 3 URLs, de-duplicated with first-seen order preserved.
    """
    import re

    url_pattern = r'https?://\S+'
    urls = re.findall(url_pattern, text)
    # Replace URLs with a space so the words on either side stay separated.
    without_urls = re.sub(url_pattern, ' ', text)

    facts = []
    # File paths
    paths = re.findall(r'(?:/[\w\-./]+(?:\.\w+)?|\b[\w\-]+\/[\w\-./]+(?:\.\w+)?)', without_urls)
    facts.extend(paths[:5])
    # Version strings
    versions = re.findall(r'\b\d+\.\d+(?:\.\d+)?\b', without_urls)
    facts.extend(f"v{v}" for v in versions[:5])
    # URLs
    facts.extend(urls[:3])
    return list(dict.fromkeys(facts))  # deduplicate, preserve order
#260
#261
def should_retry(retry_count: int, verdict: dict, max_retries: int = MAX_RETRIES) -> bool:
    """Tell the repair loop whether another attempt should run.

    No retry happens once *retry_count* has reached *max_retries* or when the
    verdict's fault is ``"none"``. Otherwise a retry is requested, and from
    ``FORCE_M2_AFTER_RETRIES`` attempts onward the verdict is modified in
    place: its fault becomes ``"escalate"``.
    """
    out_of_attempts = retry_count >= max_retries
    if out_of_attempts or verdict.get("fault") == "none":
        return False

    if retry_count >= FORCE_M2_AFTER_RETRIES:
        # Forces M2.7 on next generate
        verdict["fault"] = "escalate"
    return True
#271
#272
#273	# --- Database ----------------------------------------------------------------
#274
def get_db_path() -> Path:
    """Return the database path: ``$MNEMOSYNE_DB_PATH`` if set, else ``~/.hermes/mnemosyne/data/mnemosyne.db``."""
    fallback = Path.home().joinpath(".hermes", "mnemosyne", "data", "mnemosyne.db")
    override = os.environ.get("MNEMOSYNE_DB_PATH")
    if override is None:
        return Path(fallback)
    return Path(override)
#278
#279
def get_session_path(session_id: str) -> Path | None:
    """Map a mnemosyne session id to its session JSON file under ``~/.hermes/sessions``.

    A session id ``hermes_<rest>`` corresponds to the file
    ``session_<rest>.json`` (for example ``hermes_20260505_165757_<hash>``
    maps to ``session_20260505_165757_<hash>.json``). When that exact file
    does not exist, the first ``session_*.json`` (in sorted order) whose stem
    contains the id without its ``hermes_`` part is used.

    Args:
        session_id: Session identifier stored with the memory entry.

    Returns:
        Path of the matching session file, or ``None`` when the id is empty
        or nothing matches.
    """
    if not session_id:
        return None

    sessions_dir = Path.home() / ".hermes" / "sessions"
    prefix = "hermes_"

    if session_id.startswith(prefix):
        # Swap only the prefix; everything after it is kept verbatim, so no
        # stray separator is appended however many "_" parts the id has.
        exact = sessions_dir / f"session_{session_id[len(prefix):]}.json"
        if exact.exists():
            return exact

    # Fallback: search. An empty needle would match every file, so bail out.
    needle = session_id.replace(prefix, "")
    if not needle:
        return None
    for candidate in sorted(sessions_dir.glob("session_*.json")):
        if needle in candidate.stem:
            return candidate
    return None
#297
#298
def detect_degraded_entries(conn: sqlite3.Connection) -> list[tuple]:
    """Fetch degraded rows as (id, content, session_id, metadata_json, len) tuples.

    A row of ``episodic_memory`` counts as degraded when its content starts
    with ``"- "`` and is shorter than 150 characters, or is shorter than 100
    characters and contains fewer than two spaces. Shortest entries come first.
    """
    degraded_query = """
        SELECT id, content, session_id, metadata_json, LENGTH(content) as len
        FROM episodic_memory
        WHERE (content LIKE '- %' AND LENGTH(content) < 150)
           OR (LENGTH(content) < 100 AND content NOT LIKE '% % %')
        ORDER BY len ASC
    """
    rows = conn.execute(degraded_query).fetchall()
    return rows
#309
#310
def update_entry(
    conn: sqlite3.Connection,
    entry_id: str,
    new_content: str,
    quality_score: float,
    verdict: dict,
    retry_count: int,
    degraded_at_was: str | None,
) -> None:
    """Update an ``episodic_memory`` row with repaired content and quality metadata.

    The quality fields are merged into the row's existing ``metadata_json``
    object, so keys unrelated to healing are kept. Existing metadata that is
    not a valid JSON object is replaced. ``degraded_at`` is cleared on the
    row, with its previous value kept inside the metadata. The change is
    committed before returning.

    Args:
        conn: Open connection to the mnemosyne database.
        entry_id: Id of the row to update.
        new_content: Repaired summary text.
        quality_score: Mean judge score; below 60 flags the entry for review.
        verdict: Judge verdict; its ``fault`` is recorded.
        retry_count: Number of repair iterations that ran.
        degraded_at_was: Previous ``degraded_at`` value, if any.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Start from whatever metadata the row already carries.
    meta: dict = {}
    existing = conn.execute(
        "SELECT metadata_json FROM episodic_memory WHERE id = ?", (entry_id,)
    ).fetchone()
    if existing and existing[0]:
        try:
            previous = json.loads(existing[0])
        except (TypeError, ValueError):
            previous = None
        if isinstance(previous, dict):
            meta.update(previous)

    meta.update({
        "quality_score": quality_score,
        "judge_model": JUDGE_MODEL,
        "consolidated_at": now,
        "fault_before_repair": verdict.get("fault"),
        "retry_loop_count": retry_count,
        "needs_human_review": quality_score < 60,
    })
    if degraded_at_was:
        meta["degraded_at"] = degraded_at_was

    conn.execute("""
        UPDATE episodic_memory
        SET content = ?,
            metadata_json = ?,
            degraded_at = NULL
        WHERE id = ?
    """, (new_content, json.dumps(meta), entry_id))
    conn.commit()
#341
#342
def get_episodic_stats(conn: sqlite3.Connection) -> dict:
    """Count rows of ``episodic_memory`` by quality bucket.

    Returns a dict with ``total``, ``high_quality`` (content longer than 300
    chars), ``degraded`` (same criteria as the degraded-entry query) and
    ``marked_degraded`` (``degraded_at`` set). Empty aggregates count as 0.
    """
    stats_query = """
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN LENGTH(content) > 300 THEN 1 ELSE 0 END) as high_quality,
            SUM(CASE WHEN (content LIKE '- %' AND LENGTH(content) < 150)
                       OR (LENGTH(content) < 100 AND content NOT LIKE '% % %')
                     THEN 1 ELSE 0 END) as degraded,
            SUM(CASE WHEN degraded_at IS NOT NULL THEN 1 ELSE 0 END) as marked_degraded
        FROM episodic_memory
    """
    counts = conn.execute(stats_query).fetchone()
    labels = ("total", "high_quality", "degraded", "marked_degraded")
    # SUM() over zero rows yields NULL, hence the "or 0".
    return {label: (counts[position] or 0) for position, label in enumerate(labels)}
#361
#362
#363	# --- Main pipeline orchestrator ----------------------------------------------
#364
# Verdict keys averaged into the overall quality score.
_HEAL_SCORE_DIMENSIONS = ("factual_density", "format_compliance", "length_sufficiency", "grounding")


def _mean_verdict_score(verdict: dict[str, Any]) -> float:
    """Average the four judge scores; missing or non-numeric scores count as 0."""
    total = 0.0
    for key in _HEAL_SCORE_DIMENSIONS:
        value = verdict.get(key, 0)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            total += value
    return total / len(_HEAL_SCORE_DIMENSIONS)


def heal_entry(
    entry_id: str,
    conn: sqlite3.Connection,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Run the full 4-stage pipeline (Extract, Generate, Judge, Repair) on one entry.

    Args:
        entry_id: Id of the ``episodic_memory`` row to heal.
        conn: Open connection to the mnemosyne database.
        dry_run: When true, compute the result but do not write it back.

    Returns:
        A result dict with ``entry_id`` and ``status``: ``"not_found"``,
        ``"generate_failed"``, ``"dry_run"`` or ``"repaired"``. The last two
        also carry ``old_len``, ``new_len``, ``quality_score``, ``verdict``
        and ``retry_count``.
    """
    cursor = conn.execute(
        "SELECT id, content, session_id, source, metadata_json, degraded_at FROM episodic_memory WHERE id = ?",
        (entry_id,)
    )
    row = cursor.fetchone()
    if not row:
        return {"entry_id": entry_id, "status": "not_found"}

    # metadata_json is selected but not parsed here: nothing below reads it.
    eid, old_content, session_id, source, _meta_json, degraded_at = row
    old_content = old_content or ""  # NULL content must not break len()
    source = source or f"session:{session_id}"

    # Stage 0: Extract memory units from session file
    session_path = get_session_path(session_id) if session_id else None
    memory_units = extract_memory_units(session_path) if session_path else []
    if not memory_units and old_content:
        # No session file (or nothing usable in it) — use the existing
        # content as the memory unit
        memory_units = [old_content]

    # Stage 1: Generate initial summary (M2.7 for repair pipeline)
    summary = generate_summary(memory_units, source, force_m2=True)
    if not summary:
        return {"entry_id": entry_id, "status": "generate_failed", "fault": "none"}

    # Stage 2: Judge
    verdict = judge_summary(summary, memory_units, source)
    score = _mean_verdict_score(verdict)

    # Closed loop: repair if needed. should_retry ends the loop once the
    # fault is "none" or the retry budget is used up.
    retry_count = 0
    while should_retry(retry_count, verdict):
        retry_count += 1
        repaired = repair_summary(summary, memory_units, verdict.get("fault", ""), source)
        if not repaired:
            break
        summary = repaired
        verdict = judge_summary(summary, memory_units, source)
        score = _mean_verdict_score(verdict)

    result = {
        "entry_id": entry_id,
        "status": "dry_run" if dry_run else "repaired",
        "old_len": len(old_content),
        "new_len": len(summary),
        "quality_score": score,
        "verdict": verdict,
        "retry_count": retry_count,
    }
    if not dry_run:
        update_entry(conn, eid, summary, score, verdict, retry_count, degraded_at)
    return result
#438
#439
def run_heal_pipeline(detect_only: bool = False, entry_id: str | None = None, dry_run: bool = False):
    """Main entry point: heal one entry, or detect and heal all degraded entries.

    With *entry_id*, only that entry is processed (honouring *dry_run*) and
    its result is printed as JSON. Otherwise statistics and the list of
    degraded entries are printed; unless *detect_only* or *dry_run* is set,
    every degraded entry is then healed and the final statistics are printed.

    Args:
        detect_only: Only report degraded entries.
        entry_id: Heal this single entry instead of scanning.
        dry_run: Do not write changes.

    Raises:
        SystemExit: With code 1 when the database file does not exist.
    """
    db_path = get_db_path()
    if not db_path.exists():
        print(f"Error: database not found at {db_path}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(db_path)
    try:
        if entry_id:
            result = heal_entry(entry_id, conn, dry_run=dry_run)
            print(json.dumps(result, indent=2))
            return

        # Detect degraded
        degraded = detect_degraded_entries(conn)
        stats = get_episodic_stats(conn)
        print(json.dumps({"stats": stats, "degraded_count": len(degraded), "entries": [
            {"id": e[0], "len": e[4], "session_id": e[2]} for e in degraded
        ]}, indent=2))

        if detect_only or dry_run:
            return

        # Heal all degraded
        repaired = []
        for e in degraded:
            eid = e[0]
            try:
                result = heal_entry(eid, conn, dry_run=False)
            except Exception as exc:
                # Batch boundary: record the failure and keep going so one
                # bad entry does not abort the rest of the run.
                result = {"entry_id": eid, "status": "error", "error": str(exc)}
            repaired.append(result)
            print(f" {'✓' if result['status'] == 'repaired' else '✗'} {eid}: {result.get('status')}")

        final_stats = get_episodic_stats(conn)
        print(json.dumps({"final_stats": final_stats, "repaired": repaired}, indent=2))
    finally:
        # Close the connection on every exit path, including exceptions.
        conn.close()
#477
#478
#479	# --- CLI -------------------------------------------------------------------------
#480
def main():
    """Parse the command-line flags and hand them to ``run_heal_pipeline``."""
    cli = argparse.ArgumentParser(description="Mnemosyne self-healing quality pipeline")
    cli.add_argument(
        "--detect-only",
        action="store_true",
        help="Only detect degraded entries",
    )
    cli.add_argument(
        "--entry-id",
        type=str,
        help="Heal a specific entry by ID",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Report what would change without writing",
    )
    options = cli.parse_args()

    # argparse stores the flags as detect_only / entry_id / dry_run, which are
    # exactly the keyword names run_heal_pipeline accepts.
    run_heal_pipeline(**vars(options))


if __name__ == "__main__":
    main()

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public