my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

"""
Mnemosyne Local LLM Consolidation
=================================
Lightweight on-device summarization for the sleep/consolidation cycle.
Uses llama-cpp-python (ARM64 + x86_64 native) with ctransformers fallback.
Falls back to AAAK encoding if the model is unavailable or inference fails.

Model cache: ~/.hermes/mnemosyne/models/
Default model: TinyLlama-1.1B-Chat-v1.0-GGUF (Q4_K_M, ~600MB)
"""
#11
import os
import re
import sys
from pathlib import Path
from typing import List, Optional
#17
# --- Config ------------------------------------------------------------------
# Default GGUF model and the directory it is cached in.
DEFAULT_MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
DEFAULT_MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
MODEL_CACHE_DIR = Path.home().joinpath(".hermes", "mnemosyne", "models")

# Global switch plus local-inference tuning knobs, all read from the environment.
LLM_ENABLED = os.getenv("MNEMOSYNE_LLM_ENABLED", "true").lower() in ("1", "true", "yes")
LLM_MAX_TOKENS = int(os.getenv("MNEMOSYNE_LLM_MAX_TOKENS", "256"))
LLM_N_THREADS = int(os.getenv("MNEMOSYNE_LLM_N_THREADS", "4"))
LLM_N_CTX = int(os.getenv("MNEMOSYNE_LLM_N_CTX", "2048"))

# Override model via env. Both variables must be set for the override to apply.
_env_repo = os.getenv("MNEMOSYNE_LLM_REPO")
_env_file = os.getenv("MNEMOSYNE_LLM_FILE")
if _env_repo and _env_file:
    DEFAULT_MODEL_REPO = _env_repo
    DEFAULT_MODEL_FILE = _env_file

# Remote API config
LLM_BASE_URL = os.getenv("MNEMOSYNE_LLM_BASE_URL", "").rstrip("/")
LLM_API_KEY = os.getenv("MNEMOSYNE_LLM_API_KEY", "")
LLM_REMOTE_MODEL = os.getenv("MNEMOSYNE_LLM_MODEL", "")

# Host LLM adapter (Hermes or another agent). Disabled by default to preserve
# existing standalone behavior. When MNEMOSYNE_HOST_LLM_ENABLED=true and a
# backend is registered via mnemosyne.core.llm_backends.set_host_llm_backend(),
# the host backend is consulted before the existing remote/local chain.
# See docs/hermes-llm-integration.md for the full behavior model.
HOST_LLM_ENABLED = os.getenv("MNEMOSYNE_HOST_LLM_ENABLED", "false").lower() in ("1", "true", "yes")
HOST_LLM_PROVIDER = os.getenv("MNEMOSYNE_HOST_LLM_PROVIDER", "").strip() or None
HOST_LLM_MODEL = os.getenv("MNEMOSYNE_HOST_LLM_MODEL", "").strip() or None
HOST_LLM_TIMEOUT = 15.0  # Per-attempt safety cap; not user-facing.
# Host context window: TinyLlama-calibrated LLM_N_CTX (2048) is too small for
# Codex/GPT-class aux models; use this larger budget when the host is the path.
HOST_LLM_N_CTX = int(os.getenv("MNEMOSYNE_HOST_LLM_N_CTX", "32000"))

# --- Lazy singleton ----------------------------------------------------------
# Populated by _load_llm() on first use.
_llm_instance = None
_llm_backend = None  # "llamacpp", "ctransformers", or None
_llm_available = None  # None = not checked yet
#57
#58
#59	def _ensure_sys_path():
#60	"""Ensure /usr/local/lib/python3.11/site-packages is in sys.path
#61	so ctransformers is discoverable when Hermes runs in a venv."""
#62	sp = "/usr/local/lib/python3.11/site-packages"
#63	if sp not in sys.path and os.path.isdir(sp):
#64	sys.path.append(sp)
#65
#66
def _model_path() -> Optional[Path]:
    """Locate the cached GGUF model file.

    Returns:
        ``MODEL_CACHE_DIR / DEFAULT_MODEL_FILE`` when that path exists on
        disk, otherwise ``None`` (the model has not been downloaded).
    """
    gguf_file = MODEL_CACHE_DIR / DEFAULT_MODEL_FILE
    if gguf_file.exists():
        return gguf_file
    return None
#71
#72
def _download_model() -> Path:
    """Return the cached GGUF file, fetching it from HuggingFace when absent.

    Creates ``MODEL_CACHE_DIR`` if needed. When the model file is already
    present it is returned without importing ``huggingface_hub``.

    Returns:
        Path to the local model file.

    Raises:
        RuntimeError: if ``huggingface_hub`` cannot be imported.
    """
    MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    target = MODEL_CACHE_DIR / DEFAULT_MODEL_FILE

    if not target.exists():
        # Imported lazily so the dependency is only needed for a real download.
        try:
            from huggingface_hub import hf_hub_download
        except ImportError:
            raise RuntimeError(
                "huggingface_hub not installed. Run: pip install huggingface-hub"
            )

        fetched = hf_hub_download(
            repo_id=DEFAULT_MODEL_REPO,
            filename=DEFAULT_MODEL_FILE,
            local_dir=str(MODEL_CACHE_DIR),
            local_dir_use_symlinks=False,
        )
        return Path(fetched)

    return target
#94
#95
#96	def _load_llm_llamacpp(model_path: Path):
#97	"""Load the GGUF model via llama-cpp-python. Returns Llama instance or None."""
#98	try:
#99	from llama_cpp import Llama
#100	except ImportError:
#101	return None
#102
#103	try:
#104	llm = Llama(
#105	model_path=str(model_path),
#106	n_ctx=LLM_N_CTX,
#107	n_threads=LLM_N_THREADS,
#108	verbose=False,
#109	)
#110	return llm
#111	except Exception:
#112	return None
#113
#114
def _load_llm_ctransformers(model_path: Path):
    """Load the GGUF model through ctransformers (x86_64 only).

    Calls ``_ensure_sys_path()`` first so a system-wide ctransformers install
    can be imported.

    Args:
        model_path: Location of the GGUF file on disk.

    Returns:
        The loaded ctransformers model, or ``None`` when the package is not
        importable or loading raises for any reason.
    """
    _ensure_sys_path()

    try:
        from ctransformers import AutoModelForCausalLM
    except ImportError:
        return None

    # Best-effort load: any failure means "this backend is unavailable".
    try:
        model = AutoModelForCausalLM.from_pretrained(
            str(model_path),
            model_type="llama",
            max_new_tokens=LLM_MAX_TOKENS,
            threads=LLM_N_THREADS,
            context_length=LLM_N_CTX,
        )
    except Exception:
        return None
    return model
#134
#135
def _load_llm():
    """Lazily load the best available local LLM backend.

    Backends are tried in priority order: llama-cpp-python first, then
    ctransformers (x86_64 fallback). A successful load is cached in the
    module globals ``_llm_instance`` / ``_llm_backend``, and
    ``_llm_available`` records the outcome.

    Returns:
        The loaded model instance, or ``None`` when the LLM is disabled, the
        model file cannot be obtained, or no backend loads.
    """
    global _llm_instance, _llm_backend, _llm_available

    # Already loaded on an earlier call.
    if _llm_instance is not None:
        return _llm_instance

    if not LLM_ENABLED:
        _llm_available = False
        return None

    # Get or download model file.
    model_file = _model_path()
    if model_file is None:
        try:
            model_file = _download_model()
        except Exception:
            _llm_available = False
            return None

    # llama-cpp-python works on ARM64 + x86_64; ctransformers is x86_64 only.
    loaders = (
        ("llamacpp", _load_llm_llamacpp),
        ("ctransformers", _load_llm_ctransformers),
    )
    for backend_name, loader in loaders:
        loaded = loader(model_file)
        if loaded is None:
            continue
        _llm_instance = loaded
        _llm_backend = backend_name
        _llm_available = True
        return _llm_instance

    _llm_available = False
    return None
#178
#179
def _call_local_llm(prompt: str) -> Optional[str]:
    """Run inference on the local LLM using whichever backend is loaded.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        The generated text; ``None`` when no local model is available, the
        llama-cpp response has no choices, or inference raises.
    """
    llm = _load_llm()
    if llm is None:
        return None

    # The chat-template marker is "<|user|>"; a backslash-escaped spelling
    # would never match real model output.
    stop_tokens = ["</s>", "<|user|>"]

    try:
        if _llm_backend == "llamacpp":
            # llama-cpp-python uses the chat completion API.
            response = llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=LLM_MAX_TOKENS,
                stop=stop_tokens,
                temperature=0.3,
            )
            choices = response.get("choices", [])
            if choices:
                return choices[0].get("message", {}).get("content", "")
            return None

        # ctransformers models are called directly.
        return llm(prompt, max_new_tokens=LLM_MAX_TOKENS, stop=stop_tokens)
    except Exception:
        # Best-effort: the caller falls back when inference fails.
        return None
#204
#205
#206	def _build_prompt(memories: List[str], source: str = "") -> str:
#207	"""Build a consolidation prompt from a list of memory strings.
#208
#209	Uses a plain-text instruction format (no special model tokens)
#210	suitable for both local GGUF models and any LLM. For host LLM
#211	calls, use :func:`_build_host_prompt` instead.
#212	"""
#213	header = (
#214	"Summarize the following memories into 1-3 concise sentences. "
#215	"Preserve facts, names, preferences, and decisions. Discard fluff."
#216	)
#217	if source:
#218	header += f" Source: {source}."
#219
#220	lines = "\n".join(f"- {m}" for m in memories if m)
#221	prompt = f"{header}\n\n{lines}\n\nSummary:"
#222	return prompt
#223
#224
#225	def _build_host_prompt(memories: List[str], source: str = "") -> str:
#226	"""Plain-text consolidation prompt for host LLMs (no TinyLlama tokens).
#227
#228	The host adapter wraps this string as the user-message content of a
#229	Chat Completions call; embedding TinyLlama chat-template tokens here
#230	would degrade output quality on every modern aux provider.
#231	"""
#232	header = (
#233	"Summarize the following memories into 1-3 concise sentences. "
#234	"Preserve facts, names, preferences, and decisions. Discard fluff."
#235	)
#236	if source:
#237	header += f" Source: {source}."
#238
#239	lines = "\n".join(f"- {m}" for m in memories if m)
#240	return f"{header}\n\n{lines}"
#241
#242
def _host_backend_will_handle_call() -> bool:
    """Report whether the host backend will be the chosen path for an LLM call.

    Used to pick the right context budget at chunk time (HOST_LLM_N_CTX vs
    LLM_N_CTX) and to short-circuit llm_available() for Hermes-only users.

    Returns:
        ``True`` only when both ``LLM_ENABLED`` and ``HOST_LLM_ENABLED`` are
        set and ``get_host_llm_backend()`` returns a backend. Any error while
        importing or querying the backend registry yields ``False``.
    """
    if not (LLM_ENABLED and HOST_LLM_ENABLED):
        return False

    try:
        from mnemosyne.core.llm_backends import get_host_llm_backend

        backend = get_host_llm_backend()
    except Exception:
        return False
    return backend is not None
#256
#257
def _try_host_llm(
    prompt: str,
    *,
    max_tokens: int,
    temperature: float,
):
    """Attempt the host LLM backend if enabled and registered.

    Returns ``(attempted, text)``:

    - ``(False, None)`` when host is disabled, MNEMOSYNE_LLM_ENABLED is false,
      the backend module cannot be imported, or no backend is registered.
      Caller should proceed with the existing remote/local fallback chain.
    - ``(True, text-or-None)`` when the backend was called. The ``attempted``
      flag is the sentinel callers use to honor the precedence rule: when
      host is enabled and was attempted, the existing MNEMOSYNE_LLM_BASE_URL
      path MUST be skipped on failure; fall straight to local GGUF, then None.

    Exceptions raised by ``call_host_llm`` itself are not caught here.

    See ``docs/hermes-llm-integration.md`` for the full behavior model.
    """
    not_attempted = (False, None)

    if not (LLM_ENABLED and HOST_LLM_ENABLED):
        return not_attempted

    try:
        from mnemosyne.core.llm_backends import call_host_llm, get_host_llm_backend
    except Exception:
        return not_attempted

    if get_host_llm_backend() is None:
        return not_attempted

    raw = call_host_llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        timeout=HOST_LLM_TIMEOUT,
        provider=HOST_LLM_PROVIDER,
        model=HOST_LLM_MODEL,
    )

    # NB: do NOT run host output through _clean_output(): that helper exists
    # to scrub TinyLlama prompt-template echoes and bulleted prompt repeats
    # from local-model output. Host LLMs (Codex/GPT-class) don't echo our
    # prompt format, AND extract_facts() relies on `- bullet` lines surviving
    # so _parse_facts() can consume them. Just trim whitespace.
    text = None
    if isinstance(raw, str):
        trimmed = raw.strip()
        if trimmed:
            text = trimmed
    return (True, text)
#301
#302
#303	def _clean_output(text: str) -> str:
#304	"""Strip assistant tokens and extra whitespace from model output."""
#305	text = text.replace("<\|assistant\|>", "").replace("<\|user\|>", "")
#306	text = text.replace("</s>", "").strip()
#307	text = re.sub(r"^(Summarize the following memories.?[.!?:]\s)", "", text, flags=re.IGNORECASE \| re.DOTALL)
#308	text = re.sub(r"^(Preserve facts.?[.!?:]\s)", "", text, flags=re.IGNORECASE \| re.DOTALL)
#309	text = re.sub(r"^Source:.*?\n", "", text, flags=re.IGNORECASE)
#310	text = re.sub(r"^\s[-]\s.*\n", "", text, flags=re.MULTILINE)
#311	return text.strip()
#312
#313
#314	def _estimate_tokens(text: str) -> int:
#315	"""Rough token count: ~4 chars per token for English, with safety margin."""
#316	return max(1, len(text) // 4)
#317
#318
def _prompt_token_budget() -> int:
    """Return the usable token budget for memory content.

    Starts from the context window (``HOST_LLM_N_CTX`` when the host backend
    will handle the call, otherwise the TinyLlama-calibrated ``LLM_N_CTX``)
    and subtracts a fixed 80-token prompt overhead, ``LLM_MAX_TOKENS`` for
    the model's output, and a 20% safety margin. The result is never below
    64 tokens. Using the larger host window avoids the multi-chunk-summary
    degradation on 128K-context aux providers.
    """
    if _host_backend_will_handle_call():
        context_window = HOST_LLM_N_CTX
    else:
        context_window = LLM_N_CTX

    prompt_overhead = 80
    reserved_for_output = LLM_MAX_TOKENS
    margin = int(context_window * 0.2)

    usable = context_window - prompt_overhead - reserved_for_output - margin
    return max(64, usable)
#331
#332
def chunk_memories_by_budget(memories: List[str], source: str = "") -> List[List[str]]:
    """Split memories into chunks that fit within the LLM context window.

    Memories are packed greedily, in order, into chunks whose estimated
    token cost stays within the prompt budget minus the instruction header.
    A memory whose own estimate exceeds the whole budget is skipped.

    Args:
        memories: Memory strings to partition.
        source: Optional label; only used to size the header it would add.

    Returns:
        A list of chunks (each a list of memory strings); empty when
        *memories* is empty or every memory was skipped.
    """
    if not memories:
        return []

    budget = _prompt_token_budget()

    # Reproduce the prompt header so its token cost can be reserved.
    instruction = (
        "Summarize the following memories into 1-3 concise sentences. "
        "Preserve facts, names, preferences, and decisions. Discard fluff."
    )
    if source:
        instruction += f" Source: {source}."
    room_for_memories = budget - _estimate_tokens(instruction + "\n\n")

    # Per-memory cost of the "- " bullet prefix and trailing newline.
    bullet_cost = _estimate_tokens("- \n")

    result: List[List[str]] = []
    pending: List[str] = []
    pending_cost = 0

    for item in memories:
        item_cost = _estimate_tokens(item) + bullet_cost

        # NOTE(review): the oversize check compares against the full budget
        # while the flush check below uses the header-adjusted figure; confirm
        # this asymmetry is intended.
        if item_cost > budget:
            continue

        # Close the current chunk when this memory would overflow it.
        if pending and pending_cost + item_cost > room_for_memories:
            result.append(pending)
            pending = []
            pending_cost = 0

        pending.append(item)
        pending_cost += item_cost

    if pending:
        result.append(pending)

    return result
#369
#370
def llm_available() -> bool:
    """Check whether any LLM backend (host, remote, or local) is available.

    Returns True for Hermes-only users (no MNEMOSYNE_LLM_BASE_URL, no local
    GGUF) as long as a host backend is registered and enabled; otherwise
    sleep would skip ``summarize_memories()`` before the host path could run.

    The local check is cached in ``_llm_available``; the first call that
    reaches it triggers ``_load_llm()``.
    """
    global _llm_available

    # 0. Host backend (if a host is registered and the user opted in).
    host_ready = _host_backend_will_handle_call()
    # 1. Remote API: only consider it when LLM is globally enabled.
    if host_ready or (LLM_ENABLED and LLM_BASE_URL):
        return True

    # 2. Local model: probe once, then reuse the cached answer.
    if _llm_available is None:
        _load_llm()
        return bool(_llm_available)
    return _llm_available
#389
#390
def _call_remote_llm(prompt: str, temperature: float = 0.3) -> Optional[str]:
    """Call an OpenAI-compatible remote endpoint for summarization.

    Posts a single-message chat completion to
    ``{LLM_BASE_URL}/chat/completions`` using ``httpx`` when it is
    installed, otherwise ``urllib.request``. Both paths use a 60-second
    timeout.

    Args:
        prompt: Text sent as the user message.
        temperature: Sampling temperature. Defaults to 0.3 (paraphrase-safe
            for consolidation); callers that need deterministic output
            (e.g., fact extraction) can pass ``temperature=0.0``.

    Returns:
        The first choice's message content, or ``None`` when no base URL is
        configured, the response has no usable content, or the request fails.
    """
    if not LLM_BASE_URL:
        return None

    import json

    try:
        import httpx
        has_httpx = True
    except ImportError:
        has_httpx = False

    url = f"{LLM_BASE_URL}/chat/completions"
    headers = {"Content-Type": "application/json"}
    if LLM_API_KEY:
        headers["Authorization"] = f"Bearer {LLM_API_KEY}"

    payload = {
        # "local" is the placeholder model name when none is configured.
        "model": LLM_REMOTE_MODEL or "local",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": LLM_MAX_TOKENS,
        "temperature": temperature,
        # The chat-template marker is "<|user|>"; a backslash-escaped
        # spelling would never match real model output.
        "stop": ["</s>", "<|user|>"],
    }

    try:
        if has_httpx:
            with httpx.Client(timeout=60.0) as client:
                response = client.post(url, json=payload, headers=headers)
                response.raise_for_status()
                data = response.json()
        else:
            import urllib.request

            req = urllib.request.Request(
                url,
                data=json.dumps(payload).encode(),
                headers=headers,
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=60.0) as resp:
                data = json.loads(resp.read().decode())

        choices = data.get("choices", [])
        if choices and choices[0].get("message", {}).get("content"):
            return choices[0]["message"]["content"]
        return None
    except Exception:
        # Best-effort: the caller falls back to the local model on failure.
        return None
#447
#448
def summarize_memories(memories: List[str], source: str = "") -> Optional[str]:
    """Summarize a batch of working-memory items into a single episodic string.

    Fallback chain:

    0. Host-provided LLM backend, only if MNEMOSYNE_HOST_LLM_ENABLED=true,
       MNEMOSYNE_LLM_ENABLED=true, AND a backend is registered. When this
       path is attempted but produces no usable text, the existing remote
       URL is skipped and the call falls through to local GGUF, then None.
       This prevents accidentally routing memory content to a stale
       MNEMOSYNE_LLM_BASE_URL the user forgot to clear.
    1. Remote OpenAI-compatible API (if MNEMOSYNE_LLM_BASE_URL is set,
       MNEMOSYNE_LLM_ENABLED is not false, and MNEMOSYNE_FORCE_LOCAL is
       not truthy).
    2. llama-cpp-python (ARM64 + x86_64 native).
    3. ctransformers (x86_64 only, legacy).
    4. Return None, in which case the caller falls back to AAAK encoding.

    Large inputs are split with ``chunk_memories_by_budget()``; each chunk is
    summarized on its own and, when more than one summary results, a second
    pass consolidates them.

    Args:
        memories: Memory strings to summarize.
        source: Optional label included in the prompt header.

    Returns:
        The summary text, or ``None`` when *memories* is empty or no backend
        produced usable output.
    """
    if not memories:
        return None

    # Chunk large memory lists to stay within context window limits.
    # chunk_memories_by_budget() respects the context size and safety margins.
    chunks = chunk_memories_by_budget(memories, source=source)

    def _local_summary(prompt: str) -> Optional[str]:
        """Run the local model and return its cleaned output, or None."""
        raw = _call_local_llm(prompt)
        if raw:
            cleaned = _clean_output(raw)
            return cleaned if cleaned else None
        return None

    def _summarize_chunk(chunk_memories: List[str], chunk_source: str = "") -> Optional[str]:
        """Summarize a single chunk of memories via the fallback chain."""
        host_prompt = _build_host_prompt(chunk_memories, source=chunk_source)
        prompt = _build_prompt(chunk_memories, source=chunk_source)

        # 0. Host backend. Once attempted, the remote URL is never consulted.
        attempted, text = _try_host_llm(
            host_prompt, max_tokens=LLM_MAX_TOKENS, temperature=0.3
        )
        if attempted:
            if text:
                return text
            return _local_summary(prompt)

        # 1. Remote API (skip if MNEMOSYNE_FORCE_LOCAL=1 or remote call fails).
        if LLM_ENABLED and LLM_BASE_URL:
            force_local = os.environ.get("MNEMOSYNE_FORCE_LOCAL", "").lower()
            if force_local not in ("1", "true", "yes"):
                raw = _call_remote_llm(prompt)
                if raw:
                    cleaned = _clean_output(raw)
                    return cleaned if cleaned else None

        # 2. Local LLM (llama-cpp-python or ctransformers fallback).
        return _local_summary(prompt)

    # Summarize each chunk individually, keeping only non-empty results.
    chunk_summaries = []
    for chunk in chunks:
        summary = _summarize_chunk(chunk, chunk_source=source)
        if summary:
            chunk_summaries.append(summary)

    if not chunk_summaries:
        return None

    # If multiple chunks, do a second-pass summary to consolidate chunk summaries.
    if len(chunk_summaries) > 1:
        # The helper's keyword is ``chunk_source``; passing ``source=`` here
        # would raise TypeError on every multi-chunk input.
        final = _summarize_chunk(
            chunk_summaries,
            chunk_source=f"{source} [chunked {len(chunks)} parts]",
        )
        return final if final else chunk_summaries[0]

    return chunk_summaries[0]
#519

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public