repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
"""
Mnemosyne Local LLM Consolidation
=================================
Lightweight on-device summarization for the sleep/consolidation cycle.
Uses llama-cpp-python (ARM64 + x86_64 native) with ctransformers fallback.
Falls back to AAAK encoding if the model is unavailable or inference fails.

Model cache: ~/.hermes/mnemosyne/models/
Default model: TinyLlama-1.1B-Chat-v1.0-GGUF (Q4_K_M, ~600MB)
"""
| #11 | |
| #12 | import os |
| #13 | import sys |
| #14 | import re |
| #15 | from pathlib import Path |
| #16 | from typing import List, Optional |
| #17 | |
| #18 | # --- Config ------------------------------------------------------------------ |
| #19 | DEFAULT_MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" |
| #20 | DEFAULT_MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" |
| #21 | MODEL_CACHE_DIR = Path.home() / ".hermes" / "mnemosyne" / "models" |
| #22 | |
| #23 | LLM_ENABLED = os.environ.get("MNEMOSYNE_LLM_ENABLED", "true").lower() in ("1", "true", "yes") |
| #24 | LLM_MAX_TOKENS = int(os.environ.get("MNEMOSYNE_LLM_MAX_TOKENS", "256")) |
| #25 | LLM_N_THREADS = int(os.environ.get("MNEMOSYNE_LLM_N_THREADS", "4")) |
| #26 | LLM_N_CTX = int(os.environ.get("MNEMOSYNE_LLM_N_CTX", "2048")) |
| #27 | |
| #28 | # Override model via env |
| #29 | _env_repo = os.environ.get("MNEMOSYNE_LLM_REPO") |
| #30 | _env_file = os.environ.get("MNEMOSYNE_LLM_FILE") |
| #31 | if _env_repo and _env_file: |
| #32 | DEFAULT_MODEL_REPO = _env_repo |
| #33 | DEFAULT_MODEL_FILE = _env_file |
| #34 | |
| #35 | # Remote API config |
| #36 | LLM_BASE_URL = os.environ.get("MNEMOSYNE_LLM_BASE_URL", "").rstrip("/") |
| #37 | LLM_API_KEY = os.environ.get("MNEMOSYNE_LLM_API_KEY", "") |
| #38 | LLM_REMOTE_MODEL = os.environ.get("MNEMOSYNE_LLM_MODEL", "") |
| #39 | |
| #40 | # Host LLM adapter (Hermes or another agent). Disabled by default to preserve |
| #41 | # existing standalone behavior. When MNEMOSYNE_HOST_LLM_ENABLED=true and a |
| #42 | # backend is registered via mnemosyne.core.llm_backends.set_host_llm_backend(), |
| #43 | # the host backend is consulted before the existing remote/local chain. |
| #44 | # See docs/hermes-llm-integration.md for the full behavior model. |
| #45 | HOST_LLM_ENABLED = os.environ.get("MNEMOSYNE_HOST_LLM_ENABLED", "false").lower() in ("1", "true", "yes") |
| #46 | HOST_LLM_PROVIDER = os.environ.get("MNEMOSYNE_HOST_LLM_PROVIDER", "").strip() or None |
| #47 | HOST_LLM_MODEL = os.environ.get("MNEMOSYNE_HOST_LLM_MODEL", "").strip() or None |
| #48 | HOST_LLM_TIMEOUT = 15.0 # Per-attempt safety cap; not user-facing. |
| #49 | # Host context window: TinyLlama-calibrated LLM_N_CTX (2048) is too small for |
| #50 | # Codex/GPT-class aux models; use this larger budget when the host is the path. |
| #51 | HOST_LLM_N_CTX = int(os.environ.get("MNEMOSYNE_HOST_LLM_N_CTX", "32000")) |
| #52 | |
| #53 | # --- Lazy singleton ---------------------------------------------------------- |
| #54 | _llm_instance = None |
| #55 | _llm_backend = None # "llamacpp", "ctransformers", or None |
| #56 | _llm_available = None # None = not checked yet |
| #57 | |
| #58 | |
| #59 | def _ensure_sys_path(): |
| #60 | """Ensure /usr/local/lib/python3.11/site-packages is in sys.path |
| #61 | so ctransformers is discoverable when Hermes runs in a venv.""" |
| #62 | sp = "/usr/local/lib/python3.11/site-packages" |
| #63 | if sp not in sys.path and os.path.isdir(sp): |
| #64 | sys.path.append(sp) |
| #65 | |
| #66 | |
def _model_path() -> Optional[Path]:
    """Locate the GGUF model inside the cache directory.

    Returns:
        ``MODEL_CACHE_DIR / DEFAULT_MODEL_FILE`` when that path exists,
        otherwise None.
    """
    expected = MODEL_CACHE_DIR / DEFAULT_MODEL_FILE
    if expected.exists():
        return expected
    return None
| #71 | |
| #72 | |
def _download_model() -> Path:
    """Return the cached GGUF file, fetching it from HuggingFace when absent.

    The cache directory is created if necessary.

    Raises:
        RuntimeError: huggingface_hub is not installed and a download is
            required.
    """
    MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    target = MODEL_CACHE_DIR / DEFAULT_MODEL_FILE
    if not target.exists():
        # Import lazily so the dependency is only needed for a real download.
        try:
            from huggingface_hub import hf_hub_download
        except ImportError:
            raise RuntimeError(
                "huggingface_hub not installed. Run: pip install huggingface-hub"
            )

        fetched = hf_hub_download(
            repo_id=DEFAULT_MODEL_REPO,
            filename=DEFAULT_MODEL_FILE,
            local_dir=str(MODEL_CACHE_DIR),
            local_dir_use_symlinks=False,
        )
        return Path(fetched)
    return target
| #94 | |
| #95 | |
| #96 | def _load_llm_llamacpp(model_path: Path): |
| #97 | """Load the GGUF model via llama-cpp-python. Returns Llama instance or None.""" |
| #98 | try: |
| #99 | from llama_cpp import Llama |
| #100 | except ImportError: |
| #101 | return None |
| #102 | |
| #103 | try: |
| #104 | llm = Llama( |
| #105 | model_path=str(model_path), |
| #106 | n_ctx=LLM_N_CTX, |
| #107 | n_threads=LLM_N_THREADS, |
| #108 | verbose=False, |
| #109 | ) |
| #110 | return llm |
| #111 | except Exception: |
| #112 | return None |
| #113 | |
| #114 | |
def _load_llm_ctransformers(model_path: Path):
    """Try to open *model_path* with ctransformers (x86_64 only).

    Returns:
        The loaded model, or None when ctransformers cannot be imported or
        loading raises.
    """
    # Make a system-wide ctransformers install visible before importing it.
    _ensure_sys_path()

    try:
        from ctransformers import AutoModelForCausalLM
    except ImportError:
        return None

    try:
        model = AutoModelForCausalLM.from_pretrained(
            str(model_path),
            model_type="llama",
            max_new_tokens=LLM_MAX_TOKENS,
            threads=LLM_N_THREADS,
            context_length=LLM_N_CTX,
        )
    except Exception:
        # Any load failure means "backend unavailable" to the caller.
        return None
    return model
| #134 | |
| #135 | |
def _load_llm():
    """Return the process-wide local LLM, loading it on first use.

    Backends are tried in order: llama-cpp-python, then ctransformers.
    The module globals ``_llm_instance``, ``_llm_backend`` and
    ``_llm_available`` record the outcome.

    Returns:
        The loaded model object, or None when the LLM is disabled, the model
        file cannot be obtained, or neither backend can load it.
    """
    global _llm_instance, _llm_backend, _llm_available

    if _llm_instance is not None:
        return _llm_instance

    if not LLM_ENABLED:
        _llm_available = False
        return None

    # Use the cached model file, downloading it only when it is missing.
    model_file = _model_path()
    if model_file is None:
        try:
            model_file = _download_model()
        except Exception:
            _llm_available = False
            return None

    loaders = (
        ("llamacpp", _load_llm_llamacpp),            # ARM64 + x86_64
        ("ctransformers", _load_llm_ctransformers),  # x86_64 only
    )
    for backend_name, loader in loaders:
        candidate = loader(model_file)
        if candidate is not None:
            _llm_instance = candidate
            _llm_backend = backend_name
            _llm_available = True
            return _llm_instance

    _llm_available = False
    return None
| #178 | |
| #179 | |
def _call_local_llm(prompt: str) -> Optional[str]:
    """Generate a completion for *prompt* with the loaded local backend.

    Returns:
        The generated text, or None when no local model is available, the
        llama-cpp response has no choices, or inference raises.
    """
    model = _load_llm()
    if model is None:
        return None

    try:
        if _llm_backend != "llamacpp":
            # ctransformers models are called directly with the prompt.
            return model(prompt, max_new_tokens=LLM_MAX_TOKENS, stop=["</s>", "<|user|>"])

        # llama-cpp-python goes through its chat-completion API.
        reply = model.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=LLM_MAX_TOKENS,
            stop=["</s>", "<|user|>"],
            temperature=0.3,
        )
        choices = reply.get("choices", [])
        if not choices:
            return None
        return choices[0].get("message", {}).get("content", "")
    except Exception:
        return None
| #204 | |
| #205 | |
| #206 | def _build_prompt(memories: List[str], source: str = "") -> str: |
| #207 | """Build a consolidation prompt from a list of memory strings. |
| #208 | |
| #209 | Uses a plain-text instruction format (no special model tokens) |
| #210 | suitable for both local GGUF models and any LLM. For host LLM |
| #211 | calls, use :func:`_build_host_prompt` instead. |
| #212 | """ |
| #213 | header = ( |
| #214 | "Summarize the following memories into 1-3 concise sentences. " |
| #215 | "Preserve facts, names, preferences, and decisions. Discard fluff." |
| #216 | ) |
| #217 | if source: |
| #218 | header += f" Source: {source}." |
| #219 | |
| #220 | lines = "\n".join(f"- {m}" for m in memories if m) |
| #221 | prompt = f"{header}\n\n{lines}\n\nSummary:" |
| #222 | return prompt |
| #223 | |
| #224 | |
| #225 | def _build_host_prompt(memories: List[str], source: str = "") -> str: |
| #226 | """Plain-text consolidation prompt for host LLMs (no TinyLlama tokens). |
| #227 | |
| #228 | The host adapter wraps this string as the user-message content of a |
| #229 | Chat Completions call; embedding TinyLlama chat-template tokens here |
| #230 | would degrade output quality on every modern aux provider. |
| #231 | """ |
| #232 | header = ( |
| #233 | "Summarize the following memories into 1-3 concise sentences. " |
| #234 | "Preserve facts, names, preferences, and decisions. Discard fluff." |
| #235 | ) |
| #236 | if source: |
| #237 | header += f" Source: {source}." |
| #238 | |
| #239 | lines = "\n".join(f"- {m}" for m in memories if m) |
| #240 | return f"{header}\n\n{lines}" |
| #241 | |
| #242 | |
def _host_backend_will_handle_call() -> bool:
    """Report whether an LLM call would be routed to the host backend.

    True only when both MNEMOSYNE_LLM_ENABLED and MNEMOSYNE_HOST_LLM_ENABLED
    are on and a host backend is registered. Used to choose the context
    budget (HOST_LLM_N_CTX vs LLM_N_CTX) and to short-circuit
    llm_available() for Hermes-only users.
    """
    if not (LLM_ENABLED and HOST_LLM_ENABLED):
        return False
    try:
        from mnemosyne.core.llm_backends import get_host_llm_backend
        backend = get_host_llm_backend()
    except Exception:
        # A missing or failing backend module counts as "no host".
        return False
    return backend is not None
| #256 | |
| #257 | |
def _try_host_llm(
    prompt: str,
    *,
    max_tokens: int,
    temperature: float,
):
    """Call the host LLM backend when it is enabled and registered.

    Returns an ``(attempted, text)`` pair:

    - ``(False, None)``: host is disabled, MNEMOSYNE_LLM_ENABLED is false,
      the backend module cannot be imported, or no backend is registered.
      The caller should continue with the remote/local fallback chain.
    - ``(True, text-or-None)``: the backend was called. ``attempted`` is the
      sentinel callers use to honor the precedence rule: once the host was
      attempted, the MNEMOSYNE_LLM_BASE_URL path MUST be skipped on failure;
      fall straight to local GGUF, then None.

    See ``docs/hermes-llm-integration.md`` for the full behavior model.
    """
    not_attempted = (False, None)

    if not (LLM_ENABLED and HOST_LLM_ENABLED):
        return not_attempted
    try:
        from mnemosyne.core.llm_backends import call_host_llm, get_host_llm_backend
    except Exception:
        return not_attempted
    if get_host_llm_backend() is None:
        return not_attempted

    raw = call_host_llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        timeout=HOST_LLM_TIMEOUT,
        provider=HOST_LLM_PROVIDER,
        model=HOST_LLM_MODEL,
    )

    # NB: do NOT run host output through _clean_output(): that helper exists
    # to scrub TinyLlama prompt-template echoes and bulleted prompt repeats
    # from local-model output. Host LLMs (Codex/GPT-class) don't echo our
    # prompt format, AND extract_facts() relies on `- bullet` lines surviving
    # so _parse_facts() can consume them. Just trim whitespace.
    text = None
    if isinstance(raw, str):
        trimmed = raw.strip()
        if trimmed:
            text = trimmed
    return (True, text)
| #301 | |
| #302 | |
| #303 | def _clean_output(text: str) -> str: |
| #304 | """Strip assistant tokens and extra whitespace from model output.""" |
| #305 | text = text.replace("<|assistant|>", "").replace("<|user|>", "") |
| #306 | text = text.replace("</s>", "").strip() |
| #307 | text = re.sub(r"^(Summarize the following memories.*?[.!?:]\s*)", "", text, flags=re.IGNORECASE | re.DOTALL) |
| #308 | text = re.sub(r"^(Preserve facts.*?[.!?:]\s*)", "", text, flags=re.IGNORECASE | re.DOTALL) |
| #309 | text = re.sub(r"^Source:.*?\n", "", text, flags=re.IGNORECASE) |
| #310 | text = re.sub(r"^\s*[-*]\s.*\n", "", text, flags=re.MULTILINE) |
| #311 | return text.strip() |
| #312 | |
| #313 | |
| #314 | def _estimate_tokens(text: str) -> int: |
| #315 | """Rough token count: ~4 chars per token for English, with safety margin.""" |
| #316 | return max(1, len(text) // 4) |
| #317 | |
| #318 | |
def _prompt_token_budget() -> int:
    """Compute how many tokens of memory content fit in one prompt.

    The context window is HOST_LLM_N_CTX when the host backend will handle
    the call, otherwise the TinyLlama-calibrated LLM_N_CTX. From it are
    subtracted a fixed 80-token overhead, LLM_MAX_TOKENS reserved for the
    output, and a 20% safety margin. The result is floored at 64.
    """
    window = HOST_LLM_N_CTX if _host_backend_will_handle_call() else LLM_N_CTX
    reserved = 80 + LLM_MAX_TOKENS + int(window * 0.2)
    return max(64, window - reserved)
| #331 | |
| #332 | |
def chunk_memories_by_budget(memories: List[str], source: str = "") -> List[List[str]]:
    """Group *memories* into consecutive batches sized for one LLM prompt.

    Each memory is costed with ``_estimate_tokens`` plus bullet formatting.
    A batch is closed once adding the next memory would exceed the prompt
    budget minus the instruction header. A single memory whose cost exceeds
    the whole budget is skipped.

    Returns:
        A list of batches in input order; empty when *memories* is empty.
    """
    if not memories:
        return []

    total_budget = _prompt_token_budget()

    # Same instruction text the prompt builders emit, costed once up front.
    instruction = (
        "Summarize the following memories into 1-3 concise sentences. "
        "Preserve facts, names, preferences, and decisions. Discard fluff."
    )
    if source:
        instruction += f" Source: {source}."
    room = total_budget - _estimate_tokens(instruction + "\n\n")
    per_item_overhead = _estimate_tokens("- \n")

    batches = []
    batch = []
    used = 0
    for item in memories:
        cost = _estimate_tokens(item) + per_item_overhead
        if cost > total_budget:
            # Too large to fit in any prompt on its own.
            continue
        if batch and used + cost > room:
            batches.append(batch)
            batch, used = [], 0
        batch.append(item)
        used += cost

    if batch:
        batches.append(batch)
    return batches
| #369 | |
| #370 | |
def llm_available() -> bool:
    """Tell whether any LLM path (host, remote, or local) can be used.

    Checked in order: a registered and enabled host backend; a configured
    remote URL with the LLM globally enabled; then the local model, whose
    availability is cached in ``_llm_available`` after the first load
    attempt. The host check comes first so Hermes-only users (no remote URL,
    no local GGUF) are not skipped before ``summarize_memories()`` can run.
    """
    if _host_backend_will_handle_call():
        return True
    if LLM_ENABLED and LLM_BASE_URL:
        return True
    if _llm_available is None:
        # First query: attempt a load, which records the outcome.
        _load_llm()
        return bool(_llm_available)
    return _llm_available
| #389 | |
| #390 | |
def _call_remote_llm(prompt: str, temperature: float = 0.3) -> Optional[str]:
    """POST *prompt* to the configured OpenAI-compatible chat endpoint.

    Uses httpx when it is importable and urllib otherwise, with a 60 second
    timeout either way.

    Args:
        prompt: User-message content to send.
        temperature: Sampling temperature. Defaults to 0.3 (paraphrase-safe
            for consolidation); pass 0.0 for deterministic output such as
            fact extraction.

    Returns:
        The first choice's message content, or None when no base URL is
        configured, the request fails, or the response has no content.
    """
    if not LLM_BASE_URL:
        return None

    import json

    try:
        import httpx
    except ImportError:
        httpx = None

    endpoint = f"{LLM_BASE_URL}/chat/completions"
    request_headers = {"Content-Type": "application/json"}
    if LLM_API_KEY:
        request_headers["Authorization"] = f"Bearer {LLM_API_KEY}"

    body = {
        "model": LLM_REMOTE_MODEL or "local",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": LLM_MAX_TOKENS,
        "temperature": temperature,
        "stop": ["</s>", "<|user|>"],
    }

    try:
        if httpx is not None:
            with httpx.Client(timeout=60.0) as client:
                reply = client.post(endpoint, json=body, headers=request_headers)
                reply.raise_for_status()
                data = reply.json()
        else:
            import urllib.request

            request = urllib.request.Request(
                endpoint,
                data=json.dumps(body).encode(),
                headers=request_headers,
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=60.0) as resp:
                data = json.loads(resp.read().decode())

        choices = data.get("choices", [])
        if not choices:
            return None
        content = choices[0].get("message", {}).get("content")
        return content if content else None
    except Exception:
        # Network, HTTP and parse errors all mean "remote path unavailable".
        return None
| #447 | |
| #448 | |
def summarize_memories(memories: List[str], source: str = "") -> Optional[str]:
    """Summarize a batch of working-memory items into a single episodic string.

    Memories are first split into context-sized chunks. Each chunk is
    summarized on its own; when there is more than one chunk summary, a
    second pass consolidates them (falling back to the first chunk summary
    if that pass yields nothing).

    Fallback chain per chunk:

    0. Host-provided LLM backend, only if MNEMOSYNE_HOST_LLM_ENABLED=true,
       MNEMOSYNE_LLM_ENABLED=true, AND a backend is registered. When this
       path is attempted but produces no usable text, the existing remote
       URL is **skipped** — falls through to local GGUF, then None. This
       prevents accidentally routing memory content to a stale
       MNEMOSYNE_LLM_BASE_URL the user forgot to clear.
    1. Remote OpenAI-compatible API (if MNEMOSYNE_LLM_BASE_URL is set,
       MNEMOSYNE_LLM_ENABLED is not false, and MNEMOSYNE_FORCE_LOCAL is
       not set to 1/true/yes).
    2. llama-cpp-python (ARM64 + x86_64 native).
    3. ctransformers (x86_64 only, legacy).
    4. Return None → caller falls back to AAAK encoding.

    Args:
        memories: Memory strings to consolidate.
        source: Optional label added to the prompt as ``Source: ...``.

    Returns:
        The summary text, or None when *memories* is empty or no backend
        produced usable output.
    """
    if not memories:
        return None

    # Chunk large memory lists to stay within context window limits.
    chunks = chunk_memories_by_budget(memories, source=source)

    def _cleaned(raw: Optional[str]) -> Optional[str]:
        """Scrub local/remote model output; None when nothing usable remains."""
        if not raw:
            return None
        return _clean_output(raw) or None

    def _summarize_chunk(chunk_memories: List[str], chunk_source: str = "") -> Optional[str]:
        """Summarize a single chunk of memories via the fallback chain."""
        host_prompt = _build_host_prompt(chunk_memories, source=chunk_source)
        prompt = _build_prompt(chunk_memories, source=chunk_source)

        # 0. Host backend. Once attempted, the remote URL is never consulted.
        attempted, text = _try_host_llm(host_prompt, max_tokens=LLM_MAX_TOKENS, temperature=0.3)
        if attempted:
            if text:
                return text
            return _cleaned(_call_local_llm(prompt))

        # 1. Remote API (skipped when MNEMOSYNE_FORCE_LOCAL is set).
        force_local = os.environ.get("MNEMOSYNE_FORCE_LOCAL", "").lower() in ("1", "true", "yes")
        if LLM_ENABLED and LLM_BASE_URL and not force_local:
            raw = _call_remote_llm(prompt)
            if raw:
                # A remote reply is final even if cleaning leaves nothing.
                return _cleaned(raw)

        # 2. Local LLM (llama-cpp-python or ctransformers fallback).
        return _cleaned(_call_local_llm(prompt))

    # Summarize each chunk individually, keeping only non-empty results.
    chunk_summaries = []
    for chunk in chunks:
        summary = _summarize_chunk(chunk, chunk_source=source)
        if summary:
            chunk_summaries.append(summary)

    if not chunk_summaries:
        return None

    # If multiple chunks, do a second-pass summary to consolidate them.
    if len(chunk_summaries) > 1:
        # The keyword must be ``chunk_source``: passing ``source=`` here
        # raised TypeError for every multi-chunk input.
        final = _summarize_chunk(
            chunk_summaries,
            chunk_source=f"{source} [chunked {len(chunks)} parts]",
        )
        return final if final else chunk_summaries[0]

    return chunk_summaries[0]
| #519 |