benmaster82-Kwipu

repository

loading code, commits, and activity

repositories

loading repo index

import os
import sys

# Force global UTF-8 mode on Windows.
# PYTHONUTF8 is only honoured at interpreter start-up, so when it is not set
# the script re-launches itself in a child process with the variable set and
# exits with the child's return code.
if sys.platform == "win32" and os.environ.get("PYTHONUTF8") != "1":
    import subprocess

    os.environ["PYTHONUTF8"] = "1"
    result = subprocess.run([sys.executable] + sys.argv, env=os.environ)
    sys.exit(result.returncode)
#11
import hashlib
import io
import math
import re
import time
import yaml
import logging
import threading
import asyncio
from pathlib import Path
from collections import defaultdict, Counter

# Fix for Windows: avoid "Event loop is closed" with ProactorEventLoop
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
#27
import nest_asyncio

# Apply nest_asyncio only when needed (conflicts with uvicorn)
_NEST_ASYNCIO_APPLIED = False


def _ensure_nest_asyncio():
    """Apply ``nest_asyncio`` at most once per process.

    The module-level flag records that the patch was applied so repeated
    calls are no-ops.
    """
    global _NEST_ASYNCIO_APPLIED
    if not _NEST_ASYNCIO_APPLIED:
        nest_asyncio.apply()
        _NEST_ASYNCIO_APPLIED = True
#39
# Force UTF-8 on stdout/stderr.
# Prefer reconfiguring the existing text streams in place: building a second
# TextIOWrapper over the same underlying buffer leaves two independently
# buffered writers on one file descriptor. Wrapping is kept as a fallback for
# stream objects that expose ``.buffer`` but not ``.reconfigure``.
for _stream_name in ("stdout", "stderr"):
    try:
        _stream = getattr(sys, _stream_name)
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(
                encoding="utf-8", errors="replace", line_buffering=True
            )
        else:
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(
                    _stream.buffer,
                    encoding="utf-8",
                    errors="replace",
                    line_buffering=True,
                ),
            )
    except Exception:
        # Best effort only: a stream that cannot be adjusted (replaced by an
        # IDE, closed, missing ``.buffer``) is left untouched.
        pass
#50
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

from llama_index.core import (
    PropertyGraphIndex,
    StorageContext,
    SimpleDirectoryReader,
    Settings,
    PromptTemplate,
    load_index_from_storage,
)
from llama_index.core.indices.property_graph.transformations import (
    SimpleLLMPathExtractor,
    ImplicitPathExtractor,
)
from llama_index.core.indices.property_graph import (
    LLMSynonymRetriever,
    VectorContextRetriever,
    CustomPGRetriever,
)
from llama_index.core.schema import NodeWithScore, TextNode, Document
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Multilingual module
from lang_config import (
    tokenize,
    detect_language,
    extract_date_tokens,
    infer_relation,
    ALL_TEMPORAL_KEYWORDS,
    FALLBACK_RELATION,
)
#84
#85	# ==========================================
#86	# CONFIGURATION
#87	# ==========================================
# Default Ollama model names and on-disk locations.
MODEL_NAME = "gpt-oss:20b-cloud"
EMBED_MODEL = "nomic-embed-text"
KNOWLEDGE_DIR = "./knowledge_base"
STORAGE_DIR = "./storage_graph"

# File-watcher settings: quiet period before pending events are processed,
# and the file extensions that are considered part of the knowledge base.
WATCHER_DEBOUNCE_SECONDS = 5
WATCHER_VALID_EXTENSIONS = {".md", ".txt", ".pdf", ".docx"}

logging.basicConfig(level=logging.ERROR)
#97
#98
def _init_llm(model_name: str = MODEL_NAME, embed_model: str = EMBED_MODEL):
    """Initialize LLM and embedding model.

    Called from main() to avoid side effects on import. Configures the global
    llama_index ``Settings`` object.

    Args:
        model_name: Ollama model used for generation.
        embed_model: Ollama model used for embeddings.
    """
    Settings.llm = Ollama(
        model=model_name,
        request_timeout=300.0,
        base_url="http://localhost:11434",
    )
    Settings.embed_model = OllamaEmbedding(model_name=embed_model)

    # Chunking: large chunks to avoid splitting small notes
    Settings.chunk_size = 2048
    Settings.chunk_overlap = 256
#111
#112	# ==========================================
#113	# SYSTEM PROMPT (multilingual)
#114	# ==========================================
# Question-answering prompt. ``{context_str}`` and ``{query_str}`` are the
# placeholders filled in by the query engine.
SYSTEM_PROMPT = (
    "You are the research assistant of Geode Graph. Your task is to answer questions "
    "based exclusively on the provided context from the user's knowledge base.\n\n"
    "RULES:\n"
    "1. Use ONLY information explicitly stated in the context below. Never invent or assume facts.\n"
    "2. Be concise but complete: include every relevant fact found in the context. "
    "Do not omit cited information that answers the question.\n"
    "3. If the answer involves multiple files, state which files are involved and "
    "quote the connecting fact from each.\n"
    "4. Always cite source file names in square brackets (e.g. [document.md]).\n"
    "5. When quoting actions or facts, preserve the original meaning. "
    "If a document says someone WILL do something, report it as a future action.\n"
    "6. When the user asks what to do BEFORE an event, include ALL tasks, preparations, "
    "and actions related to that event.\n"
    "7. If you cannot find the answer, say: 'I don't have enough information in your local files.'\n"
    "8. If unsure about a detail, omit it rather than guessing.\n"
    "9. ALWAYS respond in the same language as the user's question.\n\n"
    "CONTEXT:\n"
    "{context_str}\n\n"
    "QUESTION: {query_str}"
)

qa_template = PromptTemplate(SYSTEM_PROMPT)
#138
#139
#140	# ==========================================
#141	# SAFE PRINT (Rich-powered)
#142	# ==========================================
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.text import Text
from rich.status import Status

# Shared Rich console used by safe_print().
console = Console()
#150
#151
def safe_print(*args, **kwargs):
    """Print through the Rich console, degrading to plain ``print`` on error.

    Positional and keyword arguments are forwarded unchanged to
    ``console.print``. If that raises, the positional arguments are converted
    with ``str`` and written with the built-in ``print`` (keyword arguments
    are dropped in the fallback). If the fallback fails as well, the message
    is discarded: printing is never allowed to raise.
    """
    try:
        console.print(*args, **kwargs)
    except Exception:
        try:
            print(*[str(a) for a in args])
        except Exception:
            # Deliberate best effort: output problems must not crash callers.
            pass
#160
#161
#162	# ==========================================
#163	# BM25 CHUNK RETRIEVER
#164	# ==========================================
class BM25ChunkRetriever(CustomPGRetriever):
    """Multilingual BM25-like retriever that searches text chunks in the graph."""

    # BM25 term-frequency saturation (K1) and length-normalisation (B)
    # parameters, and the maximum number of results returned per query.
    K1 = 1.5
    B = 0.75
    TOP_K = 8

    def init(self, **kwargs):
        """Create empty corpus statistics.

        NOTE(review): ``init`` (not ``__init__``) looks like the set-up hook
        of ``CustomPGRetriever`` — confirm against the llama_index version in
        use.
        """
        self._idf_cache = {}
        self._avg_dl = 0
        self._doc_count = 0
        self._corpus_built = False

    def _build_corpus_stats(self, chunks: list[tuple[str, str]]):
        """Compute IDF and average document length.

        Runs once per retriever instance; later calls return immediately.

        Args:
            chunks: ``(node_id, text)`` pairs forming the corpus.
        """
        if self._corpus_built:
            return

        self._doc_count = len(chunks)
        if self._doc_count == 0:
            return

        total_len = 0
        df = Counter()

        for _, text in chunks:
            tokens = tokenize(text)  # Use the multilingual tokenizer
            total_len += len(tokens)
            # Document frequency counts each token once per document.
            df.update(set(tokens))

        # Fall back to 1.0 when every document tokenizes to nothing, so the
        # length normalisation in _bm25_score can never divide by zero.
        self._avg_dl = (total_len / self._doc_count) or 1.0

        for token, freq in df.items():
            self._idf_cache[token] = math.log(
                1 + (self._doc_count - freq + 0.5) / (freq + 0.5)
            )

        self._corpus_built = True

    def _bm25_score(self, query_tokens: list[str], doc_text: str) -> float:
        """Compute BM25 score for a document.

        Query tokens that are unknown to the corpus statistics or absent from
        the document contribute nothing. Returns 0.0 for an empty document.
        """
        doc_tokens = tokenize(doc_text)
        doc_len = len(doc_tokens)
        if doc_len == 0:
            return 0.0

        tf = Counter(doc_tokens)
        score = 0.0

        for qt in query_tokens:
            if qt not in self._idf_cache:
                continue
            term_freq = tf.get(qt, 0)
            if term_freq == 0:
                continue
            idf = self._idf_cache[qt]
            numerator = term_freq * (self.K1 + 1)
            denominator = term_freq + self.K1 * (
                1 - self.B + self.B * doc_len / self._avg_dl
            )
            score += idf * numerator / denominator

        return score

    def custom_retrieve(self, query_str: str) -> list[NodeWithScore]:
        """Search text chunks in the graph using BM25 scoring.

        Only nodes that carry a ``text`` attribute longer than 20 characters
        (after stripping) are considered. Returns at most ``TOP_K`` nodes with
        a positive score, best first; returns an empty list when the graph
        cannot be read or nothing matches.
        """
        results = []
        chunks = []
        try:
            all_nodes = self.graph_store.graph.nodes
            for nid, data in all_nodes.items():
                text = getattr(data, "text", None)
                if text and len(text.strip()) > 20:
                    chunks.append((str(nid), text))
        except Exception:
            # Graph store without an in-memory ``graph.nodes`` mapping:
            # this retriever simply contributes no results.
            return results

        if not chunks:
            return results

        self._build_corpus_stats(chunks)

        query_tokens = tokenize(query_str)
        if not query_tokens:
            return results

        scored = []
        for nid, text in chunks:
            score = self._bm25_score(query_tokens, text)
            if score > 0:
                scored.append((nid, text, score))

        scored.sort(key=lambda x: x[2], reverse=True)
        for nid, text, score in scored[: self.TOP_K]:
            results.append(
                NodeWithScore(node=TextNode(text=text, id_=nid), score=score)
            )

        return results
#266
#267
#268	# ==========================================
#269	# TEMPORAL METADATA RETRIEVER
#270	# ==========================================
class TemporalMetadataRetriever(CustomPGRetriever):
    """Multilingual retriever for temporal queries and events."""

    # Maximum number of results returned per query.
    TOP_K = 8

    def init(self, **kwargs):
        """No per-instance state is needed."""
        pass

    def custom_retrieve(self, query_str: str) -> list[NodeWithScore]:
        """Search for relevant documents based on dates, tags and metadata in any language.

        Each text node longer than 20 characters receives an additive score:
        3.0 per query date token found in the text, 2.0 per temporal keyword
        shared by query and text, 1.0 per query token found in a text that
        contains a ``tags:``/``data:``/``date:`` marker, and 1.5 per query
        token longer than four characters that also appears capitalised.
        Returns at most ``TOP_K`` nodes with a positive score, best first.
        """
        results = []

        query_date_tokens = extract_date_tokens(query_str)
        query_tokens = tokenize(query_str)

        # Temporal keywords present in the query, computed once instead of
        # being re-derived for every node.
        query_temporal_keywords = ALL_TEMPORAL_KEYWORDS.intersection(query_tokens)

        is_temporal_query = bool(query_date_tokens) or bool(query_temporal_keywords)

        if not is_temporal_query and not query_tokens:
            return results

        try:
            all_nodes = self.graph_store.graph.nodes
        except Exception:
            return results

        for nid, data in all_nodes.items():
            text = getattr(data, "text", None)
            if not text or len(text.strip()) < 20:
                continue

            score = 0.0
            text_lower = text.lower()

            # 1. Date matching
            for dt in query_date_tokens:
                if dt in text_lower:
                    score += 3.0

            # 2. Temporal keyword matching (all languages).
            # The text is tokenized once per node, and only when the query
            # actually contains a temporal keyword.
            if query_temporal_keywords:
                shared = query_temporal_keywords.intersection(tokenize(text))
                score += 2.0 * len(shared)

            # 3. Tag/metadata matching
            if "tags:" in text_lower or "data:" in text_lower or "date:" in text_lower:
                for qt in query_tokens:
                    if qt in text_lower:
                        score += 1.0

            # 4. Proper name matching: the token must also occur with an
            # upper-case first letter in the original text.
            for qt in query_tokens:
                if len(qt) > 4 and qt in text_lower:
                    if qt[0:1].upper() + qt[1:] in text:
                        score += 1.5

            if score > 0:
                results.append(
                    NodeWithScore(node=TextNode(text=text, id_=str(nid)), score=score)
                )

        results.sort(key=lambda x: x.score, reverse=True)
        return results[: self.TOP_K]
#335
#336
#337	# ==========================================
#338	# READ-WRITE LOCK
#339	# ==========================================
class ReadWriteLock:
    """Lock that allows concurrent reads but exclusive writes.

    A single condition variable guards the reader counter. Readers hold the
    underlying lock only while updating the counter; a writer keeps the
    underlying lock for the whole write section, which blocks new readers,
    and first waits until the reader counter drops to zero.
    """

    def __init__(self):
        self._read_ready = threading.Condition(threading.Lock())
        self._readers = 0

    def acquire_read(self):
        """Register one reader. Blocks while a writer holds the lock."""
        with self._read_ready:
            self._readers += 1

    def release_read(self):
        """Unregister one reader and wake waiting writers when none remain.

        Raises:
            RuntimeError: if called without a matching ``acquire_read``. A
                negative counter would corrupt later reader accounting.
        """
        with self._read_ready:
            if self._readers <= 0:
                raise RuntimeError("release_read() called without acquire_read()")
            self._readers -= 1
            if self._readers == 0:
                self._read_ready.notify_all()

    def acquire_write(self):
        """Take the exclusive lock, waiting for active readers to finish."""
        self._read_ready.acquire()
        # wait() releases the underlying lock while sleeping, so readers can
        # still call release_read() and notify this writer.
        while self._readers > 0:
            self._read_ready.wait()

    def release_write(self):
        """Release the exclusive lock taken by ``acquire_write``."""
        self._read_ready.release()
#364
#365
#366	# ==========================================
#367	# OBSIDIAN PRE-PROCESSING (multilingual)
#368	# ==========================================
#369	_WIKILINK_RE = re.compile(r"\[\[([^\]\|]+)(?:\\|([^\]]+))?\]\]")
#370	_FRONTMATTER_RE = re.compile(r"^---\s\n(.?)\n---\s*\n", re.DOTALL)
#371
#372
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Extract YAML frontmatter and return (metadata_dict, body_text).

    Handles BOM and leading whitespace that some editors add on Windows.

    Returns:
        ``({}, text)`` with the text untouched when no frontmatter block is
        found. Otherwise the parsed mapping (``{}`` when the YAML is invalid
        or is not a mapping) and the text that follows the closing ``---``.
    """
    # Strip BOM and leading whitespace before matching
    text_clean = text.lstrip("\ufeff").lstrip()
    match = _FRONTMATTER_RE.match(text_clean)
    if not match:
        return {}, text

    try:
        meta = yaml.safe_load(match.group(1))
    except yaml.YAMLError:
        meta = {}
    if not isinstance(meta, dict):
        meta = {}

    # text_clean is a suffix of text, so slicing it directly yields the same
    # body as offsetting into the original string.
    body = text_clean[match.end():]
    return meta, body
#395
#396
def extract_wikilink_triples(file_path: str, text: str) -> list[tuple[str, str, str]]:
    """Extract structured triples from Obsidian [[wikilinks]] with multilingual inference.

    The subject of every triple is the file name without extension. Each
    distinct link target (compared case-insensitively) produces one triple;
    its relation is inferred from the line containing the first occurrence.
    """
    filename = Path(file_path).stem
    triples = []
    seen = set()

    for match in _WIKILINK_RE.finditer(text):
        target = match.group(1).strip()
        if not target:
            continue

        pair_key = (filename.lower(), target.lower())
        if pair_key in seen:
            continue
        seen.add(pair_key)

        # Isolate the full line around the link as context for inference.
        line_start = text.rfind("\n", 0, match.start()) + 1
        line_end = text.find("\n", match.end())
        if line_end == -1:
            line_end = len(text)
        line = text[line_start:line_end].strip()

        # Use multilingual inference
        relation = infer_relation(line, filename, target)
        triples.append((filename, relation, target))

    return triples
#424
#425
# Frontmatter key -> relation mapping (multilingual). Built once at import
# time instead of on every call.
_FRONTMATTER_KEY_RELATIONS = {
    # Italian
    "ruolo": "Has role",
    "organizzazione": "Belongs to",
    "progetto": "Participates in",
    "stato": "Has status",
    "responsabile": "Has responsible",
    "licenza": "Has license",
    "location": "Located at",
    # English
    "role": "Has role",
    "organization": "Belongs to",
    "project": "Participates in",
    "status": "Has status",
    "responsible": "Has responsible",
    "license": "Has license",
    # French
    "organisation": "Belongs to",
    "projet": "Participates in",
    "statut": "Has status",
    "lieu": "Located at",
    # German
    "rolle": "Has role",
    "projekt": "Participates in",
    "standort": "Located at",
    # Spanish
    "rol": "Has role",
    "organizacion": "Belongs to",
    "proyecto": "Participates in",
    "estado": "Has status",
    "ubicacion": "Located at",
    # Portuguese
    "funcao": "Has role",
    "organizacao": "Belongs to",
    "projeto": "Participates in",
    "localizacao": "Located at",
}
_PARTICIPANT_KEYS = ("partecipanti", "participants", "teilnehmer", "participantes")
_DATE_KEYS = ("data", "date", "datum", "fecha")
_DURATION_KEYS = (
    "durata_mesi",
    "duration_months",
    "duree_mois",
    "dauer_monate",
    "duracion_meses",
    "duracao_meses",
)


def extract_frontmatter_triples(
    file_path: str, metadata: dict
) -> list[tuple[str, str, str]]:
    """Generate structured triples from YAML frontmatter.

    Frontmatter keys are mapped to relations across supported languages.
    The subject of every triple is the file name without extension.

    Args:
        file_path: Path of the note the metadata belongs to.
        metadata: Parsed frontmatter mapping.

    Returns:
        Triples in this order: scalar key relations, tags, participants,
        date, budget, duration. Non-string values of the scalar keys and
        non-string list items are ignored.
    """
    filename = Path(file_path).stem
    triples = []

    for key, relation in _FRONTMATTER_KEY_RELATIONS.items():
        value = metadata.get(key)
        if value and isinstance(value, str):
            triples.append((filename, relation, value))

    # Tags (universal key)
    tags = metadata.get("tags", [])
    if isinstance(tags, list):
        triples.extend(
            (filename, "Has tag", tag) for tag in tags if isinstance(tag, str)
        )

    # Participants (multilingual keys); every key that is present contributes.
    for key in _PARTICIPANT_KEYS:
        participants = metadata.get(key, [])
        if isinstance(participants, list):
            triples.extend(
                (filename, "Has participant", p)
                for p in participants
                if isinstance(p, str)
            )

    # Date (multilingual keys); only the first non-empty key is used.
    for key in _DATE_KEYS:
        date_value = metadata.get(key)
        if date_value:
            triples.append((filename, "Has date", str(date_value)))
            break

    # Budget (universal)
    budget = metadata.get("budget")
    if budget:
        triples.append((filename, "Has budget", f"{budget}"))

    # Duration (multilingual keys); only the first non-empty key is used.
    for key in _DURATION_KEYS:
        duration = metadata.get(key)
        if duration:
            triples.append((filename, "Has duration", f"{duration} months"))
            break

    return triples
#516
#517
def enrich_documents(
    documents: list[Document],
) -> tuple[list[Document], list[tuple[str, str, str]]]:
    """Pre-process documents: extract frontmatter, wikilinks and generate structured triples.

    Scalar and list frontmatter values are copied onto each document's
    metadata under ``fm_<key>`` (lists joined with ``", "``). The documents
    are modified in place and returned in their original order.

    Returns:
        ``(documents, triples)`` where ``triples`` is the de-duplicated union
        of wikilink and frontmatter triples of all documents.
    """
    all_triples = []
    enriched_docs = []

    for doc in documents:
        text = doc.text
        file_path = doc.metadata.get("file_path", doc.id_)

        metadata, body = parse_frontmatter(text)

        for key, value in metadata.items():
            if isinstance(value, (str, int, float)):
                doc.metadata[f"fm_{key}"] = str(value)
            elif isinstance(value, list):
                doc.metadata[f"fm_{key}"] = ", ".join(str(v) for v in value)

        # Wikilinks are searched in the body only, not in the frontmatter.
        all_triples.extend(extract_wikilink_triples(file_path, body))
        all_triples.extend(extract_frontmatter_triples(file_path, metadata))

        enriched_docs.append(doc)

    all_triples = _deduplicate_triples(all_triples)
    return enriched_docs, all_triples
#547
#548
#549	def _deduplicate_triples(
#550	triples: list[tuple[str, str, str]],
#551	) -> list[tuple[str, str, str]]:
#552	"""Remove duplicate triples (case-insensitive)."""
#553	seen = set()
#554	unique = []
#555	for s, r, o in triples:
#556	key = (s.lower().strip(), r.lower().strip(), o.lower().strip())
#557	if key not in seen:
#558	seen.add(key)
#559	unique.append((s.strip(), r.strip(), o.strip()))
#560	return unique
#561
#562
#563	# ==========================================
#564	# GRAPH RAG ENGINE
#565	# ==========================================
class WritHerGraphRAG:
    """Graph-RAG engine: builds, persists, updates and queries the property graph.

    Every method that replaces or mutates ``self.index`` takes the write side
    of ``self._rw_lock``; queries run under the read side. Methods whose name
    ends in ``_unlocked`` expect the caller to already hold the write lock.
    """

    def __init__(
        self,
        fast_mode: bool = False,
        model_name: str = MODEL_NAME,
        embed_model: str = EMBED_MODEL,
    ) -> None:
        """Create the engine and load (or build) the index immediately.

        Args:
            fast_mode: When true, the LLM synonym retriever is not used.
            model_name: LLM name, recorded in the storage manifest.
            embed_model: Embedding model name, checked against the manifest.
        """
        self.index = None
        self._rw_lock = ReadWriteLock()
        self._query_engine = None
        self._retrievers_dirty = True
        self._fast_mode = fast_mode
        self.model_name = model_name
        self.embed_model = embed_model
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(KNOWLEDGE_DIR, exist_ok=True)
        self.load_or_build_index()

    def load_or_build_index(self):
        """Load existing graph or build a new one.

        Any error while loading, including an embedding-model mismatch
        reported by ``_check_storage_compatibility``, triggers a full rebuild.
        """
        self._rw_lock.acquire_write()
        try:
            try:
                if os.path.exists(STORAGE_DIR) and os.listdir(STORAGE_DIR):
                    # P1.4: Check embed model compatibility
                    self._check_storage_compatibility()
                    safe_print("Loading knowledge graph from local storage...")
                    storage_context = StorageContext.from_defaults(
                        persist_dir=STORAGE_DIR
                    )
                    self.index = load_index_from_storage(storage_context)
                    safe_print("Graph loaded successfully.")
                else:
                    self._build_index_unlocked()
            except Exception as e:
                safe_print(f"Load error: {e}. Rebuilding index...")
                self._build_index_unlocked()
            self._retrievers_dirty = True
        finally:
            self._rw_lock.release_write()

    def _check_storage_compatibility(self):
        """Verify that stored graph was built with the same embedding model.

        A missing or unreadable manifest is treated as compatible. A changed
        LLM only produces an informational message.

        Raises:
            RuntimeError: if the manifest names a different embedding model.
        """
        import json as _json

        meta_path = os.path.join(STORAGE_DIR, ".kwipu_meta.json")
        if not os.path.exists(meta_path):
            return  # Legacy storage without manifest, allow loading

        try:
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = _json.load(f)
        except (OSError, _json.JSONDecodeError):
            return

        stored_embed = meta.get("embed_model", "")
        current_embed = self.embed_model

        if stored_embed and stored_embed != current_embed:
            raise RuntimeError(
                f"Embedding model mismatch: storage was built with '{stored_embed}' "
                f"but current config uses '{current_embed}'. "
                f"Delete '{STORAGE_DIR}/' to rebuild, or restore the previous model."
            )

        # LLM model change is fine (only used for generation, not embeddings)
        stored_llm = meta.get("llm_model", "")
        if stored_llm and stored_llm != self.model_name:
            safe_print(
                f"[dim]Note: graph was built with '{stored_llm}', "
                f"now using '{self.model_name}' for queries.[/dim]"
            )

    def _save_storage_manifest(self):
        """Save metadata about the current build configuration.

        Best effort: a write failure is reported but never raised.
        """
        import json as _json

        meta_path = os.path.join(STORAGE_DIR, ".kwipu_meta.json")
        meta = {
            "embed_model": self.embed_model,
            "llm_model": self.model_name,
            "version": "1.0",
        }
        try:
            with open(meta_path, "w", encoding="utf-8") as f:
                _json.dump(meta, f, indent=2)
        except OSError as e:
            # Without the manifest the next start-up skips the compatibility
            # check, so tell the user instead of failing silently.
            safe_print(f"[dim]Warning: could not write storage manifest: {e}[/dim]")

    def build_index(self):
        """Rebuild the graph (thread-safe with write lock)."""
        self._rw_lock.acquire_write()
        try:
            self._build_index_unlocked()
            self._retrievers_dirty = True
        finally:
            self._rw_lock.release_write()

    def insert_document(self, file_path):
        """Insert a single document into the existing graph (incremental).

        Performs a full build when no index exists yet, and falls back to a
        full rebuild when the incremental insert raises.
        """
        self._rw_lock.acquire_write()
        try:
            if not self.index:
                self._build_index_unlocked()
                self._retrievers_dirty = True
                return

            try:
                reader = SimpleDirectoryReader(
                    input_files=[file_path], filename_as_id=True
                )
                docs = reader.load_data()
                if docs:
                    enriched_docs, structural_triples = enrich_documents(docs)
                    for doc in enriched_docs:
                        safe_print(
                            f"Incremental insert: {os.path.basename(file_path)}..."
                        )
                        self.index.insert(doc)
                    self._inject_structural_triples(structural_triples)
                    self.index.storage_context.persist(persist_dir=STORAGE_DIR)
                    safe_print("Document added to graph successfully.")
                    self._retrievers_dirty = True
            except Exception as e:
                safe_print(
                    f"Incremental insert error: {e}. Full rebuild..."
                )
                self._build_index_unlocked()
                self._retrievers_dirty = True
        finally:
            self._rw_lock.release_write()

    def update_document(self, file_path):
        """Update a modified document in the graph (delete + re-insert).

        More efficient than a full rebuild for single-file modifications.
        Falls back to full rebuild if the incremental update fails.
        """
        self._rw_lock.acquire_write()
        try:
            if not self.index:
                self._build_index_unlocked()
                self._retrievers_dirty = True
                return

            try:
                # The ref_doc_id is the file path (set by filename_as_id=True)
                # NOTE(review): some readers may suffix ids for multi-part
                # files — confirm the id format used by the installed
                # SimpleDirectoryReader.
                ref_doc_id = file_path
                safe_print(f"Updating: {os.path.basename(file_path)}...")

                # Step 1: Remove old version from graph
                self.index.delete_ref_doc(ref_doc_id, delete_from_docstore=True)

                # Step 2: Re-read and insert updated version
                reader = SimpleDirectoryReader(
                    input_files=[file_path], filename_as_id=True
                )
                docs = reader.load_data()
                if docs:
                    enriched_docs, structural_triples = enrich_documents(docs)
                    for doc in enriched_docs:
                        self.index.insert(doc)
                    self._inject_structural_triples(structural_triples)

                self.index.storage_context.persist(persist_dir=STORAGE_DIR)
                safe_print("Document updated successfully.")
                self._retrievers_dirty = True
            except Exception as e:
                safe_print(
                    f"Incremental update error: {e}. Full rebuild..."
                )
                self._build_index_unlocked()
                self._retrievers_dirty = True
        finally:
            self._rw_lock.release_write()

    def _inject_structural_triples(self, triples: list[tuple[str, str, str]]):
        """Inject pre-extracted triples into the property graph.

        Uses ``upsert_triplet`` when the store offers it. Otherwise the triple
        is written through ``upsert_nodes`` / ``upsert_relations``, creating
        entity nodes only for names that are not in the store yet so existing
        nodes keep their properties. Failures are counted and reported once;
        they never abort the surrounding build or insert.
        """
        if not self.index or not triples:
            return
        graph_store = self.index.property_graph_store
        failures = 0

        if hasattr(graph_store, "upsert_triplet"):
            for subj, rel, obj in triples:
                try:
                    graph_store.upsert_triplet(subj, rel, obj)
                except Exception:
                    failures += 1
        else:
            # Local import: these types are only needed on this code path.
            from llama_index.core.graph_stores.types import EntityNode, Relation

            for subj, rel, obj in triples:
                try:
                    source = EntityNode(name=subj, label="entity")
                    target = EntityNode(name=obj, label="entity")
                    known_ids = {
                        node.id
                        for node in graph_store.get(ids=[source.id, target.id])
                    }
                    new_nodes = [
                        node for node in (source, target) if node.id not in known_ids
                    ]
                    if new_nodes:
                        graph_store.upsert_nodes(new_nodes)
                    graph_store.upsert_relations(
                        [
                            Relation(
                                label=rel,
                                source_id=source.id,
                                target_id=target.id,
                            )
                        ]
                    )
                except Exception:
                    failures += 1

        if failures:
            safe_print(
                f"[dim]Warning: {failures} of {len(triples)} structural "
                f"relations could not be stored in the graph.[/dim]"
            )

    @staticmethod
    def _parse_triplets(response_str, max_length=128):
        """Parse LLM output into ``(subject, relation, object)`` tuples.

        Each non-empty line is stripped of list markers, reduced to the text
        between its first ``(`` and first ``)`` when both are present, and
        split on commas. Lines that do not yield exactly three non-empty
        parts, or whose parts exceed ``max_length`` UTF-8 bytes, are skipped.
        Every part is passed through ``str.capitalize``.
        """
        results = []
        for line in response_str.strip().split("\n"):
            line = line.strip().strip("-").strip("*").strip()
            if not line:
                continue
            if "(" in line and ")" in line:
                line = line[line.index("(") + 1 : line.index(")")]
            tokens = line.split(",")
            if len(tokens) != 3:
                continue
            subj, pred, obj = (t.strip().strip('"') for t in tokens)
            if not subj or not pred or not obj:
                continue
            if any(len(s.encode("utf-8")) > max_length for s in [subj, pred, obj]):
                continue
            results.append(
                (subj.capitalize(), pred.capitalize(), obj.capitalize())
            )
        return results

    def _build_index_unlocked(self):
        """Analyze files and build the graph.

        The caller must hold the write lock. Sets ``self.index`` to ``None``
        when the knowledge directory contains no readable documents.
        """
        safe_print(f"Scanning documents in '{KNOWLEDGE_DIR}'...")

        try:
            reader = SimpleDirectoryReader(
                KNOWLEDGE_DIR, recursive=True, filename_as_id=True
            )
            documents = reader.load_data()
        except ValueError:
            documents = []

        if not documents:
            safe_print("No files found. Waiting for documents...")
            self.index = None
            return

        # Time estimate for user feedback
        n_docs = len(documents)
        safe_print(f"Found {n_docs} documents.")
        if n_docs > 10:
            est_minutes = max(1, n_docs // 3)
            safe_print(
                f" ⏱ Estimate: {est_minutes}-{est_minutes * 3} minutes "
                f"(depends on model and hardware)."
            )
            safe_print(
                " 💡 Tip: first build is the slowest. "
                "Subsequent runs will be incremental."
            )

        safe_print("Pre-processing: extracting wikilinks and frontmatter...")
        enriched_docs, structural_triples = enrich_documents(documents)
        safe_print(
            f" -> {len(structural_triples)} structural relations extracted."
        )

        build_start = time.time()
        safe_print(
            f"LLM extraction and graph construction with {self.model_name}..."
        )

        kg_extractors = [
            SimpleLLMPathExtractor(
                llm=Settings.llm,
                extract_prompt=(
                    "From the text below, extract up to {max_paths_per_chunk} knowledge triplets.\n"
                    "Each triplet must be on its own line in the format: entity1, relation, entity2\n"
                    "RULES:\n"
                    "- Each entity must be a single proper noun (person, organization, place, technology, dataset)\n"
                    "- Do NOT combine multiple concepts into one entity\n"
                    "- Extract ALL person names as separate entities\n"
                    "- Relations should be short verb phrases\n\n"
                    "Text: {text}\n"
                    "Triplets:\n"
                ),
                parse_fn=self._parse_triplets,
                num_workers=1,
                max_paths_per_chunk=20,
            ),
            ImplicitPathExtractor(),
        ]

        self.index = PropertyGraphIndex.from_documents(
            enriched_docs, kg_extractors=kg_extractors, show_progress=False
        )

        safe_print("Injecting structural relations into graph...")
        self._inject_structural_triples(structural_triples)

        self.index.storage_context.persist(persist_dir=STORAGE_DIR)
        self._save_storage_manifest()
        build_elapsed = time.time() - build_start
        minutes = int(build_elapsed // 60)
        seconds = int(build_elapsed % 60)
        safe_print(
            f"Graph built and saved successfully. "
            f"(Build time: {minutes}m {seconds}s)"
        )

    def _build_retrievers(self):
        """Build retrievers and query engine.

        Fast mode: vector + BM25 + temporal only (no LLM call per query).
        Normal mode: adds LLM synonym retriever.
        """
        if not self.index:
            self._query_engine = None
            return

        sub_retrievers = []

        # Synonym retriever: normal mode only (costs one LLM call per query)
        if not self._fast_mode:
            synonym_retriever = LLMSynonymRetriever(
                self.index.property_graph_store,
                llm=Settings.llm,
                include_text=True,
                synonym_prompt=(
                    "Given the query below, generate synonyms or related keywords up to {max_keywords} in total.\n"
                    "Include: original names, names with titles (Prof., Dott., Dott.ssa, Dr., Ing., Dra.), "
                    "abbreviations, related project names, and multilingual variants.\n"
                    "Provide all synonyms/keywords separated by '^' symbols: 'keyword1^keyword2^...'\n"
                    "Result must be one line, separated by '^' symbols.\n"
                    "----\n"
                    "QUERY: {query_str}\n"
                    "----\n"
                    "KEYWORDS: "
                ),
                max_keywords=15,
                path_depth=3,
            )
            sub_retrievers.append(synonym_retriever)

        # These retrievers don't use the LLM, always active
        vector_retriever = VectorContextRetriever(
            self.index.property_graph_store,
            vector_store=self.index.vector_store,
            include_text=True,
            similarity_top_k=20,
            embed_model=Settings.embed_model,
            path_depth=3,
        )
        sub_retrievers.append(vector_retriever)

        bm25_retriever = BM25ChunkRetriever(self.index.property_graph_store)
        sub_retrievers.append(bm25_retriever)

        temporal_retriever = TemporalMetadataRetriever(self.index.property_graph_store)
        sub_retrievers.append(temporal_retriever)

        self._query_engine = self.index.as_query_engine(
            text_qa_template=qa_template,
            sub_retrievers=sub_retrievers,
        )
        self._retrievers_dirty = False

    def ask(self, question):
        """Query the graph with read lock.

        Uses a lock-upgrade pattern: acquire read to check state, release,
        then acquire write only if retrievers need rebuilding, then read again for query.

        Returns:
            The query engine's response object, or an explanatory string when
            no index or query engine is available.
        """
        self._rw_lock.acquire_read()
        try:
            if not self.index:
                return "No index available. Add files to the knowledge_base folder."
            needs_rebuild = self._retrievers_dirty
        finally:
            self._rw_lock.release_read()

        if needs_rebuild:
            self._rw_lock.acquire_write()
            try:
                # Re-check under the write lock: another thread may have
                # rebuilt the retrievers in the meantime.
                if self._retrievers_dirty:
                    self._build_retrievers()
            finally:
                self._rw_lock.release_write()

        self._rw_lock.acquire_read()
        try:
            if not self._query_engine:
                return "No index available. Add files to the knowledge_base folder."
            response = self._query_engine.query(question)
            return response
        finally:
            self._rw_lock.release_read()
#944
#945
#946	# ==========================================
#947	# REAL-TIME FILE MONITORING (with persistent content-hash)
#948	# ==========================================
# Location of the persistent path -> content-hash cache (inside the storage dir).
_HASH_CACHE_FILE = os.path.join(STORAGE_DIR, ".file_hashes.json")
#950
#951
#952	def _file_content_hash(path: str) -> str \| None:
#953	"""Compute MD5 hash of file contents. Returns None if file doesn't exist."""
#954	try:
#955	with open(path, "rb") as f:
#956	return hashlib.md5(f.read()).hexdigest()
#957	except (OSError, IOError):
#958	return None
#959
#960
def _load_hash_cache() -> dict[str, str]:
    """Load hash cache from disk.

    Returns an empty dict when the cache file is missing, unreadable, not
    valid JSON, or does not contain a JSON object.
    """
    import json

    try:
        if os.path.exists(_HASH_CACHE_FILE):
            with open(_HASH_CACHE_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict):
                return data
    except (json.JSONDecodeError, OSError):
        pass
    return {}
#974
#975
def _save_hash_cache(hashes: dict[str, str]):
    """Save hash cache to disk.

    Best effort: the cache only avoids redundant re-indexing, so a failure to
    create the directory or write the file is ignored rather than raised.
    """
    import json

    try:
        # Directory creation sits inside the try so it is covered by the same
        # best-effort handling as the write itself.
        os.makedirs(os.path.dirname(_HASH_CACHE_FILE), exist_ok=True)
        with open(_HASH_CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(hashes, f, indent=2)
    except OSError:
        pass
#986
#987
class FileWatcher(FileSystemEventHandler):
    """Watchdog handler that forwards real content changes to the RAG system.

    Incoming filesystem events are coalesced per path and processed after a
    quiet period of ``WATCHER_DEBOUNCE_SECONDS``. Before acting on a
    created/modified event the file's content hash is compared with the last
    known hash, so events that did not change the content are dropped.
    """

    def __init__(self, rag_system):
        """Create the handler and prime the hash cache.

        Args:
            rag_system: Object exposing ``build_index()``,
                ``update_document(path)`` and ``insert_document(path)``.
        """
        super().__init__()
        self.rag_system = rag_system
        # Guards _pending_events and _timer, which are touched from both the
        # observer callbacks and the debounce timer thread.
        self._lock = threading.Lock()
        # path -> (event type, timestamp of the latest event for that path)
        self._pending_events: dict[str, tuple[str, float]] = {}
        self._timer = None
        # Persistent hash cache on disk
        self._file_hashes: dict[str, str] = _load_hash_cache()
        self._refresh_hashes()

    def _refresh_hashes(self):
        """Reconcile the hash cache with the files currently in the knowledge base.

        Hashes are added for files that have none yet, entries for files that
        no longer exist are dropped, and the result is saved. Existing entries
        are kept as they are. Does nothing when ``KNOWLEDGE_DIR`` does not exist.
        """
        kb_path = Path(KNOWLEDGE_DIR)
        if not kb_path.exists():
            return

        current_files = set()
        # One walk filtered through _is_relevant_file keeps this scan
        # consistent with the event callbacks (case-insensitive extension
        # match, ".obsidian" excluded) on every platform.
        for f in kb_path.rglob("*"):
            if not self._is_relevant_file(f) or not f.is_file():
                continue
            fpath = str(f)
            current_files.add(fpath)
            if fpath in self._file_hashes:
                continue  # already tracked: no need to hash the file again
            h = _file_content_hash(fpath)
            if h:
                self._file_hashes[fpath] = h

        # Remove hashes for deleted files
        for stale in [k for k in self._file_hashes if k not in current_files]:
            del self._file_hashes[stale]

        _save_hash_cache(self._file_hashes)

    def _is_relevant_file(self, path):
        """Return True if *path* has a watched extension and is not under ``.obsidian``."""
        p = Path(path)
        if ".obsidian" in p.parts:
            return False
        return p.suffix.lower() in WATCHER_VALID_EXTENSIONS

    def _has_content_changed(self, path: str) -> bool:
        """Check whether the file's content differs from the last recorded hash.

        When the content did change, the new hash is recorded and the cache is
        saved as a side effect. An unreadable file counts as changed only if
        it was previously tracked.
        """
        new_hash = _file_content_hash(path)
        if new_hash is None:
            return path in self._file_hashes

        if self._file_hashes.get(path) == new_hash:
            return False  # Identical content, phantom event

        self._file_hashes[path] = new_hash
        _save_hash_cache(self._file_hashes)
        return True

    def _schedule_processing(self, event_type, path):
        """Record an event for *path* and (re)start the debounce timer.

        A later event for the same path replaces the earlier one.
        """
        with self._lock:
            self._pending_events[path] = (event_type, time.time())
            if self._timer is not None:
                self._timer.cancel()
            self._timer = threading.Timer(
                WATCHER_DEBOUNCE_SECONDS, self._process_pending
            )
            self._timer.daemon = True
            self._timer.start()

    def _process_pending(self):
        """Apply the batch of pending events (runs on the debounce timer thread)."""
        with self._lock:
            events = dict(self._pending_events)
            self._pending_events.clear()
            self._timer = None

        if not events:
            return

        # Filter phantom events: verify content actually changed
        real_events = {}
        hashes_dirty = False
        for path, (etype, ts) in events.items():
            if etype == "deleted":
                # Deletions are always real
                if self._file_hashes.pop(path, None) is not None:
                    hashes_dirty = True
                real_events[path] = (etype, ts)
            elif self._has_content_changed(path):
                real_events[path] = (etype, ts)

        if hashes_dirty:
            # Persist removals so the on-disk cache matches the in-memory one.
            _save_hash_cache(self._file_hashes)

        if not real_events:
            safe_print("\n(Filesystem events ignored: no real content changes)")
            return

        # Separate events by type
        deleted = [p for p, (e, _) in real_events.items() if e == "deleted"]
        modified = [p for p, (e, _) in real_events.items() if e == "modified"]
        created = [p for p, (e, _) in real_events.items() if e == "created"]

        # Deletions require full rebuild (can't selectively remove all related triples)
        if deleted:
            safe_print("\nFile(s) deleted. Rebuilding graph...")
            self.rag_system.build_index()
            return

        # Modifications: incremental update (delete + re-insert per file)
        for path in modified:
            if os.path.exists(path):
                self.rag_system.update_document(path)

        # Creations: incremental insert
        for path in created:
            if os.path.exists(path):
                safe_print(f"\nNew file detected: {os.path.basename(path)}.")
                self.rag_system.insert_document(path)

    def on_created(self, event):
        """Queue a "created" event for a relevant file."""
        if not event.is_directory and self._is_relevant_file(event.src_path):
            self._schedule_processing("created", event.src_path)

    def on_modified(self, event):
        """Queue a "modified" event for a relevant file."""
        if not event.is_directory and self._is_relevant_file(event.src_path):
            self._schedule_processing("modified", event.src_path)

    def on_deleted(self, event):
        """Queue a "deleted" event for a relevant file."""
        if not event.is_directory and self._is_relevant_file(event.src_path):
            self._schedule_processing("deleted", event.src_path)

    def on_moved(self, event):
        """Treat a rename/move as a deletion of the source plus a change at the destination.

        Editors that save via "write temp file, rename over target" only emit
        a move event for the real file, so without this handler such saves
        (and plain renames) would be missed.
        """
        if event.is_directory:
            return
        if self._is_relevant_file(event.src_path):
            self._schedule_processing("deleted", event.src_path)
        dest = event.dest_path
        if dest and self._is_relevant_file(dest):
            # A destination we already track was overwritten -> update it;
            # otherwise it is a new document.
            # NOTE(review): this reads _file_hashes from the observer thread
            # without a lock; a single dict membership test — confirm that is
            # acceptable here.
            kind = "modified" if dest in self._file_hashes else "created"
            self._schedule_processing(kind, dest)
#1108
#1109
#1110	# ==========================================
#1111	# TERMINAL INTERFACE (Rich)
#1112	# ==========================================
def _ollama_model_key(name: str) -> str:
    """Return *name* with an explicit tag, defaulting to ``:latest``.

    Only a colon in the last ``/``-separated segment counts as a tag
    separator, so a registry host with a port is not mistaken for one.
    """
    last_segment = name.rsplit("/", 1)[-1]
    return name if ":" in last_segment else f"{name}:latest"


def _check_ollama_available(model_name: str, embed_model: str):
    """Verify Ollama is running and required models are available.

    Prints clear error messages with suggested commands if something is missing.
    Model names are compared including their tag; a name without a tag is
    treated as ``<name>:latest``, so a request for one tag is not satisfied
    by a different installed tag of the same model.

    Args:
        model_name: Name of the LLM model to look for.
        embed_model: Name of the embedding model to look for.

    Returns:
        True if everything is ready, False otherwise.
    """
    import urllib.request
    import json as _json

    base_url = "http://localhost:11434"

    # Check if Ollama is running
    try:
        req = urllib.request.Request(f"{base_url}/api/tags", method="GET")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = _json.loads(resp.read().decode())
    except Exception:
        # Start-up probe: any failure here (refused connection, timeout,
        # malformed reply) means the server cannot be used.
        console.print(
            Panel(
                "[bold red]Ollama is not running.[/bold red]\n\n"
                "Start Ollama before running Geode Graph:\n"
                " [dim]ollama serve[/dim]",
                title="[red]Connection Error[/red]",
                border_style="red",
            )
        )
        return False

    # Check available models (tolerate an unexpected response shape)
    models = data.get("models") if isinstance(data, dict) else None
    installed = {
        _ollama_model_key(info["name"])
        for info in models or []
        if isinstance(info, dict) and info.get("name")
    }

    missing = []
    for requested in (model_name, embed_model):
        if _ollama_model_key(requested) not in installed and requested not in missing:
            missing.append(requested)

    if missing:
        cmds = "\n".join(f" [dim]ollama pull {m}[/dim]" for m in missing)
        console.print(
            Panel(
                f"[bold yellow]Missing model(s):[/bold yellow] {', '.join(missing)}\n\n"
                f"Pull them with:\n{cmds}",
                title="[yellow]Model Not Found[/yellow]",
                border_style="yellow",
            )
        )
        return False

    return True
#1169
#1170
def main():
    """Run the interactive Geode Graph terminal session.

    Parses the command line, checks that Ollama and the selected models are
    available, initialises the LLM, loads the knowledge graph, starts the
    filesystem watcher on ``KNOWLEDGE_DIR`` and then answers questions in a
    loop until the user types an exit word, sends EOF, or presses Ctrl-C.
    A query that raises is reported and the loop continues.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Geode Graph - Knowledge Graph Assistant")
    parser.add_argument(
        "--fast",
        action="store_true",
        help="Fast mode: disables LLM synonym retriever for faster queries",
    )
    parser.add_argument(
        "--llm-model",
        type=str,
        default=None,
        help=f"Override LLM model (default: {MODEL_NAME})",
    )
    parser.add_argument(
        "--embed-model",
        type=str,
        default=None,
        help=f"Override embedding model (default: {EMBED_MODEL})",
    )
    args = parser.parse_args()

    # Resolve model names (CLI overrides config)
    llm_model = args.llm_model or MODEL_NAME
    embed_model = args.embed_model or EMBED_MODEL

    _ensure_nest_asyncio()

    # Check Ollama before doing anything expensive
    if not _check_ollama_available(llm_model, embed_model):
        sys.exit(1)

    # Initialize LLM (P0.6: no side effects on import)
    _init_llm(model_name=llm_model, embed_model=embed_model)

    console.print()
    console.print(
        Panel(
            Text.from_markup(
                f"[bold]Geode Graph[/bold]\n"
                f"[dim]LLM:[/dim] {llm_model} "
                f"[dim]Mode:[/dim] {'FAST' if args.fast else 'FULL'} "
                f"[dim]Watching:[/dim] {KNOWLEDGE_DIR}"
            ),
            border_style="bright_black",
            padding=(1, 2),
        )
    )

    with Status("[dim]Loading knowledge graph...[/dim]", console=console, spinner="dots"):
        rag = WritHerGraphRAG(
            fast_mode=args.fast,
            model_name=llm_model,
            embed_model=embed_model,
        )

    observer = Observer()
    observer.schedule(FileWatcher(rag), KNOWLEDGE_DIR, recursive=True)
    observer.start()

    console.print("[dim]Type your question, or 'exit' to quit.[/dim]\n")

    try:
        while True:
            try:
                query = console.input("[bold bright_white]>[/bold bright_white] ")
            except EOFError:
                break

            if query.lower().strip() in ["exit", "quit", "esci"]:
                break

            if not query.strip():
                continue

            try:
                with Status("[dim]Querying graph...[/dim]", console=console, spinner="dots"):
                    # perf_counter is monotonic, so the reported duration is
                    # unaffected by system clock adjustments.
                    start_t = time.perf_counter()
                    response = rag.ask(query)
                    elapsed = time.perf_counter() - start_t
            except Exception as exc:
                # One failing query must not end the session. KeyboardInterrupt
                # is not an Exception and still reaches the outer handler.
                console.print()
                console.print(
                    Panel(
                        # Plain Text: the error message is not parsed as Rich markup.
                        Text(f"{type(exc).__name__}: {exc}"),
                        title="[red]Query Failed[/red]",
                        border_style="red",
                        padding=(1, 2),
                    )
                )
                console.print()
                continue

            console.print()
            console.print(
                Panel(
                    Markdown(str(response)),
                    title="[bold]Response[/bold]",
                    subtitle=f"[dim]{elapsed:.1f}s[/dim]",
                    border_style="bright_black",
                    padding=(1, 2),
                )
            )
            console.print()

    except KeyboardInterrupt:
        pass
    finally:
        # Always shut the watcher thread down before leaving.
        observer.stop()
        observer.join()
        console.print("\n[dim]Goodbye.[/dim]")
#1270
#1271
# Script entry point: start the interactive session only when this file is
# executed directly, not when it is imported.
#1272	if __name__ == "__main__":
#1273	main()
#1274

z6MkqRzA/benmaster82-Kwipu