my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""Diagnose fact_recall precision: what % of returned facts are actually relevant?"""
#3	import sys, os, tempfile, json, time
#4	from pathlib import Path
#5
#6	sys.path.insert(0, str(Path(__file__).parent.parent))
#7	from tools.evaluate_beam_end_to_end import load_beam_dataset, init_beam, ingest_conversation
#8	from mnemosyne.core.beam import BeamMemory
#9
#10	# LLM for judging relevance
class JudgeLLM:
    """LLM-backed yes/no judge of whether a fact is relevant to a question.

    Requests go to OpenRouter's OpenAI-compatible endpoint using the
    ``google/gemini-2.5-flash`` model.
    """

    def __init__(self):
        """Create the OpenRouter client.

        Raises:
            KeyError: If the ``OPENROUTER_API_KEY`` environment variable is
                not set.
            ImportError: If the ``openai`` package is not installed.
        """
        # Function-scope import: ``openai`` is only needed once a judge is
        # actually constructed.
        import openai

        self.client = openai.OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=os.environ["OPENROUTER_API_KEY"],
        )

    def judge_relevance(self, question: str, fact_text: str) -> bool:
        """Ask the model whether ``fact_text`` helps answer ``question``.

        Args:
            question: The question being evaluated.
            fact_text: Text of one recalled fact.

        Returns:
            True if the model's reply starts with "yes" (case-insensitive,
            after stripping whitespace). False for any other reply, for an
            empty reply, or when the request fails.

        Failures are deliberately non-fatal so one bad request does not abort
        the whole diagnostic, but they are reported on stderr: a silently
        swallowed error would be indistinguishable from a genuine "no" and
        would quietly lower the reported precision.
        """
        prompt = f"QUESTION: {question}\n\nFACT: {fact_text}\n\nIs this fact RELEVANT to answering the question? Answer ONLY 'yes' or 'no'. Relevant means it contains information that helps answer the question, even partially."
        try:
            resp = self.client.chat.completions.create(
                model="google/gemini-2.5-flash",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=5,
            )
            content = resp.choices[0].message.content
        except Exception as exc:  # best-effort boundary: report, then degrade
            print(
                f"WARNING: relevance judge failed ({type(exc).__name__}: {exc}); "
                "treating fact as not relevant",
                file=sys.stderr,
            )
            return False

        if not content:
            # The reply carried no text (None or empty string).
            return False
        return content.strip().lower().startswith("yes")
#33
def _precision_at_k(relevant_indices, k, num_returned):
    """Return the share of the top-``k`` returned facts judged relevant.

    The denominator is ``min(k, num_returned)``, so a query that returned
    fewer than ``k`` facts is scored against what it actually returned.
    Returns 0.0 when nothing was returned.
    """
    window = min(k, num_returned)
    if window <= 0:
        return 0.0
    return sum(1 for i in relevant_indices if i < k) / window


def main():
    """Run the fact_recall precision diagnostic on one 100K BEAM conversation.

    Loads one conversation, ingests its messages into a BeamMemory database
    inside a temporary directory, then for up to 16 questions recalls the top
    30 facts, has ``JudgeLLM`` label each one, and prints precision@5/10/30
    per question followed by aggregate figures and a GOOD/BROKEN verdict
    (GOOD when mean precision@10 is at least 50%).

    Exits with status 1 if no 100K data is available or if the conversation
    has no questions.
    """
    # Load 100K data
    print("Loading 100K BEAM conversation...")
    data = load_beam_dataset(["100K"], max_conversations=1)
    convs = data.get("100K", [])
    if not convs:
        print("ERROR: No 100K data")
        sys.exit(1)
    conv = convs[0]
    questions = conv.get("questions", [])[:16]  # Max 16 questions
    print(f" {len(conv['messages'])} messages, {len(questions)} questions")
    if not questions:
        # Without questions every average below would divide by zero, so stop
        # before paying for ingestion.
        print("ERROR: No questions in 100K conversation")
        sys.exit(1)

    # Ingest
    print("\nIngesting (use_cloud=True)...")
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "diag.db"
        init_beam(db_path)
        beam = BeamMemory(session_id="diag", db_path=db_path, use_cloud=True)
        try:
            t0 = time.perf_counter()
            ingest_conversation(beam, conv["messages"])
            print(f" Done in {time.perf_counter()-t0:.1f}s, DB: {os.path.getsize(db_path)/1024:.0f}KB")

            facts_count = beam.conn.execute("SELECT COUNT(*) FROM facts").fetchone()[0]
            print(f" Facts stored: {facts_count}")

            # Show random sample facts
            # NOTE(review): rows are indexed by column name, which assumes the
            # connection yields mapping-like rows — confirm in BeamMemory.
            samples = beam.conn.execute(
                "SELECT subject, predicate, object FROM facts ORDER BY RANDOM() LIMIT 5"
            ).fetchall()
            print(" Sample facts:")
            for s in samples:
                print(f" [{s['predicate']}] {s['subject']}: {s['object'][:80]}")

            # Diagnose each question
            print("\n--- FACT RECALL PRECISION DIAGNOSTIC ---\n")
            judge = JudgeLLM()

            total_precision_5 = []
            total_precision_10 = []
            total_precision_30 = []
            total_coverage = []  # Number of returned facts judged relevant, per question

            for qi, q in enumerate(questions):
                question = q.get("question", q.get("text", ""))
                ability = q.get("ability", "?")

                facts = beam.fact_recall(question, top_k=30)

                if not facts:
                    print(f"[{qi}] {ability}: 0 facts returned, question: {question[:80]}")
                    total_precision_5.append(0.0)
                    total_precision_10.append(0.0)
                    total_precision_30.append(0.0)
                    total_coverage.append(0.0)
                    continue

                # Judge top 30 for relevance; keep the ranks judged relevant.
                relevant_indices = {
                    fi
                    for fi, fact in enumerate(facts[:30])
                    if judge.judge_relevance(question, fact["content"])
                }

                p5 = _precision_at_k(relevant_indices, 5, len(facts))
                p10 = _precision_at_k(relevant_indices, 10, len(facts))
                p30 = _precision_at_k(relevant_indices, 30, len(facts))

                total_precision_5.append(p5)
                total_precision_10.append(p10)
                total_precision_30.append(p30)
                total_coverage.append(len(relevant_indices))

                print(f"[{qi}] {ability}: p@5={p5:.0%} p@10={p10:.0%} p@30={p30:.0%} "
                      f"({len(relevant_indices)}/{len(facts)} relevant) "
                      f"Q: {question[:60]}")

            # Aggregate. Each list holds one entry per question and questions
            # is non-empty here, so the divisions are safe.
            print(f"\n{'='*60}")
            print(f"AGGREGATE (n={len(questions)} questions):")
            avg_p5 = sum(total_precision_5) / len(total_precision_5)
            avg_p10 = sum(total_precision_10) / len(total_precision_10)
            avg_p30 = sum(total_precision_30) / len(total_precision_30)
            avg_cov = sum(total_coverage) / len(total_coverage)

            print(f" Precision@5: {avg_p5:.1%}")
            print(f" Precision@10: {avg_p10:.1%}")
            print(f" Precision@30: {avg_p30:.1%}")
            print(f" Avg relevant facts: {avg_cov:.1f}")
            print(f" Zero-recall questions: {sum(1 for c in total_coverage if c == 0)}/{len(total_coverage)}")
            print(f" p@5 ≥ 50%: {sum(1 for p in total_precision_5 if p >= 0.5)}/{len(total_precision_5)}")

            verdict = "GOOD" if avg_p10 >= 0.5 else "BROKEN"
            print(f"\n VERDICT: Recall pipeline is {verdict}")
        finally:
            # Close the connection even when a step above raises, and before
            # the temporary directory holding the database is cleaned up.
            beam.conn.close()
#131
# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
#134

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public