repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 15d ago

| #1 | #!/usr/bin/env python3 |
| #2 | """Diagnose fact_recall precision: what % of returned facts are actually relevant?""" |
| #3 | import sys, os, tempfile, json, time |
| #4 | from pathlib import Path |
| #5 | |
| #6 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
| #7 | from tools.evaluate_beam_end_to_end import load_beam_dataset, init_beam, ingest_conversation |
| #8 | from mnemosyne.core.beam import BeamMemory |
| #9 | |
| #10 | # LLM for judging relevance |
class JudgeLLM:
    """Yes/no relevance judge backed by a chat-completions endpoint.

    The client is an OpenAI-compatible client pointed at
    ``https://openrouter.ai/api/v1``; the API key is read from the
    ``OPENROUTER_API_KEY`` environment variable.
    """

    def __init__(self):
        """Build the API client.

        Raises:
            KeyError: if ``OPENROUTER_API_KEY`` is not set in the environment.
        """
        # Imported here rather than at module top, so the package is only
        # required once a judge is actually constructed.
        import openai

        self.client = openai.OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=os.environ["OPENROUTER_API_KEY"],
        )

    def judge_relevance(self, question: str, fact_text: str) -> bool:
        """Ask the model whether ``fact_text`` helps answer ``question``.

        Args:
            question: the question being evaluated.
            fact_text: text of one recalled fact.

        Returns:
            True when the model's reply starts with "yes" (case-insensitive,
            surrounding whitespace ignored). False for any other reply, for a
            reply with no text, or when the API call fails.

        Failures are best-effort: they never raise, but each one is reported
        on stderr so a run whose precision is dragged down by API errors can
        be told apart from one with genuinely irrelevant facts.
        """
        prompt = (
            f"QUESTION: {question}\n\nFACT: {fact_text}\n\n"
            "Is this fact RELEVANT to answering the question? "
            "Answer ONLY 'yes' or 'no'. "
            "Relevant means it contains information that helps answer "
            "the question, even partially."
        )

        # Keep the try body down to the one call that talks to the network.
        try:
            resp = self.client.chat.completions.create(
                model="google/gemini-2.5-flash",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=5,
            )
        except Exception as exc:  # API boundary: report, then stay best-effort
            print(
                f"WARNING: relevance judge call failed "
                f"({type(exc).__name__}: {exc}); counting fact as not relevant",
                file=sys.stderr,
            )
            return False

        # The reply may carry no choices or a null message body.
        try:
            content = resp.choices[0].message.content
        except (AttributeError, IndexError, TypeError):
            content = None

        if not content:
            print(
                "WARNING: relevance judge returned no text; "
                "counting fact as not relevant",
                file=sys.stderr,
            )
            return False

        return content.strip().lower().startswith("yes")
| #33 | |
def _precision_at_k(relevant_ranks, k, returned):
    """Return precision@k for one question.

    Args:
        relevant_ranks: zero-based ranks of the returned facts judged relevant.
        k: cutoff rank.
        returned: number of facts actually returned; must be greater than 0.

    The denominator is ``min(k, returned)``, so a question that yields fewer
    than ``k`` facts is scored over the facts it did return.
    """
    hits = sum(1 for rank in relevant_ranks if rank < k)
    return hits / min(k, returned)


def main():
    """Run the fact-recall precision diagnostic and print a report.

    Steps:
      1. Load one "100K" conversation and keep at most 16 of its questions.
      2. Ingest the conversation into a BeamMemory backed by a database file
         in a temporary directory, then print the row count of the ``facts``
         table and five random rows.
      3. For every question, call ``fact_recall`` with ``top_k=30``, have a
         JudgeLLM judge each returned fact, and print precision@5/10/30.
      4. Print the averages and a GOOD/BROKEN verdict (GOOD when the mean
         precision@10 is at least 50%).

    Exits with status 1 when the dataset has no "100K" conversation or the
    conversation has no questions.
    """
    # Load 100K data
    print("Loading 100K BEAM conversation...")
    data = load_beam_dataset(["100K"], max_conversations=1)
    convs = data.get("100K", [])
    if not convs:
        print("ERROR: No 100K data")
        sys.exit(1)
    conv = convs[0]
    questions = conv.get("questions", [])[:16]  # Max 16 questions
    print(f" {len(conv['messages'])} messages, {len(questions)} questions")
    if not questions:
        # Without questions the averages below would divide by zero, and the
        # ingest would be wasted work, so stop before doing it.
        print("ERROR: No questions in 100K conversation")
        sys.exit(1)

    total_precision_5 = []
    total_precision_10 = []
    total_precision_30 = []
    total_coverage = []  # Per question: number of returned facts judged relevant

    # Ingest
    print("\nIngesting (use_cloud=True)...")
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "diag.db"
        init_beam(db_path)
        beam = BeamMemory(session_id="diag", db_path=db_path, use_cloud=True)

        # Close the connection even when a step below raises, so the database
        # file is released before the temporary directory is removed.
        try:
            t0 = time.perf_counter()
            ingest_conversation(beam, conv["messages"])
            print(f" Done in {time.perf_counter()-t0:.1f}s, DB: {os.path.getsize(db_path)/1024:.0f}KB")

            facts_count = beam.conn.execute("SELECT COUNT(*) FROM facts").fetchone()[0]
            print(f" Facts stored: {facts_count}")

            # Show random sample facts
            samples = beam.conn.execute(
                "SELECT subject, predicate, object FROM facts ORDER BY RANDOM() LIMIT 5"
            ).fetchall()
            print(" Sample facts:")
            for s in samples:
                print(f" [{s['predicate']}] {s['subject']}: {s['object'][:80]}")

            # Diagnose each question
            print("\n--- FACT RECALL PRECISION DIAGNOSTIC ---\n")
            judge = JudgeLLM()

            for qi, q in enumerate(questions):
                question = q.get("question", q.get("text", ""))
                ability = q.get("ability", "?")

                facts = beam.fact_recall(question, top_k=30)

                if not facts:
                    print(f"[{qi}] {ability}: 0 facts returned, question: {question[:80]}")
                    total_precision_5.append(0.0)
                    total_precision_10.append(0.0)
                    total_precision_30.append(0.0)
                    total_coverage.append(0.0)
                    continue

                # Judge top 30 for relevance
                relevant_indices = {
                    fi
                    for fi, f in enumerate(facts[:30])
                    if judge.judge_relevance(question, f["content"])
                }

                p5 = _precision_at_k(relevant_indices, 5, len(facts))
                p10 = _precision_at_k(relevant_indices, 10, len(facts))
                p30 = _precision_at_k(relevant_indices, 30, len(facts))

                total_precision_5.append(p5)
                total_precision_10.append(p10)
                total_precision_30.append(p30)
                total_coverage.append(len(relevant_indices))

                print(f"[{qi}] {ability}: p@5={p5:.0%} p@10={p10:.0%} p@30={p30:.0%} "
                      f"({len(relevant_indices)}/{len(facts)} relevant) "
                      f"Q: {question[:60]}")
        finally:
            beam.conn.close()

    # Aggregate (every list holds exactly one entry per question)
    print(f"\n{'='*60}")
    print(f"AGGREGATE (n={len(questions)} questions):")
    avg_p5 = sum(total_precision_5) / len(total_precision_5)
    avg_p10 = sum(total_precision_10) / len(total_precision_10)
    avg_p30 = sum(total_precision_30) / len(total_precision_30)
    avg_cov = sum(total_coverage) / len(total_coverage)

    print(f" Precision@5: {avg_p5:.1%}")
    print(f" Precision@10: {avg_p10:.1%}")
    print(f" Precision@30: {avg_p30:.1%}")
    print(f" Avg relevant facts: {avg_cov:.1f}")
    print(f" Zero-recall questions: {sum(1 for c in total_coverage if c == 0)}/{len(total_coverage)}")
    print(f" p@5 ≥ 50%: {sum(1 for p in total_precision_5 if p >= 0.5)}/{len(total_precision_5)}")

    verdict = "GOOD" if avg_p10 >= 0.5 else "BROKEN"
    print(f"\n VERDICT: Recall pipeline is {verdict}")
| #131 | |
# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
| #134 |