repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 15d ago
| #1 | #!/usr/bin/env python3 |
| #2 | """NAI-0 Minimal Benchmark — 20 questions across 1 synthetic conversation. |
| #3 | |
| #4 | Quick and dirty: ingest 1 fake user+assistant chat, ask 20 questions, |
| #5 | measure keyword-overlap accuracy before and after Phase 0 optimizations. |
| #6 | """ |
| #7 | import json |
| #8 | import time |
| #9 | from pathlib import Path |
| #10 | from mnemosyne.core.beam import BeamMemory |
| #11 | |
# ── Synthetic conversation ──
# 16 messages (8 user/assistant exchanges), stored as (role, text) pairs in
# chronological order. The text is seeded with the facts, time references and
# cross-turn links that QUESTIONS asks about.

CONVERSATION = [
    # Environment setup
    ("user", "Hey, can you help me set up my development environment?"),
    ("assistant", "Sure! First, let's install Python 3.12. Download it from python.org."),
    ("user", "I'm on Ubuntu 24.04. I prefer using dark mode for everything."),
    ("assistant", "Got it. For Ubuntu, use apt: sudo apt install python3.12. Also noted your dark mode preference."),
    # Containers
    ("user", "What about Docker? I need to containerize my services."),
    ("assistant", "Docker is separate. sudo apt install docker.io. Make sure to add your user to the docker group."),
    # Throughput target and the first stack suggestion
    ("user", "The API I'm building needs to handle about 10,000 requests per second."),
    ("assistant", "That's high throughput. You'll want async Python with FastAPI, behind an nginx reverse proxy."),
    # The user reverses the language choice
    ("user", "I changed my mind. I want to use Go instead of Python for the API."),
    ("assistant", "OK, switching to Go. Install Go from golang.org, then use gin or chi for the web framework."),
    # Relative-time reference ("last week")
    ("user", "Last week I deployed a test server and the response time was about 250ms."),
    ("assistant", "250ms is good. For 10K rps in Go, you should get under 50ms. Want help optimizing?"),
    # Security
    ("user", "Yes please. Is my system secure enough for production?"),
    ("assistant", "Run a security audit: check open ports with nmap, ensure all services run as non-root users."),
    # Relative-time reference ("next month")
    ("user", "Great. One more thing - I'm going to San Francisco next month for a conference."),
    ("assistant", "Nice! SF in June is great. I'll note that down. For the trip, we should finish the deploy first."),
]
| #32 | |
# Each entry is (question, keywords); an answer is scored by how many of the
# keywords it contains. The three groups below are concatenated in order into
# QUESTIONS: 10 factual, 5 temporal, 5 multi-hop — 20 in total.

# Factual (direct recall)
_FACTUAL_QUESTIONS = [
    ("What operating system is the user on?", ["Ubuntu", "24.04"]),
    ("What display preference does the user have?", ["dark mode"]),
    ("What tool does the user need to containerize services?", ["Docker"]),
    ("What Python feature was suggested for high throughput?", ["FastAPI", "async"]),
    ("What language did the user switch to for the API?", ["Go", "golang"]),
    ("What web framework was suggested for the new language?", ["gin", "chi"]),
    ("What was the response time of the test server?", ["250ms", "250"]),
    ("What does the user need to install Docker on Ubuntu?", ["apt", "apt install"]),
    ("What city is the user traveling to?", ["San Francisco"]),
    ("What type of application is the API expected to be?", ["high throughput", "10,000", "10K"]),
]

# Temporal
_TEMPORAL_QUESTIONS = [
    ("When did the user deploy the test server?", ["last week"]),
    ("When is the conference the user mentioned?", ["next month"]),
    ("What OS version is the user on?", ["Ubuntu", "24.04"]),
    ("What was discussed before the user mentioned San Francisco?", ["security", "secure"]),
    ("What was the last topic discussed?", ["conference", "San Francisco", "deploy"]),
]

# Multi-hop
_MULTI_HOP_QUESTIONS = [
    ("What language was chosen after the user changed their mind?", ["Go", "golang"]),
    ("What framework should be used for the new language choice?", ["gin", "chi"]),
    ("How should the user improve throughput in the new language?", ["optimizing", "50ms"]),
    ("What two installation methods were discussed?", ["apt", "python.org", "apt install", "golang.org"]),
    ("What security steps were recommended?", ["nmap", "non-root", "security audit"]),
]

QUESTIONS = _FACTUAL_QUESTIONS + _TEMPORAL_QUESTIONS + _MULTI_HOP_QUESTIONS
| #59 | |
| #60 | |
def score_answer(predicted: str, expected: list) -> float:
    """Return the fraction of ``expected`` keywords that occur in ``predicted``.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it appears inside a longer word (``"Go"`` matches ``"Got it"``).
    A score of 1.0 means every keyword was found; an empty ``expected`` list
    scores 0.0.
    """
    haystack = predicted.lower()
    if not expected:
        return 0.0
    found = [keyword for keyword in expected if keyword.lower() in haystack]
    return len(found) / len(expected)
| #66 | |
| #67 | |
def run_benchmark(label: str, top_k: int = 40, use_format: bool = False) -> dict:
    """Ingest the synthetic conversation into a throwaway store and score recall.

    Every message in ``CONVERSATION`` is stored with ``BeamMemory.remember``;
    each entry of ``QUESTIONS`` is then looked up with ``BeamMemory.recall``
    and the retrieved text is scored with ``score_answer``.

    Args:
        label: Run name; used in the session id and echoed in the result.
        top_k: Number of results requested from ``recall`` per question.
        use_format: Accepted for interface compatibility but never consulted
            in this function, so runs differ only through ``top_k``.

    Returns:
        A dict with:
            ``label``: the run name.
            ``avg_top5_score``: mean keyword score over the first five
                results, each truncated to 100 characters.
            ``avg_coverage``: mean keyword score over all results, each
                truncated to 200 characters.
            ``p50_latency_ms``: median ``recall`` latency in milliseconds.
            ``ingest_time_ms``: total time spent in ``remember`` calls.
            ``avg_results``: mean number of results returned per question.
            ``preview``: per-question details for the first three questions.
    """
    import os
    import statistics
    import tempfile

    # Only the path is needed; the handle is closed right away so the store
    # can open the file on its own (presumably SQLite, given the .db suffix).
    tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    db_path = Path(tmp.name)
    tmp.close()

    try:
        beam = BeamMemory(session_id=f"bench_nai0_{label}", db_path=db_path)
        try:
            # Ingest conversation. perf_counter is monotonic, so the measured
            # durations cannot be skewed by system clock adjustments.
            t_start = time.perf_counter()
            for role, msg in CONVERSATION:
                beam.remember(
                    f"[{role}] {msg}",
                    source=role,
                    importance=0.6 if role == "user" else 0.5,
                )
            ingest_time = time.perf_counter() - t_start

            # Answer questions
            scores = []
            coverage_scores = []  # keyword score over ALL retrieved results
            latencies = []
            results_preview = []
            total_results = 0

            for question, expected in QUESTIONS:
                t0 = time.perf_counter()
                results = beam.recall(question, top_k=top_k)
                recall_time = time.perf_counter() - t0
                latencies.append(recall_time)
                total_results += len(results)

                # Full context: every retrieved result, 200 chars each
                full_context = " ".join(r.get("content", "")[:200] for r in results)

                # Top-5 context: first five results, 100 chars each (kept
                # narrow so the number stays comparable with earlier runs)
                top5_content = " ".join(r.get("content", "")[:100] for r in results[:5])

                s5 = score_answer(top5_content, expected)
                sc = score_answer(full_context, expected)
                scores.append(s5)
                coverage_scores.append(sc)

                if len(results_preview) < 3:
                    results_preview.append({
                        "question": question[:50],
                        "top5_score": s5,
                        "coverage_score": sc,
                        "latency_ms": round(recall_time * 1000),
                        "results": len(results),
                    })
        finally:
            # Release the connection even when ingest or recall raised
            beam.conn.close()
    finally:
        # Always remove the temp database so failed runs do not litter tmp
        try:
            os.unlink(str(db_path))
        except FileNotFoundError:
            pass

    asked = len(scores)
    avg_score = sum(scores) / asked if asked else 0.0
    avg_coverage = sum(coverage_scores) / asked if asked else 0.0
    # A true median, so one slow (e.g. cold-start) recall does not skew "p50"
    p50_latency = statistics.median(latencies) if latencies else 0.0

    return {
        "label": label,
        "avg_top5_score": round(avg_score, 3),
        "avg_coverage": round(avg_coverage, 3),
        "p50_latency_ms": round(p50_latency * 1000),
        "ingest_time_ms": round(ingest_time * 1000),
        "avg_results": round(total_results / asked) if asked else 0,
        "preview": results_preview,
    }
| #134 | |
| #135 | |
if __name__ == "__main__":
    # Script entry point: run the benchmark twice (k=5 baseline, then k=40)
    # and print a per-run report followed by a comparison summary.
    rule = "=" * 60
    print(rule)
    print("NAI-0 Minimal Benchmark — Phase 0 Algorithmic Sprint")
    print(rule)
    print(f"Conversation: {len(CONVERSATION)} turns")
    print(f"Questions: {len(QUESTIONS)} (10 factual, 5 temporal, 5 multi-hop)")
    print()

    # (label, section header, top_k, use_format)
    run_configs = (
        ("baseline", "--- BASELINE (k=5, no formatting) ---", 5, False),
        ("phase0", "--- PHASE 0 (k=40, RRF, sandwich formatting) ---", 40, True),
    )

    reports = {}
    for run_label, header, k, fmt in run_configs:
        print(header)
        report = run_benchmark(run_label, top_k=k, use_format=fmt)
        reports[run_label] = report
        print(f" Top-5 Score: {report['avg_top5_score']:.3f}")
        print(f" Coverage: {report['avg_coverage']:.3f} (full k={k} context)")
        print(f" P50 Latency: {report['p50_latency_ms']}ms")
        print(f" Avg Results: {report['avg_results']}")
        for p in report["preview"]:
            print(f" Q: {p['question']}... => top5={p['top5_score']:.2f} cov={p['coverage_score']:.2f} ({p['latency_ms']}ms, {p['results']}r)")
        print()

    baseline = reports["baseline"]
    phase0 = reports["phase0"]
    delta_cov = phase0["avg_coverage"] - baseline["avg_coverage"]

    print(rule)
    # Relative figures are only printed when the baseline is non-zero;
    # otherwise the ratio is undefined and the absolute numbers are shown.
    if baseline["avg_coverage"] > 0:
        print(f"COVERAGE DELTA: {delta_cov:+.3f} ({delta_cov/baseline['avg_coverage']*100:+.1f}% vs baseline)")
    else:
        print(f"COVERAGE DELTA: {delta_cov:+.3f}")
    if baseline["avg_results"] > 0:
        print(f"RESULTS SCALE: {baseline['avg_results']} → {phase0['avg_results']} ({phase0['avg_results']/baseline['avg_results']:.1f}x more context)")
    else:
        print(f"RESULTS SCALE: {baseline['avg_results']} → {phase0['avg_results']}")
    print(rule)
| #170 |