repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources15d ago| #1 | #!/usr/bin/env python3 |
| #2 | """NAI-1 Benchmark — Chat Normalization Impact |
| #3 | |
| #4 | Generates a messy chat conversation (contractions, emojis, filler, fragments), |
| #5 | ingests it with and without normalization, and measures coverage delta. |
| #6 | """ |
| #7 | import json, time |
| #8 | from pathlib import Path |
| #9 | from mnemosyne.core.beam import BeamMemory |
| #10 | from mnemosyne.core.chat_normalize import normalize_chat, extraction_rate |
| #11 | |
# ── Same factual content as bench_nai0, but in messy chat form ──

# Each entry is a (role, text) pair. run_bench() ingests the entries in list
# order, using the role both as the "[role]" prefix of the stored text and as
# the `source` of the memory; "user" turns get importance 0.6, others 0.5.
MESSY_CONVERSATION = [
    ("user", "yo wassup can u help me setup my dev thingy lol"),
    ("assistant", "yea sure!! first u gotta install python 3.12. download it from python.org"),
    ("user", "im on ubuntu 24.04 bro. also i kinda prefer that dark mode thing for everything tbh fr"),
    ("assistant", "got it. for ubuntu use apt: sudo apt install python3.12. also i noted ur dark pref np"),
    ("user", "wbu that container thing? docker or whatever, i need to containerize my stuff lol"),
    ("assistant", "docker is separate. sudo apt install docker.io. make sure to add ur user to the docker group"),
    ("user", "the api im building gotta handle like 10k requests per sec omg fr fr"),
    ("assistant", "thats a lot! u'll want async python w fastapi behind nginx reverse proxy"),
    ("user", "actually nvm changed my mind lmao. gonna use go instead of python for the api"),
    ("assistant", "ok bet. switching to go. install from golang.org, use gin or chi for the web framework"),
    ("user", "yo last week i deployed a test server n the response time was like ~250ms"),
    ("assistant", "250ms is solid. for 10k rps in go u should get <50ms. want help optimizing?"),
    ("user", "yea pls. btw is my system secure enough for prod lol?"),
    ("assistant", "run a security audit: check open ports w nmap, make sure all services run as non-root tbh"),
    ("user", "aight bet. one more thing fr - im going to sf next month for a conference"),
    ("assistant", "nice!! sf in june is great. i'll note that. for the trip, finish deploy first"),
]
| #32 | |
# Same questions but adapted for messy context
#
# Each entry is a (question, expected_keywords) pair. score_answer() matches
# every keyword as a case-insensitive substring of the recalled text and
# returns the fraction of keywords found, so a full score requires ALL listed
# keywords to be present, not just one of them.
QUESTIONS = [
    ("What operating system is the user on?", ["Ubuntu", "24.04"]),
    ("What display preference does the user have?", ["dark mode"]),
    ("What tool does the user need to containerize services?", ["Docker"]),
    ("What language did the user switch to for the API?", ["Go", "golang"]),
    ("What was the response time of the test server?", ["250ms", "250"]),
    ("What city is the user traveling to?", ["San Francisco", "SF"]),
    ("What framework was suggested for Go?", ["gin", "chi"]),
    ("What security steps were recommended?", ["nmap", "non-root"]),
    ("What is the deployment OS?", ["Ubuntu", "24.04"]),
    ("What Python framework was suggested?", ["FastAPI"]),
    ("When was the test server deployed?", ["last week"]),
    ("When is the conference?", ["next month"]),
    ("What was discussed before San Francisco?", ["security", "secure"]),
    ("What language was chosen after mind change?", ["Go", "golang"]),
    ("What throughput is needed?", ["10k", "10,000"]),
    ("What should response time be in Go?", ["50ms", "50"]),
    ("What two installation methods were mentioned?", ["apt", "python.org", "golang.org"]),
    ("What was the last topic?", ["conference", "deploy"]),
    ("What should the user install on Ubuntu?", ["apt", "python3.12"]),
    # NOTE(review): "Go" and "go" are the same keyword once score_answer()
    # lowercases them, so this entry counts that match twice (unlike the other
    # Go questions above) — confirm this weighting is intended.
    ("What change did the user make to their API plans?", ["Go", "golang", "go"]),
]
| #56 | |
| #57 | |
def score_answer(predicted, expected):
    """Return the fraction of ``expected`` keywords present in ``predicted``.

    Matching is a case-insensitive substring test, so a keyword counts as
    found even when it only appears inside a longer word.

    Args:
        predicted: Text to search (e.g. concatenated recall results).
        expected: Sequence of keyword strings to look for.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when ``expected`` is empty.
    """
    haystack = predicted.lower()
    if not expected:
        return 0.0
    found = [keyword for keyword in expected if keyword.lower() in haystack]
    return len(found) / len(expected)
| #62 | |
| #63 | |
def run_bench(label, messages, normalize=False):
    """Ingest ``messages`` into a throwaway BeamMemory and score recall on QUESTIONS.

    A fresh temporary ``.db`` file backs the memory for the duration of the
    run; the connection is closed and the file removed afterwards, even when
    ingestion or recall raises.

    Args:
        label: Short tag used in the session id and echoed back in the result.
        messages: Iterable of ``(role, text)`` pairs, ingested in order.
        normalize: When true, each text is passed through ``normalize_chat``
            first; messages for which it returns ``None`` are skipped and
            counted as dropped.

    Returns:
        A dict with keys ``label``, ``ingest_ms`` (total ingest time, ms),
        ``dropped`` (messages skipped by normalization), ``avg_top5`` (mean
        score over the first 5 recall results), ``avg_coverage`` (mean score
        over all recalled results) and ``p50_ms`` (median recall latency, ms).
    """
    import tempfile

    # delete=False: we only want a unique path; the file must outlive the
    # handle so BeamMemory can open it itself.
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
        db_path = Path(tmp.name)

    beam = None
    try:
        beam = BeamMemory(session_id=f"bench_nai1_{label}", db_path=db_path)

        # Ingest with optional normalization. perf_counter() is monotonic, so
        # the measured interval cannot be skewed by wall-clock adjustments.
        t0 = time.perf_counter()
        dropped = 0
        for role, msg in messages:
            text = msg
            if normalize:
                text = normalize_chat(msg)
                if text is None:
                    dropped += 1
                    continue
            importance = 0.6 if role == "user" else 0.5
            beam.remember(f"[{role}] {text}", source=role, importance=importance)
        ingest_ms = round((time.perf_counter() - t0) * 1000)

        # Answer questions
        scores = []
        coverage_scores = []
        latencies = []

        for question, expected in QUESTIONS:
            t0 = time.perf_counter()
            results = beam.recall(question, top_k=40)
            latencies.append(round((time.perf_counter() - t0) * 1000))

            # "top5" looks at the head of the ranking only; "full" looks at
            # everything recalled. Content is truncated per result (100 / 200
            # characters) before keyword matching.
            top5 = " ".join(r.get("content", "")[:100] for r in results[:5])
            full = " ".join(r.get("content", "")[:200] for r in results)

            scores.append(score_answer(top5, expected))
            coverage_scores.append(score_answer(full, expected))

        def _mean(values):
            # Guard against an empty question set instead of dividing by zero.
            return round(sum(values) / len(values), 3) if values else 0.0

        return {
            "label": label,
            "ingest_ms": ingest_ms,
            "dropped": dropped,
            "avg_top5": _mean(scores),
            "avg_coverage": _mean(coverage_scores),
            # Upper median for an even number of samples, as before.
            "p50_ms": sorted(latencies)[len(latencies) // 2] if latencies else 0,
        }
    finally:
        # Close before unlinking: an open handle blocks deletion on some
        # platforms. `beam` is None if the constructor itself failed.
        if beam is not None:
            beam.conn.close()
        db_path.unlink(missing_ok=True)
| #113 | |
| #114 | |
if __name__ == "__main__":
    # Script entry point: report how many messages survive normalization, run
    # the benchmark once on raw text and once on normalized text, then print
    # the difference in average coverage between the two runs.
    banner = "=" * 60

    print(banner)
    print("NAI-1 Benchmark — Chat Normalization Impact")
    print(banner)

    # Extraction rate is computed on the message texts alone (roles dropped).
    raw_msgs = [text for _role, text in MESSY_CONVERSATION]
    rate = extraction_rate(raw_msgs)
    print(f"Raw messages: {rate['total']}")
    print(f"Survived normalization: {rate['survived']} ({rate['rate']:.0%})")
    print(f"Dropped: {rate['dropped_samples']}")
    print()

    # Baseline: messages stored exactly as written.
    print("--- RAW (no normalization) ---")
    raw = run_bench("raw", MESSY_CONVERSATION, normalize=False)
    print(f" Top-5: {raw['avg_top5']:.3f} Coverage: {raw['avg_coverage']:.3f}")
    print(f" Ingest: {raw['ingest_ms']}ms P50: {raw['p50_ms']}ms")

    print()

    # Treatment: messages cleaned by normalize_chat before storage.
    print("--- NORMALIZED ---")
    norm = run_bench("norm", MESSY_CONVERSATION, normalize=True)
    print(f" Top-5: {norm['avg_top5']:.3f} Coverage: {norm['avg_coverage']:.3f}")
    print(f" Ingest: {norm['ingest_ms']}ms P50: {norm['p50_ms']}ms")
    print(f" Dropped: {norm['dropped']}/{len(MESSY_CONVERSATION)} messages")

    # Positive delta means normalization improved coverage.
    delta = norm['avg_coverage'] - raw['avg_coverage']
    print()
    print(banner)
    print(f"COVERAGE DELTA: {delta:+.3f} (normalized vs raw)")
    print(banner)
| #145 |