repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources15d ago| #1 | #!/usr/bin/env python3 |
| #2 | """NAI-2 Re-ranking Benchmark — LLM cross-attention vs temporal baseline""" |
| #3 | import time, hashlib, random |
| #4 | from datetime import datetime, timedelta |
| #5 | from pathlib import Path |
| #6 | from mnemosyne.core.beam import BeamMemory |
| #7 | from mnemosyne.core.rerank import rerank as llm_rerank, rerank_available |
| #8 | |
now = datetime.now()
random.seed(42)  # fixed seed keeps the filler noise reproducible run to run

TIMELINE = []
# Signals (same as bench_nai2): (days_ago, role, content) facts the
# benchmark questions ask about.
signals = [
    (30, "user", "I use Python for backend development."),
    (28, "user", "Using MySQL as primary database."),
    (25, "user", "API throughput currently 500 rps."),
    (23, "user", "Switching backend from Python to Go language."),
    (20, "user", "Migrating database from MySQL to PostgreSQL."),
    (18, "user", "Throughput improved to 5000 rps after Go."),
    (15, "user", "Added Redis caching layer."),
    (12, "user", "Throughput now at 10000 rps with Redis."),
    (10, "user", "Deployed Go backend. Response time 250ms."),
    (8, "user", "Running security audit with nmap."),
    (6, "user", "All services running as non-root."),
    (4, "user", "Going to San Francisco next month."),
    (2, "user", "Conference is in July."),
    (1, "user", "Added gin framework for Go API."),
    (0, "user", "Final stack: Go + PostgreSQL + Redis + gin."),
    (29, "assistant", "Python and MySQL noted."),
    (24, "assistant", "Go migration confirmed. gin or chi?"),
    (19, "assistant", "PostgreSQL migration noted."),
    (14, "assistant", "Redis caching is smart."),
    (7, "assistant", "250ms response time is solid."),
]
# Filler: system log noise, 4-8 entries for each of the 31 days.
topics = [
    "checked weather", "read AI article", "had lunch", "updated OS",
    "watched tutorial", "fixed CSS bug", "cleaned logs", "email thread",
    "npm deps", "db backup", "tested endpoint", "reviewed PR",
    "unit tests", "CI pipeline", "race condition", "slow query",
    "error handling", "refactored code", "monitoring", "API docs",
]
for day in reversed(range(31)):
    entries_today = random.randint(4, 8)
    TIMELINE.extend(
        (day, "system", f"Log: {random.choice(topics)} on day {day}.")
        for _ in range(entries_today)
    )
# Oldest first (largest days_ago); the sort is stable, so for the same day
# the signals stay ahead of the filler.
TIMELINE = signals + TIMELINE
TIMELINE.sort(key=lambda entry: entry[0], reverse=True)
| #46 | |
# 10 key questions (speed over coverage).
# Each entry is (question, expected keywords, is_historical); run_bench
# leaves historical questions out of its "current" average.
QUESTIONS = [
    ("What language does user use NOW?", ["Go"], False),
    ("What database NOW?", ["PostgreSQL"], False),
    ("What is current throughput?", ["10000", "10K"], False),
    ("What caching is used?", ["Redis"], False),
    ("What framework with Go?", ["gin"], False),
    ("Where is conference?", ["San Francisco"], False),
    ("What was ORIGINAL language?", ["Python"], True),
    ("What was ORIGINAL database?", ["MySQL"], True),
    ("What was ORIGINAL throughput?", ["500"], True),
    ("What security tool?", ["nmap"], False),
]
| #60 | |
| #61 | |
def score_answer(predicted, expected):
    """Return the fraction of expected keywords present in ``predicted``.

    Matching is case-insensitive and anchored on word boundaries, so a
    keyword only counts when it appears as a standalone token. Plain
    substring matching would score "500" as found in "5000 rps" and "Go"
    as found in "Going", inflating the benchmark.

    Args:
        predicted: Text to search (the joined top results).
        expected: Keywords to look for; every keyword carries equal weight.

    Returns:
        Number of matched keywords divided by ``len(expected)``, or 0.0
        when ``expected`` is empty.
    """
    import re  # local import: the file header does not import re

    if not expected:
        return 0.0
    haystack = predicted.lower()
    hits = sum(
        1
        for kw in expected
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character still match correctly.
        if re.search(rf"(?<!\w){re.escape(kw.lower())}(?!\w)", haystack)
    )
    return hits / len(expected)
| #66 | |
| #67 | |
def setup_db():
    """Create a temporary database seeded with every TIMELINE entry.

    A fresh ``.db`` temp file is created, a ``BeamMemory`` is opened on it,
    and each ``(days_ago, role, content)`` tuple from ``TIMELINE`` is
    inserted into ``working_memory`` with a timestamp of ``now - days_ago``.

    Returns:
        ``(beam, db_path)``. The caller is responsible for closing
        ``beam.conn`` and deleting ``db_path``.
    """
    import tempfile

    # Only the unique path is needed; BeamMemory opens the file itself.
    tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    db_path = Path(tmp.name)
    tmp.close()

    beam = BeamMemory(session_id="bench_rerank", db_path=db_path)
    for idx, (days_ago, role, content) in enumerate(TIMELINE):
        ts = (now - timedelta(days=days_ago)).isoformat()
        # Salt the id with the row index. The previous salt, len(TIMELINE),
        # is constant, so filler lines with identical text collided and
        # INSERT OR IGNORE dropped them silently.
        row_id = hashlib.sha256(f"rr{idx}:{content}".encode()).hexdigest()[:16]
        beam.conn.execute(
            """INSERT OR IGNORE INTO working_memory
            (id, content, source, timestamp, session_id, importance, scope)
            VALUES (?, ?, ?, ?, ?, ?, 'global')""",
            (row_id, f"[{role}] {content}", role, ts, "bench_rerank", 0.5))
    beam.conn.commit()
    return beam, db_path
| #84 | |
| #85 | |
def run_bench(label, beam, use_rerank=False):
    """Run every entry in QUESTIONS against ``beam`` and summarise the result.

    For each question the top 20 memories are recalled, optionally
    re-ranked down to 5 by the LLM re-ranker, and the first five results
    (150 characters each) are scored with ``score_answer``.

    Args:
        label: Name stored in the returned summary.
        beam: Memory object exposing ``recall(...)``.
        use_rerank: When true, re-rank recalls that return more than five
            results; an empty re-rank result falls back to the raw recall.

    Returns:
        Dict with ``label``, ``avg`` (all questions), ``avg_curr``
        (non-historical questions only), ``p50_ms`` and ``total_ms``.
        The numeric fields are 0 when there is nothing to average.
    """
    all_scores, curr_scores = [], []
    latencies = []
    for question, expected, is_hist in QUESTIONS:
        # perf_counter is monotonic; time.time() can jump with clock
        # adjustments and skew the measured latency.
        t0 = time.perf_counter()
        results = beam.recall(question, top_k=20, temporal_weight=0.3, temporal_halflife=24)

        if use_rerank and len(results) > 5:
            reranked = llm_rerank(question, results, top_k=5, timeout=60)
            if reranked:
                results = reranked

        latencies.append(round((time.perf_counter() - t0) * 1000))
        top5 = " ".join(r.get("content", "")[:150] for r in results[:5])
        s = score_answer(top5, expected)
        all_scores.append(s)
        if not is_hist:
            curr_scores.append(s)

    return {
        "label": label,
        "avg": round(sum(all_scores) / len(all_scores), 3) if all_scores else 0,
        "avg_curr": round(sum(curr_scores) / len(curr_scores), 3) if curr_scores else 0,
        # Upper median for an even number of samples.
        "p50_ms": sorted(latencies)[len(latencies) // 2] if latencies else 0,
        "total_ms": sum(latencies),
    }
| #113 | |
| #114 | |
if __name__ == "__main__":
    # Script entry point: run the baseline and the re-ranked benchmark on
    # separate fresh databases, then print a keep/cut verdict based on the
    # change in the "current question" score.
    print(f"Timeline: {len(TIMELINE)} msgs, Questions: {len(QUESTIONS)}")
    print(f"LLM available: {rerank_available()}")
    print()

    beam, db_path = setup_db()
    try:
        print("--- BASELINE (temporal, no re-rank, k=20) ---")
        bl = run_bench("baseline", beam)
        print(f" Avg: {bl['avg']:.3f} Current: {bl['avg_curr']:.3f} P50: {bl['p50_ms']}ms Total: {bl['total_ms']}ms")
    finally:
        # Always remove the temp database, even if the run raised.
        beam.conn.close()
        db_path.unlink()

    beam, db_path = setup_db()
    try:
        print()
        print("--- RE-RANKED (temporal + LLM re-rank, k=20→5) ---")
        rr = run_bench("rerank", beam, use_rerank=True)
        print(f" Avg: {rr['avg']:.3f} Current: {rr['avg_curr']:.3f} P50: {rr['p50_ms']}ms Total: {rr['total_ms']}ms")

        dc = rr['avg_curr'] - bl['avg_curr']
        print()
        print(f"CURRENT Q DELTA: {dc:+.3f}")
        # Re-ranking must gain more than 3 percentage points to be kept.
        if dc > 0.03:
            print("VERDICT: Keep re-ranking (>3pp improvement)")
        else:
            print(f"VERDICT: Cut re-ranking (<3pp, delta={dc:+.3f})")
    finally:
        beam.conn.close()
        db_path.unlink()
| #142 |