my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""NAI-2 Re-ranking Benchmark — LLM cross-attention vs temporal baseline"""
import hashlib
import random
import time
from datetime import datetime, timedelta
from pathlib import Path

from mnemosyne.core.beam import BeamMemory
from mnemosyne.core.rerank import rerank as llm_rerank, rerank_available
#8
# Reference instant for the whole run: every timeline entry is dated as
# "N days before `now`" (naive local time).
now = datetime.now()
# Fixed seed so the randomly generated filler is identical on every run.
random.seed(42)

TIMELINE = []
# Signals (same as bench_nai2): the facts the questions ask about, as
# (days_ago, role, content) tuples. Later entries supersede earlier ones
# (e.g. Python -> Go, MySQL -> PostgreSQL).
signals = [
    (30, "user", "I use Python for backend development."),
    (28, "user", "Using MySQL as primary database."),
    (25, "user", "API throughput currently 500 rps."),
    (23, "user", "Switching backend from Python to Go language."),
    (20, "user", "Migrating database from MySQL to PostgreSQL."),
    (18, "user", "Throughput improved to 5000 rps after Go."),
    (15, "user", "Added Redis caching layer."),
    (12, "user", "Throughput now at 10000 rps with Redis."),
    (10, "user", "Deployed Go backend. Response time 250ms."),
    (8, "user", "Running security audit with nmap."),
    (6, "user", "All services running as non-root."),
    (4, "user", "Going to San Francisco next month."),
    (2, "user", "Conference is in July."),
    (1, "user", "Added gin framework for Go API."),
    (0, "user", "Final stack: Go + PostgreSQL + Redis + gin."),
    (29, "assistant", "Python and MySQL noted."),
    (24, "assistant", "Go migration confirmed. gin or chi?"),
    (19, "assistant", "PostgreSQL migration noted."),
    (14, "assistant", "Redis caching is smart."),
    (7, "assistant", "250ms response time is solid."),
]
# Filler: 4-8 random "system" log lines per day (days 30 down to 0) that act
# as noise around the signals.
topics = [
    "checked weather", "read AI article", "had lunch", "updated OS",
    "watched tutorial", "fixed CSS bug", "cleaned logs", "email thread",
    "npm deps", "db backup", "tested endpoint", "reviewed PR",
    "unit tests", "CI pipeline", "race condition", "slow query",
    "error handling", "refactored code", "monitoring", "API docs",
]
for day in range(30, -1, -1):
    for _ in range(random.randint(4, 8)):
        TIMELINE.append((day, "system", f"Log: {random.choice(topics)} on day {day}."))
# Largest days_ago first, i.e. chronological order (oldest entry first).
# sorted() is stable, so within a day the signals precede the filler.
TIMELINE = sorted(signals + TIMELINE, key=lambda x: x[0], reverse=True)
#46
# 10 key questions (speed over coverage).
# Each entry is (question, expected_keywords, is_historical):
#   - expected_keywords are matched case-insensitively against the recalled text
#   - is_historical marks questions about the ORIGINAL (superseded) state
#     rather than the current one.
QUESTIONS = [
    ("What language does user use NOW?", ["Go"], False),
    ("What database NOW?", ["PostgreSQL"], False),
    ("What is current throughput?", ["10000", "10K"], False),
    ("What caching is used?", ["Redis"], False),
    ("What framework with Go?", ["gin"], False),
    ("Where is conference?", ["San Francisco"], False),
    ("What was ORIGINAL language?", ["Python"], True),
    ("What was ORIGINAL database?", ["MySQL"], True),
    ("What was ORIGINAL throughput?", ["500"], True),
    ("What security tool?", ["nmap"], False),
]
#60
#61
def score_answer(predicted, expected):
    """Return the fraction of expected keywords found in the predicted text.

    Matching is a case-insensitive substring test, so "500" also matches
    inside "5000" and "Go" inside "Going".

    Args:
        predicted: Text to search; ``None`` is treated as empty text.
        expected: Keywords to look for.

    Returns:
        ``hits / len(expected)`` as a float in [0.0, 1.0], or 0.0 when
        ``expected`` is empty.
    """
    if not expected:
        return 0.0
    haystack = (predicted or "").lower()
    # NOTE(review): every keyword counts toward the denominator, so a list of
    # alternative spellings such as ["10000", "10K"] scores 0.5 when only one
    # spelling is present -- confirm whether "any match" was intended.
    hits = sum(1 for kw in expected if kw.lower() in haystack)
    return hits / len(expected)
#66
#67
def setup_db():
    """Create a temporary database and load the whole TIMELINE into it.

    Each TIMELINE entry becomes one ``working_memory`` row, timestamped
    ``days_ago`` days before the module-level ``now``, with importance 0.5 and
    scope ``'global'``.

    Returns:
        A ``(beam, db_path)`` tuple: the ``BeamMemory`` instance and the
        ``Path`` of the temporary ``.db`` file. The caller is responsible for
        closing ``beam.conn`` and deleting the file.
    """
    import tempfile

    # Only the unique path is needed; the handle is closed right away so that
    # BeamMemory opens the file itself. delete=False keeps the file on disk.
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
        db_path = Path(tmp.name)

    beam = BeamMemory(session_id="bench_rerank", db_path=db_path)
    for idx, (days_ago, role, content) in enumerate(TIMELINE):
        ts = (now - timedelta(days=days_ago)).isoformat()
        # The row index is part of the id so that identical filler lines (same
        # topic drawn twice on the same day) get distinct ids. Hashing the
        # constant len(TIMELINE) made such rows collide, and INSERT OR IGNORE
        # then dropped them silently.
        row_id = hashlib.sha256(f"rr{idx}{content}".encode()).hexdigest()[:16]
        beam.conn.execute(
            """INSERT OR IGNORE INTO working_memory
            (id, content, source, timestamp, session_id, importance, scope)
            VALUES (?, ?, ?, ?, ?, ?, 'global')""",
            (row_id, f"[{role}] {content}", role, ts, "bench_rerank", 0.5),
        )
    beam.conn.commit()
    return beam, db_path
#84
#85
def run_bench(label, beam, use_rerank=False):
    """Run every question in QUESTIONS against ``beam`` and summarise the scores.

    For each question the top 20 memories are recalled; when ``use_rerank`` is
    true and more than 5 came back, they are passed to ``llm_rerank`` to pick 5.
    The first 150 characters of each of the top 5 results are joined and
    scored with ``score_answer``.

    Args:
        label: Name copied into the result under ``"label"``.
        beam: Object exposing ``recall(question, top_k=..., temporal_weight=...,
            temporal_halflife=...)`` that returns a sequence of dict-like results.
        use_rerank: Whether to apply the LLM re-ranking step.

    Returns:
        Dict with ``label``, ``avg`` (mean score over all questions),
        ``avg_curr`` (mean over non-historical questions), ``p50_ms`` (upper
        median latency in ms) and ``total_ms`` (summed latency in ms).
        All numeric fields are 0 when there are no questions.
    """
    all_scores, curr_scores = [], []
    latencies = []
    for question, expected, is_hist in QUESTIONS:
        # perf_counter is monotonic and high-resolution; wall-clock time.time()
        # can jump and skew measured latencies.
        t0 = time.perf_counter()
        results = beam.recall(question, top_k=20, temporal_weight=0.3, temporal_halflife=24)

        if use_rerank and len(results) > 5:
            reranked = llm_rerank(question, results, top_k=5, timeout=60)
            # Keep the recall order if the re-ranker returned nothing.
            if reranked:
                results = reranked

        # Latency covers recall plus (optional) re-ranking, in whole milliseconds.
        latencies.append(round((time.perf_counter() - t0) * 1000))
        top5 = " ".join(r.get("content", "")[:150] for r in results[:5])
        s = score_answer(top5, expected)
        all_scores.append(s)
        if not is_hist:
            curr_scores.append(s)

    if not all_scores:
        # No questions: avoid ZeroDivisionError / IndexError below.
        return {"label": label, "avg": 0.0, "avg_curr": 0, "p50_ms": 0, "total_ms": 0}

    return {
        "label": label,
        "avg": round(sum(all_scores) / len(all_scores), 3),
        "avg_curr": round(sum(curr_scores) / len(curr_scores), 3) if curr_scores else 0,
        "p50_ms": sorted(latencies)[len(latencies) // 2],
        "total_ms": sum(latencies),
    }
#113
#114
def _bench_on_fresh_db(label, use_rerank=False):
    """Run one benchmark pass on a freshly populated temp DB and clean it up.

    The connection is closed and the database file deleted even when the
    benchmark raises, so a failed run does not leave temp files behind.
    """
    beam, db_path = setup_db()
    try:
        return run_bench(label, beam, use_rerank=use_rerank)
    finally:
        try:
            beam.conn.close()
        finally:
            db_path.unlink()


if __name__ == "__main__":
    print(f"Timeline: {len(TIMELINE)} msgs, Questions: {len(QUESTIONS)}")
    print(f"LLM available: {rerank_available()}")
    print()

    print("--- BASELINE (temporal, no re-rank, k=20) ---")
    bl = _bench_on_fresh_db("baseline")
    print(f" Avg: {bl['avg']:.3f} Current: {bl['avg_curr']:.3f} P50: {bl['p50_ms']}ms Total: {bl['total_ms']}ms")

    print()
    print("--- RE-RANKED (temporal + LLM re-rank, k=20→5) ---")
    rr = _bench_on_fresh_db("rerank", use_rerank=True)
    print(f" Avg: {rr['avg']:.3f} Current: {rr['avg_curr']:.3f} P50: {rr['p50_ms']}ms Total: {rr['total_ms']}ms")

    # Both averages are already rounded to 3 decimals; round the difference too
    # so float noise (e.g. 0.93 - 0.9 == 0.030000000000000027) cannot tip an
    # exact 3pp delta over the threshold.
    dc = round(rr['avg_curr'] - bl['avg_curr'], 3)
    print()
    print(f"CURRENT Q DELTA: {dc:+.3f}")
    if dc > 0.03:
        print("VERDICT: Keep re-ranking (>3pp improvement)")
    else:
        print(f"VERDICT: Cut re-ranking (<3pp, delta={dc:+.3f})")
#142

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public