my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""NAI-0 Minimal Benchmark — 20 questions across 1 synthetic conversation.
#3
#4	Quick and dirty: ingest 1 fake user+assistant chat, ask 20 questions,
#5	measure keyword-overlap accuracy before and after Phase 0 optimizations.
#6	"""
#7	import json
#8	import time
#9	from pathlib import Path
#10	from mnemosyne.core.beam import BeamMemory
#11
# ── Synthetic Conversation: 16 turns (8 user/assistant exchanges), 5 factual, 5 temporal, 5 multi-hop signals ──
#13
# One entry per exchange: (what the user said, how the assistant replied).
_EXCHANGES = [
    (
        "Hey, can you help me set up my development environment?",
        "Sure! First, let's install Python 3.12. Download it from python.org.",
    ),
    (
        "I'm on Ubuntu 24.04. I prefer using dark mode for everything.",
        "Got it. For Ubuntu, use apt: sudo apt install python3.12. Also noted your dark mode preference.",
    ),
    (
        "What about Docker? I need to containerize my services.",
        "Docker is separate. sudo apt install docker.io. Make sure to add your user to the docker group.",
    ),
    (
        "The API I'm building needs to handle about 10,000 requests per second.",
        "That's high throughput. You'll want async Python with FastAPI, behind an nginx reverse proxy.",
    ),
    (
        "I changed my mind. I want to use Go instead of Python for the API.",
        "OK, switching to Go. Install Go from golang.org, then use gin or chi for the web framework.",
    ),
    (
        "Last week I deployed a test server and the response time was about 250ms.",
        "250ms is good. For 10K rps in Go, you should get under 50ms. Want help optimizing?",
    ),
    (
        "Yes please. Is my system secure enough for production?",
        "Run a security audit: check open ports with nmap, ensure all services run as non-root users.",
    ),
    (
        "Great. One more thing - I'm going to San Francisco next month for a conference.",
        "Nice! SF in June is great. I'll note that down. For the trip, we should finish the deploy first.",
    ),
]

# Flatten the exchanges into alternating ("user", text) / ("assistant", text)
# turns, keeping the order in which they were spoken.
CONVERSATION = [
    turn
    for user_text, assistant_text in _EXCHANGES
    for turn in (("user", user_text), ("assistant", assistant_text))
]
#32
# Benchmark questions, grouped by category. Every entry pairs a question with
# the keywords that are looked for in the retrieved text.

# Factual: direct recall of a single stated fact (10 questions).
_FACTUAL_QUESTIONS = [
    ("What operating system is the user on?", ["Ubuntu", "24.04"]),
    ("What display preference does the user have?", ["dark mode"]),
    ("What tool does the user need to containerize services?", ["Docker"]),
    ("What Python feature was suggested for high throughput?", ["FastAPI", "async"]),
    ("What language did the user switch to for the API?", ["Go", "golang"]),
    ("What web framework was suggested for the new language?", ["gin", "chi"]),
    ("What was the response time of the test server?", ["250ms", "250"]),
    ("What does the user need to install Docker on Ubuntu?", ["apt", "apt install"]),
    ("What city is the user traveling to?", ["San Francisco"]),
    ("What type of application is the API expected to be?", ["high throughput", "10,000", "10K"]),
]

# Temporal (5 questions).
_TEMPORAL_QUESTIONS = [
    ("When did the user deploy the test server?", ["last week"]),
    ("When is the conference the user mentioned?", ["next month"]),
    ("What OS version is the user on?", ["Ubuntu", "24.04"]),
    ("What was discussed before the user mentioned San Francisco?", ["security", "secure"]),
    ("What was the last topic discussed?", ["conference", "San Francisco", "deploy"]),
]

# Multi-hop (5 questions).
_MULTI_HOP_QUESTIONS = [
    ("What language was chosen after the user changed their mind?", ["Go", "golang"]),
    ("What framework should be used for the new language choice?", ["gin", "chi"]),
    ("How should the user improve throughput in the new language?", ["optimizing", "50ms"]),
    ("What two installation methods were discussed?", ["apt", "python.org", "apt install", "golang.org"]),
    ("What security steps were recommended?", ["nmap", "non-root", "security audit"]),
]

# 20 questions in total, in category order: factual, temporal, multi-hop.
QUESTIONS = [*_FACTUAL_QUESTIONS, *_TEMPORAL_QUESTIONS, *_MULTI_HOP_QUESTIONS]
#59
#60
def score_answer(predicted: str, expected: list) -> float:
    """Return the fraction of expected keywords contained in ``predicted``.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it occurs inside a longer word. The result is 1.0 when every keyword
    is present and 0.0 when none is, or when ``expected`` is empty.
    """
    haystack = predicted.lower()
    if not expected:
        return 0.0
    found = [keyword for keyword in expected if keyword.lower() in haystack]
    return len(found) / len(expected)
#66
#67
def run_benchmark(label: str, top_k: int = 40, use_format: bool = False) -> dict:
    """Ingest CONVERSATION into a fresh BeamMemory and score recall on QUESTIONS.

    A throwaway ``.db`` file is created for the run. The memory connection is
    closed and the file removed afterwards, even if ingestion or recall raises.

    Args:
        label: Run name; used in the session id and echoed in the result.
        top_k: Number of results requested from ``beam.recall`` per question.
        use_format: Accepted for interface compatibility. This function does
            not read it, so two runs differ only by ``top_k``.

    Returns:
        A dict with the keys ``label``, ``avg_top5_score``, ``avg_coverage``,
        ``p50_latency_ms`` (median per-question recall latency),
        ``ingest_time_ms``, ``avg_results`` and ``preview`` (details for the
        first three questions).
    """
    import os
    import statistics
    import tempfile

    # Reserve a unique path; the handle is closed right away so that
    # BeamMemory can open the file itself.
    tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    db_path = Path(tmp.name)
    tmp.close()

    beam = None
    try:
        beam = BeamMemory(session_id=f"bench_nai0_{label}", db_path=db_path)

        # Ingest conversation. perf_counter is monotonic and high-resolution,
        # which makes it the appropriate clock for short intervals.
        t_start = time.perf_counter()
        for role, msg in CONVERSATION:
            importance = 0.6 if role == "user" else 0.5
            beam.remember(f"[{role}] {msg}", source=role, importance=importance)
        ingest_time = time.perf_counter() - t_start

        # Answer questions
        scores = []
        coverage_scores = []  # Keyword score over ALL retrieved results
        latencies = []
        results_preview = []
        total_results = 0

        for question, expected in QUESTIONS:
            t0 = time.perf_counter()
            results = beam.recall(question, top_k=top_k)
            recall_time = time.perf_counter() - t0
            latencies.append(recall_time)
            total_results += len(results)

            # Full context: every retrieved result, 200 chars each.
            full_context = " ".join(r.get("content", "")[:200] for r in results)

            # Top-5 score (kept for comparison): first five results, 100 chars each.
            top5_content = " ".join(r.get("content", "")[:100] for r in results[:5])
            s5 = score_answer(top5_content, expected)
            scores.append(s5)

            # Coverage score: search ALL results, not just top-5.
            sc = score_answer(full_context, expected)
            coverage_scores.append(sc)

            # Keep details for the first three questions only.
            if len(results_preview) < 3:
                results_preview.append({
                    "question": question[:50],
                    "top5_score": s5,
                    "coverage_score": sc,
                    "latency_ms": round(recall_time * 1000),
                    "results": len(results),
                })
    finally:
        # Always release the connection and delete the temp database,
        # including when remember()/recall() raised.
        if beam is not None:
            beam.conn.close()
        try:
            os.unlink(str(db_path))
        except FileNotFoundError:
            pass

    # Guard the aggregates so an empty QUESTIONS list yields zeros rather
    # than a ZeroDivisionError / StatisticsError.
    avg_score = sum(scores) / len(scores) if scores else 0.0
    avg_coverage = sum(coverage_scores) / len(coverage_scores) if coverage_scores else 0.0
    median_latency = statistics.median(latencies) if latencies else 0.0
    avg_results = round(total_results / len(latencies)) if latencies else 0

    return {
        "label": label,
        "avg_top5_score": round(avg_score, 3),
        "avg_coverage": round(avg_coverage, 3),
        # True median, as the key name promises.
        "p50_latency_ms": round(median_latency * 1000),
        "ingest_time_ms": round(ingest_time * 1000),
        "avg_results": avg_results,
        "preview": results_preview,
    }
#134
#135
def _print_run_summary(result: dict, top_k: int) -> None:
    """Print the metric lines and per-question previews for one benchmark run.

    Args:
        result: Dict returned by ``run_benchmark``.
        top_k: The ``top_k`` the run used; shown in the coverage line.
    """
    print(f" Top-5 Score: {result['avg_top5_score']:.3f}")
    print(f" Coverage: {result['avg_coverage']:.3f} (full k={top_k} context)")
    print(f" P50 Latency: {result['p50_latency_ms']}ms")
    print(f" Avg Results: {result['avg_results']}")
    for p in result["preview"]:
        print(f" Q: {p['question']}... => top5={p['top5_score']:.2f} cov={p['coverage_score']:.2f} ({p['latency_ms']}ms, {p['results']}r)")


def main() -> None:
    """Run the baseline (k=5) and Phase 0 (k=40) benchmarks and print a comparison."""
    print("=" * 60)
    print("NAI-0 Minimal Benchmark — Phase 0 Algorithmic Sprint")
    print("=" * 60)
    print(f"Conversation: {len(CONVERSATION)} turns")
    print(f"Questions: {len(QUESTIONS)} (10 factual, 5 temporal, 5 multi-hop)")
    print()

    # Baseline: old k=5, no formatting
    print("--- BASELINE (k=5, no formatting) ---")
    baseline = run_benchmark("baseline", top_k=5, use_format=False)
    _print_run_summary(baseline, top_k=5)

    print()
    print("--- PHASE 0 (k=40, RRF, sandwich formatting) ---")
    phase0 = run_benchmark("phase0", top_k=40, use_format=True)
    _print_run_summary(phase0, top_k=40)

    delta_cov = phase0["avg_coverage"] - baseline["avg_coverage"]
    print()
    print("=" * 60)
    # Relative changes are only meaningful against a non-zero baseline.
    if baseline["avg_coverage"] > 0:
        print(f"COVERAGE DELTA: {delta_cov:+.3f} ({delta_cov/baseline['avg_coverage']*100:+.1f}% vs baseline)")
    else:
        print(f"COVERAGE DELTA: {delta_cov:+.3f}")
    # Same guard for the result-count ratio: a baseline that retrieved
    # nothing would otherwise raise ZeroDivisionError.
    if baseline["avg_results"] > 0:
        print(f"RESULTS SCALE: {baseline['avg_results']} → {phase0['avg_results']} ({phase0['avg_results']/baseline['avg_results']:.1f}x more context)")
    else:
        print(f"RESULTS SCALE: {baseline['avg_results']} → {phase0['avg_results']}")
    print("=" * 60)


if __name__ == "__main__":
    main()
#170

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public