my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""NAI-1 Benchmark — Chat Normalization Impact
#3
#4	Generates a messy chat conversation (contractions, emojis, filler, fragments),
#5	ingests it with and without normalization, and measures coverage delta.
#6	"""
#7	import json, time
#8	from pathlib import Path
#9	from mnemosyne.core.beam import BeamMemory
#10	from mnemosyne.core.chat_normalize import normalize_chat, extraction_rate
#11
#12	# ── Same factual content as bench_nai0, but in messy chat form ──
#13
#14	MESSY_CONVERSATION = [
#15	("user", "yo wassup can u help me setup my dev thingy lol"),
#16	("assistant", "yea sure!! first u gotta install python 3.12. download it from python.org"),
#17	("user", "im on ubuntu 24.04 bro. also i kinda prefer that dark mode thing for everything tbh fr"),
#18	("assistant", "got it. for ubuntu use apt: sudo apt install python3.12. also i noted ur dark pref np"),
#19	("user", "wbu that container thing? docker or whatever, i need to containerize my stuff lol"),
#20	("assistant", "docker is separate. sudo apt install docker.io. make sure to add ur user to the docker group"),
#21	("user", "the api im building gotta handle like 10k requests per sec omg fr fr"),
#22	("assistant", "thats a lot! u'll want async python w fastapi behind nginx reverse proxy"),
#23	("user", "actually nvm changed my mind lmao. gonna use go instead of python for the api"),
#24	("assistant", "ok bet. switching to go. install from golang.org, use gin or chi for the web framework"),
#25	("user", "yo last week i deployed a test server n the response time was like ~250ms"),
#26	("assistant", "250ms is solid. for 10k rps in go u should get <50ms. want help optimizing?"),
#27	("user", "yea pls. btw is my system secure enough for prod lol?"),
#28	("assistant", "run a security audit: check open ports w nmap, make sure all services run as non-root tbh"),
#29	("user", "aight bet. one more thing fr - im going to sf next month for a conference"),
#30	("assistant", "nice!! sf in june is great. i'll note that. for the trip, finish deploy first"),
#31	]
#32
#33	# Same questions but adapted for messy context
# Evaluation set: (question, expected_keywords) pairs. run_bench() sends each
# question to recall() and score_answer() reports the fraction of the expected
# keywords found in the recalled text using case-insensitive substring
# matching. Keywords within one entry must therefore be distinct when
# lower-cased, otherwise a single match is counted more than once.
QUESTIONS = [
    ("What operating system is the user on?", ["Ubuntu", "24.04"]),
    ("What display preference does the user have?", ["dark mode"]),
    ("What tool does the user need to containerize services?", ["Docker"]),
    ("What language did the user switch to for the API?", ["Go", "golang"]),
    ("What was the response time of the test server?", ["250ms", "250"]),
    ("What city is the user traveling to?", ["San Francisco", "SF"]),
    ("What framework was suggested for Go?", ["gin", "chi"]),
    ("What security steps were recommended?", ["nmap", "non-root"]),
    ("What is the deployment OS?", ["Ubuntu", "24.04"]),
    ("What Python framework was suggested?", ["FastAPI"]),
    ("When was the test server deployed?", ["last week"]),
    ("When is the conference?", ["next month"]),
    ("What was discussed before San Francisco?", ["security", "secure"]),
    ("What language was chosen after mind change?", ["Go", "golang"]),
    ("What throughput is needed?", ["10k", "10,000"]),
    ("What should response time be in Go?", ["50ms", "50"]),
    ("What two installation methods were mentioned?", ["apt", "python.org", "golang.org"]),
    ("What was the last topic?", ["conference", "deploy"]),
    ("What should the user install on Ubuntu?", ["apt", "python3.12"]),
    # "go" was previously listed alongside "Go"; both lower-case to the same
    # keyword, so it was dropped to match the other Go questions above.
    ("What change did the user make to their API plans?", ["Go", "golang"]),
]
#56
#57
def score_answer(predicted, expected):
    """Return the fraction of expected keywords present in the predicted text.

    Each keyword is matched as a case-insensitive substring of ``predicted``.
    An empty ``expected`` collection scores 0.0.
    """
    haystack = predicted.lower()
    if not expected:
        return 0.0
    found = [keyword for keyword in expected if keyword.lower() in haystack]
    return len(found) / len(expected)
#62
#63
def run_bench(label, messages, normalize=False):
    """Ingest a conversation into a throwaway BeamMemory and score recall.

    Args:
        label: Short tag used in the session id and echoed in the result.
        messages: Iterable of ``(role, text)`` pairs, ingested in order.
        normalize: When true, each text is passed through ``normalize_chat``
            before ingestion; messages for which it returns ``None`` are
            skipped and counted as dropped.

    Returns:
        A dict with keys ``label``, ``ingest_ms`` (total ingestion time),
        ``dropped`` (messages skipped by normalization), ``avg_top5`` (mean
        keyword score over the first five results), ``avg_coverage`` (mean
        keyword score over all returned results) and ``p50_ms`` (median
        recall latency). The averages and the median are 0 when there are
        no questions to evaluate.
    """
    import os
    import tempfile

    # Reserve a unique on-disk path for the store. delete=False keeps the
    # file in place after the handle is closed so BeamMemory can open it.
    tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    db_path = Path(tmp.name)
    tmp.close()

    beam = None
    try:
        beam = BeamMemory(session_id=f"bench_nai1_{label}", db_path=db_path)

        # Ingest with optional normalization. perf_counter is monotonic, so
        # the measured interval cannot be distorted by system clock changes.
        dropped = 0
        started = time.perf_counter()
        for role, msg in messages:
            text = msg
            if normalize:
                text = normalize_chat(msg)
                if text is None:
                    dropped += 1
                    continue
            beam.remember(
                f"[{role}] {text}",
                source=role,
                importance=0.6 if role == "user" else 0.5,
            )
        ingest_ms = round((time.perf_counter() - started) * 1000)

        # Answer questions
        scores = []
        coverage_scores = []
        latencies = []
        for question, expected in QUESTIONS:
            started = time.perf_counter()
            results = beam.recall(question, top_k=40)
            latencies.append(round((time.perf_counter() - started) * 1000))

            # Top-5 looks at the first five results (100 chars each);
            # coverage looks at every returned result (200 chars each).
            top5 = " ".join(r.get("content", "")[:100] for r in results[:5])
            full = " ".join(r.get("content", "")[:200] for r in results)

            scores.append(score_answer(top5, expected))
            coverage_scores.append(score_answer(full, expected))
    finally:
        # Always release the connection and remove the temp file, even when
        # ingestion or recall raised part-way through.
        try:
            if beam is not None:
                beam.conn.close()
        finally:
            os.unlink(str(db_path))

    return {
        "label": label,
        "ingest_ms": ingest_ms,
        "dropped": dropped,
        "avg_top5": round(sum(scores) / len(scores), 3) if scores else 0.0,
        "avg_coverage": (
            round(sum(coverage_scores) / len(coverage_scores), 3)
            if coverage_scores
            else 0.0
        ),
        "p50_ms": sorted(latencies)[len(latencies) // 2] if latencies else 0,
    }
#113
#114
if __name__ == "__main__":
    banner = "=" * 60

    print(banner)
    print("NAI-1 Benchmark — Chat Normalization Impact")
    print(banner)

    # Extraction rate on the raw message texts (roles are not needed here)
    rate = extraction_rate([text for _role, text in MESSY_CONVERSATION])
    print(f"Raw messages: {rate['total']}")
    print(f"Survived normalization: {rate['survived']} ({rate['rate']:.0%})")
    print(f"Dropped: {rate['dropped_samples']}")
    print()

    # Baseline run: messages are ingested exactly as written
    print("--- RAW (no normalization) ---")
    raw = run_bench("raw", MESSY_CONVERSATION, normalize=False)
    print(f" Top-5: {raw['avg_top5']:.3f} Coverage: {raw['avg_coverage']:.3f}")
    print(f" Ingest: {raw['ingest_ms']}ms P50: {raw['p50_ms']}ms")

    # Comparison run: messages go through normalize_chat before ingestion
    print()
    print("--- NORMALIZED ---")
    norm = run_bench("norm", MESSY_CONVERSATION, normalize=True)
    print(f" Top-5: {norm['avg_top5']:.3f} Coverage: {norm['avg_coverage']:.3f}")
    print(f" Ingest: {norm['ingest_ms']}ms P50: {norm['p50_ms']}ms")
    print(f" Dropped: {norm['dropped']}/{len(MESSY_CONVERSATION)} messages")

    # Headline figure: change in coverage attributable to normalization
    delta = norm['avg_coverage'] - raw['avg_coverage']
    print()
    print(banner)
    print(f"COVERAGE DELTA: {delta:+.3f} (normalized vs raw)")
    print(banner)
#145

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public