my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	#!/usr/bin/env python3
#2	"""
#3	Phase 3 BEAM Benchmark - Direct Integration Test
#4	Runs the full benchmark pipeline correctly (avoiding module caching issues).
#5	"""
#6	import sys, os, tempfile, json, time
#7	from pathlib import Path
#8
#9	# Ensure we use the local mnemosyne
#10	sys.path.insert(0, str(Path(__file__).parent.parent))
#11
#12	from tools.evaluate_beam_end_to_end import (
#13	LLMClient, evaluate_conversation, load_beam_dataset,
#14	ingest_conversation, init_beam, BeamMemory, compute_ability_scores,
#15	print_sota_report, DEFAULT_TOP_K
#16	)
#17	from datetime import datetime, timezone
#18
# Output locations, anchored at the directory two levels above this file.
# The path is resolved first because `__file__` may be relative (e.g. a script
# run directly on Python < 3.9); the parent of a bare relative filename is ".",
# and the parent of "." is still ".", which would silently misplace results.
_RESULTS_DIR = Path(__file__).resolve().parent.parent / "results"
RESULTS_FILE = _RESULTS_DIR / "beam_e2e_results.json"
SUMMARY_FILE = _RESULTS_DIR / "beam_e2e_summary.json"
#21
def _print_score_report(question_results):
    """Print the mean score per ability (sorted by name), then the overall mean.

    Args:
        question_results: iterable of dicts carrying 'ability' and 'score' keys.
    """
    scores_by_ability = {}
    for entry in question_results:
        scores_by_ability.setdefault(entry['ability'], []).append(entry['score'])

    for ability, scores in sorted(scores_by_ability.items()):
        avg = sum(scores) / len(scores)
        print(f" {ability}: {avg:.1%} (n={len(scores)})")

    all_scores = [entry['score'] for entry in question_results]
    # An empty result list reports 0 instead of dividing by zero.
    overall = sum(all_scores) / len(all_scores) if all_scores else 0
    print(f" OVERALL: {overall:.1%}")


def _write_json(path, payload):
    """Write *payload* to *path* as indented JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def main():
    """Run the BEAM end-to-end evaluation on one 100K-scale conversation.

    Steps: load one conversation, ingest its messages into a BeamMemory backed
    by a database in a temporary directory, evaluate its questions with the
    same model acting as answerer and judge, print per-ability and overall
    scores, and write the detailed results and the ability summary to
    RESULTS_FILE and SUMMARY_FILE.

    Raises:
        SystemExit: if the dataset loader returns no conversation for the scale.
    """
    # Local import keeps this function self-contained.
    from contextlib import ExitStack

    scale = '100K'
    model = 'nvidia/nemotron-3-super-120b-a12b:free'

    print("=" * 80)
    print(" BEAM Phase 3 End-to-End Evaluation")
    print(f" Scale: {scale}, Sample: 1 conversation")
    print("=" * 80)

    # Load dataset
    print("\n[1/3] Loading dataset...")
    data = load_beam_dataset([scale], max_conversations=1)
    try:
        conv = data[scale][0]
    except (KeyError, IndexError):
        raise SystemExit(f"No conversations were loaded for scale {scale!r}") from None
    print(f" Loaded: {len(conv['messages'])} messages, {len(conv['questions'])} questions")

    # Create DB and ingest
    print("\n[2/3] Ingesting...")
    # ExitStack is listed second so it unwinds first: the DB connection and the
    # LLM clients are closed (even on error) before the temp directory is removed.
    with tempfile.TemporaryDirectory() as tmpdir, ExitStack() as cleanup:
        session_id = f"beam_{scale}_{conv['id']}"
        db_path = Path(tmpdir) / f"{session_id}.db"
        init_beam(db_path)
        beam = BeamMemory(session_id=session_id, db_path=db_path)
        # Look up `beam.conn` at close time, as the original end-of-run close did.
        cleanup.callback(lambda: beam.conn.close())

        stats = ingest_conversation(beam, conv['messages'])
        wm_total = beam.get_working_stats()['total']
        ep_total = beam.get_episodic_stats()['total']
        print(f" WM: {wm_total}, EP: {ep_total}, SP: {stats.get('sp_count', 0)}")

        # Create LLM clients (answerer and judge use the same model)
        llm = LLMClient(model=model)
        cleanup.callback(llm.close)
        judge_llm = LLMClient(model=model)
        cleanup.callback(judge_llm.close)

        # Evaluate
        print("\n[3/3] Evaluating...")
        result = evaluate_conversation(llm, judge_llm, beam, conv, set())

    # Compute scores
    ability_summary = compute_ability_scores([result])

    # Print results
    print("\n" + "=" * 80)
    print(" RESULTS")
    print("=" * 80)
    _print_score_report(result['results'])

    # Save
    metadata = {
        "date": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "scales": [scale],
        "total_conversations": 1,
    }
    _write_json(RESULTS_FILE, {"metadata": metadata, "results": [result]})
    _write_json(SUMMARY_FILE, ability_summary)

    print(f"\n Saved to: {RESULTS_FILE}")
    print(f" Summary: {SUMMARY_FILE}")
#93
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
#96

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public