repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 15d ago
| #1 | #!/usr/bin/env python3 |
| #2 | """ |
| #3 | Phase 3 BEAM Benchmark - Direct Integration Test |
| #4 | Runs the full benchmark pipeline correctly (avoiding module caching issues). |
| #5 | """ |
| #6 | import sys, os, tempfile, json, time |
| #7 | from pathlib import Path |
| #8 | |
| #9 | # Ensure we use the local mnemosyne |
| #10 | sys.path.insert(0, str(Path(__file__).parent.parent)) |
| #11 | |
| #12 | from tools.evaluate_beam_end_to_end import ( |
| #13 | LLMClient, evaluate_conversation, load_beam_dataset, |
| #14 | ingest_conversation, init_beam, BeamMemory, compute_ability_scores, |
| #15 | print_sota_report, DEFAULT_TOP_K |
| #16 | ) |
| #17 | from datetime import datetime, timezone |
| #18 | |
# Output locations: both JSON artifacts live in "results/" two levels above this file.
_RESULTS_DIR = Path(__file__).parent.parent / "results"
RESULTS_FILE = _RESULTS_DIR / "beam_e2e_results.json"  # per-question results + run metadata
SUMMARY_FILE = _RESULTS_DIR / "beam_e2e_summary.json"  # output of compute_ability_scores
| #21 | |
# Model used for both the answering and the judging client; the same value is
# written to the saved metadata so the record cannot drift from what was run.
MODEL_NAME = "nvidia/nemotron-3-super-120b-a12b:free"

# The single BEAM scale this script evaluates.
SCALE = "100K"


def _run_evaluation(conv, scale):
    """Ingest *conv* into a throwaway BEAM database and evaluate it.

    The database lives in a temporary directory that is removed on exit.
    The memory connection and both LLM clients are closed even when
    ingestion or evaluation raises, and before the directory is deleted.

    Args:
        conv: One conversation record; this function reads its ``'id'`` and
            ``'messages'`` keys and passes the whole record on to
            ``evaluate_conversation``.
        scale: Scale label used to name the session and the database file.

    Returns:
        Whatever ``evaluate_conversation`` returns for this conversation.
    """
    # Function-scope import: the file's import block does not provide it.
    from contextlib import closing

    print("\n[2/3] Ingesting...")
    with tempfile.TemporaryDirectory() as tmpdir:
        session = f"beam_{scale}_{conv['id']}"
        db_path = Path(tmpdir) / f"{session}.db"
        init_beam(db_path)
        beam = BeamMemory(session_id=session, db_path=db_path)

        # NOTE(review): beam.conn is presumably the handle on db_path; closing
        # it here guarantees it is released before tmpdir is cleaned up.
        with closing(beam.conn):
            stats = ingest_conversation(beam, conv['messages'])
            print(f" WM: {beam.get_working_stats()['total']}, EP: {beam.get_episodic_stats()['total']}, SP: {stats.get('sp_count', 0)}")

            with closing(LLMClient(model=MODEL_NAME)) as llm:
                with closing(LLMClient(model=MODEL_NAME)) as judge_llm:
                    print("\n[3/3] Evaluating...")
                    return evaluate_conversation(llm, judge_llm, beam, conv, set())


def _print_results(result):
    """Print the mean score per ability and the overall mean for *result*."""
    print("\n" + "=" * 80)
    print(" RESULTS")
    print("=" * 80)

    entries = result['results']

    # Group scores by ability; append in place instead of rebuilding the list.
    by_ability = {}
    for entry in entries:
        by_ability.setdefault(entry['ability'], []).append(entry['score'])

    for ab, scores in sorted(by_ability.items()):
        avg = sum(scores) / len(scores)
        print(f" {ab}: {avg:.1%} (n={len(scores)})")

    # No evaluated questions is reported as 0 rather than dividing by zero.
    overall = sum(entry['score'] for entry in entries) / len(entries) if entries else 0
    print(f" OVERALL: {overall:.1%}")


def _save_results(result, ability_summary, scale):
    """Write the full results (with run metadata) and the ability summary as JSON."""
    # Create each target's directory independently of the other.
    for target in (RESULTS_FILE, SUMMARY_FILE):
        target.parent.mkdir(parents=True, exist_ok=True)

    metadata = {
        "date": datetime.now(timezone.utc).isoformat(),
        "model": MODEL_NAME,
        "scales": [scale],
        "total_conversations": 1,
    }
    # Explicit encoding so the output does not depend on the platform default.
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        json.dump({"metadata": metadata, "results": [result]}, f, indent=2)

    with open(SUMMARY_FILE, "w", encoding="utf-8") as f:
        json.dump(ability_summary, f, indent=2)

    print(f"\n Saved to: {RESULTS_FILE}")
    print(f" Summary: {SUMMARY_FILE}")


def main():
    """Run the BEAM end-to-end evaluation on one 100K-scale conversation.

    Loads a single conversation, ingests it into a temporary BEAM database,
    evaluates it with the configured model acting as both answerer and judge,
    prints per-ability and overall scores, and writes RESULTS_FILE and
    SUMMARY_FILE.

    Raises:
        SystemExit: If the dataset loader returns no conversation for the
            requested scale.
    """
    print("=" * 80)
    print(" BEAM Phase 3 End-to-End Evaluation")
    print(" Scale: 100K, Sample: 1 conversation")
    print("=" * 80)

    # Load dataset
    print("\n[1/3] Loading dataset...")
    data = load_beam_dataset([SCALE], max_conversations=1)
    # Assumes the loader returns a mapping of scale -> list of conversations
    # (the original indexed it as data[scale][0]) — TODO confirm.
    conversations = data.get(SCALE) or []
    if not conversations:
        # Fail with a readable message instead of a bare IndexError/KeyError.
        sys.exit(f"No BEAM conversations available for scale {SCALE!r}")
    conv = conversations[0]
    print(f" Loaded: {len(conv['messages'])} messages, {len(conv['questions'])} questions")

    result = _run_evaluation(conv, SCALE)

    # Compute scores
    ability_summary = compute_ability_scores([result])

    _print_results(result)
    _save_results(result, ability_summary, SCALE)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
| #96 |