my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#!/usr/bin/env python3
"""
Mnemosyne Legacy Migration Script
=================================

Migrates memories from ephemeral/legacy databases to the PERSISTED canonical path.

CRITICAL for Fly.io / ephemeral VMs: Only ~/.hermes is persisted across restarts!
- Source: ~/.mnemosyne/data/mnemosyne.db (ephemeral — lost on restart)
- Target: ~/.hermes/mnemosyne/data/mnemosyne.db (persisted)

Also migrates legacy mnemosyne_native.db files from earlier versions.

Usage:
    python scripts/migrate_from_legacy.py [--dry-run] [--purge-tools]

Options:
    --dry-run      Preview changes without writing
    --purge-tools  Remove legacy auto-logged tool_execution memories after migration

What it does:
    1. Scans ephemeral and legacy database paths
    2. Copies missing memories into the persisted canonical DB
    3. Migrates meaningful non-tool memories into BEAM episodic_memory
    4. Promotes high-importance memories into working_memory
    5. Preserves all existing data (idempotent — safe to run multiple times)
"""
#24
import argparse
import os
import sqlite3
import sys
from contextlib import closing
from pathlib import Path
#30
# Current canonical path (matches mnemosyne.core.beam DEFAULT_DB_PATH)
# NOTE: On Fly.io and other ephemeral VMs, ~/.hermes is the only persisted path
# unless MNEMOSYNE_DATA_DIR explicitly points elsewhere.
# An unset *or empty* MNEMOSYNE_DATA_DIR falls back to the ~/.hermes location.
CANONICAL_DATA_DIR = Path(
    os.environ.get("MNEMOSYNE_DATA_DIR")
    or Path.home() / ".hermes" / "mnemosyne" / "data"
)
CANONICAL_DB = CANONICAL_DATA_DIR / "mnemosyne.db"

# Legacy / ephemeral paths to scan and migrate from
LEGACY_CANDIDATES = [
    Path.home() / ".mnemosyne" / "data" / "mnemosyne.db",  # ephemeral BEAM data
    Path.home() / ".mnemosyne" / "data" / "mnemosyne_native.db",
    Path.home() / ".hermes" / "mnemosyne" / "data" / "mnemosyne_native.db",
]
#46
#47
def ensure_schema(conn: sqlite3.Connection) -> None:
    """Ensure the canonical DB has all required BEAM + legacy tables.

    Creates, when missing, the legacy ``memories`` / ``memory_embeddings``
    tables, the BEAM ``working_memory`` / ``episodic_memory`` / ``scratchpad``
    tables, their indexes, and the ``fts_episodes`` FTS5 index together with
    its insert/delete sync triggers. Every step is conditional, so the
    function is safe to call repeatedly on the same database.

    Args:
        conn: Open connection to the database to initialise. Pending changes
            are committed before returning.

    Raises:
        sqlite3.OperationalError: If a statement fails, e.g. because the
            linked SQLite library was built without FTS5.
    """
    cursor = conn.cursor()

    # Legacy memories table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS memories (
            id TEXT PRIMARY KEY,
            content TEXT NOT NULL,
            source TEXT,
            timestamp TEXT,
            session_id TEXT DEFAULT 'default',
            importance REAL DEFAULT 0.5,
            metadata_json TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    # Some old canonical DBs were created without created_at
    cursor.execute("PRAGMA table_info(memories)")
    mem_cols = [info[1] for info in cursor.fetchall()]
    if "created_at" not in mem_cols:
        # ALTER TABLE cannot add a column with a non-constant default, so the
        # column is added bare and back-filled by the UPDATE below.
        cursor.execute("ALTER TABLE memories ADD COLUMN created_at TIMESTAMP")
    cursor.execute("UPDATE memories SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_session ON memories(session_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON memories(timestamp)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_source ON memories(source)")

    # Legacy embeddings table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS memory_embeddings (
            memory_id TEXT PRIMARY KEY,
            embedding_json TEXT NOT NULL,
            model TEXT DEFAULT 'bge-small-en-v1.5',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (memory_id) REFERENCES memories(id) ON DELETE CASCADE
        )
    """)

    # BEAM working_memory
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS working_memory (
            id TEXT PRIMARY KEY,
            content TEXT NOT NULL,
            source TEXT,
            timestamp TEXT,
            session_id TEXT DEFAULT 'default',
            importance REAL DEFAULT 0.5,
            metadata_json TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_wm_session ON working_memory(session_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_wm_timestamp ON working_memory(timestamp)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_wm_source ON working_memory(source)")

    # BEAM episodic_memory
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS episodic_memory (
            rowid INTEGER PRIMARY KEY AUTOINCREMENT,
            id TEXT UNIQUE NOT NULL,
            content TEXT NOT NULL,
            source TEXT,
            timestamp TEXT,
            session_id TEXT DEFAULT 'default',
            importance REAL DEFAULT 0.5,
            metadata_json TEXT,
            summary_of TEXT DEFAULT '',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_em_session ON episodic_memory(session_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_em_timestamp ON episodic_memory(timestamp)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_em_source ON episodic_memory(source)")

    # BEAM scratchpad
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS scratchpad (
            id TEXT PRIMARY KEY,
            content TEXT NOT NULL,
            session_id TEXT DEFAULT 'default',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_sp_session ON scratchpad(session_id)")

    # FTS5 for episodic memory (external-content index over episodic_memory)
    cursor.execute("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'fts_episodes'")
    fts_is_new = cursor.fetchone() is None
    cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS fts_episodes USING fts5(
            content,
            content='episodic_memory',
            content_rowid='rowid'
        )
    """)
    if fts_is_new:
        # A freshly created external-content index is empty; rebuild it so
        # rows that were already in episodic_memory become searchable.
        cursor.execute("INSERT INTO fts_episodes(fts_episodes) VALUES ('rebuild')")
    # NOTE(review): only insert/delete triggers are created here; an in-place
    # UPDATE of episodic_memory.content would not reach the index through this
    # schema alone — confirm the application never updates content or adds its
    # own update trigger.
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS em_ai AFTER INSERT ON episodic_memory BEGIN
            INSERT INTO fts_episodes(rowid, content) VALUES (new.rowid, new.content);
        END
    """)
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS em_ad AFTER DELETE ON episodic_memory BEGIN
            INSERT INTO fts_episodes(fts_episodes, rowid, content) VALUES ('delete', old.rowid, old.content);
        END
    """)

    conn.commit()
#154
#155
def get_existing_ids(conn: sqlite3.Connection, table: str) -> set:
    """Return the set of ``id`` values currently stored in *table*.

    Args:
        conn: Open connection to query.
        table: Name of a table that has an ``id`` column.

    Raises:
        sqlite3.OperationalError: If the table or its ``id`` column is missing.
    """
    cursor = conn.cursor()
    # Table names cannot be bound as parameters, so quote the identifier.
    cursor.execute(f"SELECT id FROM {_quote_identifier(table)}")
    return {row[0] for row in cursor}


# Columns read from a legacy table, paired with the SQL expression substituted
# when an older schema lacks that column (None marks a required column). The
# fallbacks mirror the column defaults of the canonical schema.
_LEGACY_MEMORY_COLUMNS = (
    ("id", None),
    ("content", None),
    ("source", "NULL"),
    ("timestamp", "NULL"),
    ("session_id", "'default'"),
    ("importance", "0.5"),
    ("metadata_json", "NULL"),
    ("created_at", "CURRENT_TIMESTAMP"),
)
_LEGACY_EMBEDDING_COLUMNS = (
    ("memory_id", None),
    ("embedding_json", None),
    ("model", "'bge-small-en-v1.5'"),
    ("created_at", "CURRENT_TIMESTAMP"),
)

# Session id stamped on every row written to the BEAM tables.
_MIGRATED_SESSION_ID = "hermes_default"
# Maximum number of rows promoted into working_memory per legacy database.
_WORKING_MEMORY_LIMIT = 30


def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier."""
    return '"' + name.replace('"', '""') + '"'


def _read_legacy_rows(cursor: sqlite3.Cursor, table: str, columns: tuple):
    """Fetch *columns* from *table*, filling columns the schema lacks.

    Returns the rows as a list of tuples in the order of *columns*, or None
    when a required column (fallback ``None``) does not exist.
    """
    quoted = _quote_identifier(table)
    cursor.execute(f"PRAGMA table_info({quoted})")
    # SQLite column names are case-insensitive.
    present = {info[1].lower() for info in cursor.fetchall()}

    select_list = []
    for column, fallback in columns:
        if column in present:
            select_list.append(column)
        elif fallback is None:
            return None
        else:
            select_list.append(f"{fallback} AS {column}")

    cursor.execute(f"SELECT {', '.join(select_list)} FROM {quoted}")
    return cursor.fetchall()


def migrate_legacy_db(legacy_path: Path, canonical_conn: sqlite3.Connection, dry_run: bool = False) -> dict:
    """Migrate a single legacy database into the canonical one.

    Copies memories and embeddings whose ids are not yet in the canonical DB,
    inserts every non-``tool_execution`` memory that is not yet in
    ``episodic_memory``, and promotes the highest-importance newly migrated
    rows into ``working_memory``. Rows already present are never modified, so
    re-running is a no-op.

    Args:
        legacy_path: Path of the legacy SQLite file to read from.
        canonical_conn: Open connection to the canonical DB; its schema must
            already contain the target tables.
        dry_run: When true, only print what would be done and write nothing.

    Returns:
        Counts keyed by ``memories_copied``, ``embeddings_copied``,
        ``episodic_migrated`` and ``working_migrated``. All stay 0 in a dry
        run or when the legacy DB is skipped.
    """
    stats = {"memories_copied": 0, "embeddings_copied": 0, "episodic_migrated": 0, "working_migrated": 0}

    # Read everything up front so the legacy connection is released (even on
    # error) before the canonical database is written to.
    with closing(sqlite3.connect(str(legacy_path))) as legacy_conn:
        legacy_cursor = legacy_conn.cursor()

        # Check what tables exist
        legacy_cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = {row[0] for row in legacy_cursor.fetchall()}

        if "memories" not in tables:
            print(f" ⚠️ No memories table in {legacy_path} — skipping")
            return stats

        rows = _read_legacy_rows(legacy_cursor, "memories", _LEGACY_MEMORY_COLUMNS)
        if rows is None:
            print(f" ⚠️ memories table in {legacy_path} has no id/content columns — skipping")
            return stats

        embeddings = None
        if "memory_embeddings" in tables:
            embeddings = _read_legacy_rows(legacy_cursor, "memory_embeddings", _LEGACY_EMBEDDING_COLUMNS)
            if embeddings is None:
                print(f" ⚠️ memory_embeddings table in {legacy_path} is missing required columns — skipping embeddings")

    canonical_cursor = canonical_conn.cursor()

    # 1. Copy memories
    existing_memory_ids = get_existing_ids(canonical_conn, "memories")
    to_insert = [row for row in rows if row[0] not in existing_memory_ids]

    if dry_run:
        print(f" [DRY-RUN] Would copy {len(to_insert)} memories from {legacy_path}")
    else:
        canonical_cursor.executemany("""
            INSERT INTO memories (id, content, source, timestamp, session_id, importance, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, to_insert)
        canonical_conn.commit()
        stats["memories_copied"] = len(to_insert)

    # 2. Copy embeddings if present
    if embeddings is not None:
        canonical_cursor.execute("SELECT memory_id FROM memory_embeddings")
        existing_emb_ids = {row[0] for row in canonical_cursor.fetchall()}
        emb_to_insert = [row for row in embeddings if row[0] not in existing_emb_ids]

        if dry_run:
            print(f" [DRY-RUN] Would copy {len(emb_to_insert)} embeddings from {legacy_path}")
        else:
            canonical_cursor.executemany("""
                INSERT INTO memory_embeddings (memory_id, embedding_json, model, created_at)
                VALUES (?, ?, ?, ?)
            """, emb_to_insert)
            canonical_conn.commit()
            stats["embeddings_copied"] = len(emb_to_insert)

    # 3. Migrate meaningful non-tool memories into episodic_memory
    # The id set is fetched once, not once per row.
    existing_episodic_ids = get_existing_ids(canonical_conn, "episodic_memory")
    meaningful = [
        row for row in rows
        if row[2] != 'tool_execution' and row[0] not in existing_episodic_ids
    ]

    if dry_run:
        print(f" [DRY-RUN] Would migrate {len(meaningful)} memories to episodic + up to {_WORKING_MEMORY_LIMIT} to working")
        return stats

    canonical_cursor.executemany("""
        INSERT INTO episodic_memory (id, content, source, timestamp, session_id, importance, metadata_json, summary_of)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, [
        (mid, content, source, timestamp, _MIGRATED_SESSION_ID, importance, metadata_json or '{}', '')
        for mid, content, source, timestamp, _session, importance, metadata_json, _created in meaningful
    ])
    canonical_conn.commit()
    stats["episodic_migrated"] = len(meaningful)

    # 4. Promote top high-importance ones into working_memory
    existing_working_ids = get_existing_ids(canonical_conn, "working_memory")
    hot = [row for row in meaningful if row[0] not in existing_working_ids]
    # Only a missing importance falls back to 0.5; an explicit 0.0 must rank last.
    hot.sort(key=lambda r: 0.5 if r[5] is None else r[5], reverse=True)
    hot = hot[:_WORKING_MEMORY_LIMIT]
    canonical_cursor.executemany("""
        INSERT INTO working_memory (id, content, source, timestamp, session_id, importance, metadata_json)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """, [
        (mid, content, source, timestamp, _MIGRATED_SESSION_ID, importance, metadata_json or '{}')
        for mid, content, source, timestamp, _session, importance, metadata_json, _created in hot
    ])
    canonical_conn.commit()
    stats["working_migrated"] = len(hot)

    return stats
#249
#250
def _open_canonical(dry_run: bool) -> sqlite3.Connection:
    """Open the canonical DB, or an in-memory snapshot of it for a dry run."""
    if not dry_run:
        CANONICAL_DB.parent.mkdir(parents=True, exist_ok=True)
        return sqlite3.connect(str(CANONICAL_DB))

    # A dry run must not create directories, create the DB file, or alter its
    # schema, so all work happens on a private in-memory copy.
    snapshot = sqlite3.connect(":memory:")
    if CANONICAL_DB.exists():
        with closing(sqlite3.connect(str(CANONICAL_DB))) as on_disk:
            on_disk.backup(snapshot)
    return snapshot


def main() -> int:
    """Run the migration from the command line.

    Parses ``--dry-run`` / ``--purge-tools``, ensures the canonical schema,
    migrates every existing path in ``LEGACY_CANDIDATES`` (except the
    canonical DB itself), optionally purges ``tool_execution`` rows, and
    prints a summary.

    Returns:
        Process exit code; always 0 when no exception is raised.
    """
    parser = argparse.ArgumentParser(description="Migrate legacy Mnemosyne databases to the current canonical path")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing")
    parser.add_argument("--purge-tools", action="store_true", help="Remove legacy auto-logged tool_execution memories after migration")
    args = parser.parse_args()

    print("=" * 60)
    print("Mnemosyne Legacy Database Migration")
    print("=" * 60)
    print(f"Canonical DB: {CANONICAL_DB}")
    print()

    # sqlite3's own context manager only manages transactions; closing()
    # guarantees the connection is released on every exit path.
    with closing(_open_canonical(args.dry_run)) as canonical_conn:
        ensure_schema(canonical_conn)

        # Pre-check stats
        cursor = canonical_conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM memories")
        pre_total = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM memories WHERE source = 'tool_execution'")
        pre_tools = cursor.fetchone()[0]
        print(f"Current canonical DB has {pre_total} memories ({pre_tools} tool_execution)")

        total_stats = {"memories_copied": 0, "embeddings_copied": 0, "episodic_migrated": 0, "working_migrated": 0}
        any_found = False

        for legacy_path in LEGACY_CANDIDATES:
            # Never migrate the canonical DB into itself.
            if legacy_path.exists() and legacy_path.resolve() != CANONICAL_DB.resolve():
                any_found = True
                print(f"\n📁 Found legacy DB: {legacy_path}")
                stats = migrate_legacy_db(legacy_path, canonical_conn, dry_run=args.dry_run)
                for key in total_stats:
                    total_stats[key] += stats[key]

        if not any_found and pre_total == 0:
            print("\n✅ No legacy databases found and canonical DB is empty. Nothing to migrate.")
            return 0

        # Purge tool_execution noise if requested
        purged_tools = 0
        if args.purge_tools and not args.dry_run:
            cursor.execute("DELETE FROM memories WHERE source = 'tool_execution'")
            # Read rowcount right after the memories DELETE: the reported
            # number is the count of purged *memories*, matching the dry-run
            # preview and the before/after totals below.
            purged_tools = cursor.rowcount
            cursor.execute("DELETE FROM working_memory WHERE source = 'tool_execution'")
            canonical_conn.commit()
            print(f"\n🧹 Purged {purged_tools} tool_execution memories from canonical DB")
        elif args.purge_tools and args.dry_run:
            cursor.execute("SELECT COUNT(*) FROM memories WHERE source = 'tool_execution'")
            would_purge = cursor.fetchone()[0]
            print(f"\n[DRY-RUN] Would purge {would_purge} tool_execution memories")

        if args.dry_run:
            print("\n🏁 Dry-run complete. No changes were made.")
        else:
            cursor.execute("SELECT COUNT(*) FROM memories")
            post_total = cursor.fetchone()[0]
            print("\n🏁 Migration complete!")
            print(f" Memories copied: {total_stats['memories_copied']}")
            print(f" Embeddings copied: {total_stats['embeddings_copied']}")
            print(f" Episodic migrated: {total_stats['episodic_migrated']}")
            print(f" Working promoted: {total_stats['working_migrated']}")
            if purged_tools:
                print(f" Tool memories purged: {purged_tools}")
            print(f" Total in canonical: {post_total} (was {pre_total})")

    return 0
#320
#321
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
#324

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public