my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	"""
#2	Mnemosyne Information-Theoretic Binary Vectors
#3	==============================================
#4	Building on Moorcheh ITS (arXiv:2601.11557)
#5
#6	Three core innovations:
#7	1. Maximally Informative Binarization (MIB) - 32x compression
#8	2. Efficient Distance Metric (EDM) - Hamming distance via bitwise ops
#9	3. Information-Theoretic Score (ITS) - deterministic ranking
#10
#11	Replaces: float32 embeddings + HNSW index + cosine similarity
#12	With: binary vectors + exhaustive scan + Hamming distance
#13
#14	Benefits:
#15	- 32x memory reduction
#16	- Deterministic retrieval (same query = same results)
#17	- No ANN index needed (no HNSW, no IVF, no PQ)
#18	- SQLite-native storage
#19	- CPU-efficient (bitwise XOR + popcount)
#20	"""
#21
#22	import numpy as np
#23	import json
#24	import sqlite3
#25	from typing import List, Dict, Tuple, Optional
#26	from pathlib import Path
#27
#28
#29	# --- Configuration ---
#30	EMBEDDING_DIM = 384 # bge-small-en-v1.5 dimension
#31	BITS_PER_BYTE = 8
#32	BYTES_PER_VECTOR = EMBEDDING_DIM // BITS_PER_BYTE # 48 bytes for 384 bits
#33
#34
#35	# --- Module-level function aliases for beam.py compatibility ---
#36	# beam.py imports these as: from mnemosyne.core.binary_vectors import maximally_informative_binarization as _mib, hamming_distance as _hamming
#37	# but they were only defined as class static methods, causing ImportError silently setting them to None.
#38	def maximally_informative_binarization(embedding) -> bytes:
#39	"""Module-level alias for BinaryVectorStore.maximally_informative_binarization."""
#40	return BinaryVectorStore.maximally_informative_binarization(embedding)
#41
#42
#43	def hamming_distance(binary_a: bytes, binary_b: bytes) -> int:
#44	"""Module-level alias for BinaryVectorStore.hamming_distance."""
#45	return BinaryVectorStore.hamming_distance(binary_a, binary_b)
#46
#47
#48	class BinaryVectorStore:
#49	"""
#50	SQLite-native binary vector storage with deterministic retrieval.
#51
#52	No external vector DB needed. No ANN index. Just SQLite + numpy.
#53	"""
#54
#55	def __init__(self, db_path: Path = None, table_name: str = "binary_vectors", conn=None):
#56	if conn is not None:
#57	self.conn = conn
#58	self.db_path = db_path or Path(":memory:")
#59	else:
#60	self.db_path = db_path or Path.home() / ".hermes" / "mnemosyne" / "data" / "mnemosyne.db"
#61	self.conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
#62	self.table_name = table_name
#63	self.conn.row_factory = sqlite3.Row
#64	self._owns_connection = conn is None
#65	self._init_table()
#66
#67	def _init_table(self):
#68	"""Create binary vector table."""
#69	cursor = self.conn.cursor()
#70	cursor.execute(f"""
#71	CREATE TABLE IF NOT EXISTS {self.table_name} (
#72	memory_id TEXT PRIMARY KEY,
#73	binary_vector BLOB NOT NULL,
#74	original_dim INTEGER DEFAULT {EMBEDDING_DIM},
#75	magnitude REAL DEFAULT 1.0,
#76	created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
#77	)
#78	""")
#79	self.conn.commit()
#80
#81	@staticmethod
#82	def maximally_informative_binarization(embedding: np.ndarray) -> bytes:
#83	"""
#84	Convert float32 embedding to binary representation.
#85
#86	Algorithm: For each dimension, if value > 0, bit = 1, else bit = 0.
#87	This preserves the sign information which carries the most signal.
#88
#89	Args:
#90	embedding: float32 numpy array of shape (EMBEDDING_DIM,)
#91
#92	Returns:
#93	bytes: Binary representation (48 bytes for 384 dims)
#94	"""
#95	# Ensure correct shape
#96	embedding = embedding.flatten()[:EMBEDDING_DIM]
#97
#98	# Binarize: positive = 1, negative/zero = 0
#99	binary_bits = (embedding > 0).astype(np.uint8)
#100
#101	# Pack bits into bytes
#102	# Reshape to (48, 8) for 384 bits
#103	n_bytes = (len(binary_bits) + 7) // 8
#104	padded = np.pad(binary_bits, (0, n_bytes * 8 - len(binary_bits)), mode='constant')
#105
#106	# Pack bits into bytes
#107	bytes_array = np.packbits(padded)
#108	return bytes(bytes_array)
#109
#110	@staticmethod
#111	def hamming_distance(binary_a: bytes, binary_b: bytes) -> int:
#112	"""
#113	Compute Hamming distance between two binary vectors.
#114
#115	Uses XOR + popcount for efficiency.
#116	Distance = number of differing bits.
#117
#118	Args:
#119	binary_a: First binary vector
#120	binary_b: Second binary vector
#121
#122	Returns:
#123	int: Hamming distance (0 = identical, max = EMBEDDING_DIM)
#124	"""
#125	# Convert to numpy arrays
#126	arr_a = np.frombuffer(binary_a, dtype=np.uint8)
#127	arr_b = np.frombuffer(binary_b, dtype=np.uint8)
#128
#129	# XOR to find differing bits
#130	xor_result = np.bitwise_xor(arr_a, arr_b)
#131
#132	# Popcount (count set bits)
#133	# Use lookup table for speed
#134	popcount_table = np.array([
#135	bin(i).count('1') for i in range(256)
#136	], dtype=np.uint8)
#137
#138	distance = np.sum(popcount_table[xor_result])
#139	return int(distance)
#140
#141	@staticmethod
#142	def information_theoretic_score(distance: int, dim: int = EMBEDDING_DIM) -> float:
#143	"""
#144	Convert Hamming distance to normalized relevance score.
#145
#146	ITS = 1.0 - (distance / dim)
#147
#148	Args:
#149	distance: Hamming distance
#150	dim: Total dimensions
#151
#152	Returns:
#153	float: Score in [0, 1], higher = more similar
#154	"""
#155	return 1.0 - (distance / dim)
#156
#157	def store_vector(self, memory_id: str, embedding: np.ndarray):
#158	"""
#159	Store a binary vector for a memory.
#160
#161	Args:
#162	memory_id: Unique memory identifier
#163	embedding: float32 embedding vector
#164	"""
#165	binary = self.maximally_informative_binarization(embedding)
#166	magnitude = float(np.linalg.norm(embedding))
#167
#168	cursor = self.conn.cursor()
#169	cursor.execute(f"""
#170	INSERT OR REPLACE INTO {self.table_name}
#171	(memory_id, binary_vector, original_dim, magnitude)
#172	VALUES (?, ?, ?, ?)
#173	""", (memory_id, binary, EMBEDDING_DIM, magnitude))
#174	self.conn.commit()
#175
#176	def search(self, query_embedding: np.ndarray, top_k: int = 10) -> List[Dict]:
#177	"""
#178	Deterministic exhaustive search over all binary vectors.
#179
#180	Args:
#181	query_embedding: float32 query vector
#182	top_k: Number of results to return
#183
#184	Returns:
#185	List of dicts: {memory_id, distance, score}
#186	"""
#187	query_binary = self.maximally_informative_binarization(query_embedding)
#188
#189	# Fetch all vectors
#190	cursor = self.conn.cursor()
#191	cursor.execute(f"""
#192	SELECT memory_id, binary_vector, magnitude
#193	FROM {self.table_name}
#194	""")
#195
#196	results = []
#197	for row in cursor.fetchall():
#198	memory_id = row["memory_id"]
#199	binary_vector = row["binary_vector"]
#200
#201	# Compute Hamming distance
#202	distance = self.hamming_distance(query_binary, binary_vector)
#203
#204	# Convert to ITS score
#205	score = self.information_theoretic_score(distance)
#206
#207	results.append({
#208	"memory_id": memory_id,
#209	"distance": distance,
#210	"score": score
#211	})
#212
#213	# Sort by score descending
#214	results.sort(key=lambda x: x["score"], reverse=True)
#215
#216	return results[:top_k]
#217
#218	def search_batch(self, query_embeddings: List[np.ndarray], top_k: int = 10) -> List[List[Dict]]:
#219	"""Search multiple queries efficiently."""
#220	return [self.search(q, top_k) for q in query_embeddings]
#221
#222	def delete_vector(self, memory_id: str):
#223	"""Remove a vector from storage."""
#224	cursor = self.conn.cursor()
#225	cursor.execute(f"""
#226	DELETE FROM {self.table_name} WHERE memory_id = ?
#227	""", (memory_id,))
#228	self.conn.commit()
#229
#230	def get_stats(self) -> Dict:
#231	"""Get storage statistics."""
#232	cursor = self.conn.cursor()
#233	cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
#234	count = cursor.fetchone()[0]
#235
#236	cursor.execute(f"""
#237	SELECT
#238	COUNT(*) as count,
#239	AVG(LENGTH(binary_vector)) as avg_bytes,
#240	MAX(LENGTH(binary_vector)) as max_bytes,
#241	MIN(LENGTH(binary_vector)) as min_bytes
#242	FROM {self.table_name}
#243	""")
#244	row = cursor.fetchone()
#245
#246	return {
#247	"total_vectors": count,
#248	"avg_bytes_per_vector": row["avg_bytes"] if row else 0,
#249	"max_bytes": row["max_bytes"] if row else 0,
#250	"min_bytes": row["min_bytes"] if row else 0,
#251	"compression_ratio": 48.0 / (EMBEDDING_DIM * 4), # 48 bytes vs 1536 bytes float32
#252	"theoretical_size_mb": (count * 48) / (1024 * 1024)
#253	}
#254
#255	def close(self):
#256	"""Close database connection."""
#257	self.conn.close()
#258
#259
#260	class FastBinarySearch:
#261	"""
#262	Optimized binary vector search with numpy batching.
#263	For high-throughput scenarios.
#264	"""
#265
#266	def __init__(self, binary_vectors: Dict[str, bytes]):
#267	"""
#268	Initialize with pre-loaded binary vectors.
#269
#270	Args:
#271	binary_vectors: Dict mapping memory_id -> binary bytes
#272	"""
#273	self.memory_ids = list(binary_vectors.keys())
#274	self.vectors = np.array([
#275	np.frombuffer(v, dtype=np.uint8)
#276	for v in binary_vectors.values()
#277	])
#278	self.popcount_table = np.array([
#279	bin(i).count('1') for i in range(256)
#280	], dtype=np.uint32)
#281
#282	def search(self, query_binary: bytes, top_k: int = 10) -> List[Dict]:
#283	"""
#284	Fast batch Hamming distance computation.
#285
#286	Args:
#287	query_binary: Binary query vector
#288	top_k: Number of results
#289
#290	Returns:
#291	List of {memory_id, distance, score}
#292	"""
#293	query_arr = np.frombuffer(query_binary, dtype=np.uint8)
#294
#295	# Broadcast XOR across all vectors
#296	xor_results = np.bitwise_xor(self.vectors, query_arr)
#297
#298	# Vectorized popcount
#299	distances = np.sum(self.popcount_table[xor_results], axis=1)
#300
#301	# Get top-k indices
#302	top_indices = np.argsort(distances)[:top_k]
#303
#304	results = []
#305	for idx in top_indices:
#306	distance = int(distances[idx])
#307	score = 1.0 - (distance / EMBEDDING_DIM)
#308	results.append({
#309	"memory_id": self.memory_ids[idx],
#310	"distance": distance,
#311	"score": score
#312	})
#313
#314	return results
#315
#316
#317	# --- Testing ---
#318	if __name__ == "__main__":
#319	import tempfile
#320	import os
#321
#322	print("Binary Vector Store Tests")
#323	print("=" * 60)
#324
#325	# Create temp database
#326	with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
#327	db_path = f.name
#328
#329	store = BinaryVectorStore(db_path=Path(db_path))
#330
#331	# Generate test embeddings
#332	np.random.seed(42)
#333	test_embeddings = [
#334	np.random.randn(EMBEDDING_DIM).astype(np.float32),
#335	np.random.randn(EMBEDDING_DIM).astype(np.float32),
#336	np.random.randn(EMBEDDING_DIM).astype(np.float32),
#337	]
#338
#339	# Store vectors
#340	for i, emb in enumerate(test_embeddings):
#341	store.store_vector(f"mem_{i}", emb)
#342
#343	# Search
#344	query = test_embeddings[0]
#345	results = store.search(query, top_k=3)
#346
#347	print(f"Query vector matches itself:")
#348	print(f" Top result: {results[0]['memory_id']} (score: {results[0]['score']:.4f})")
#349	print(f" Distance: {results[0]['distance']} / {EMBEDDING_DIM}")
#350
#351	# Stats
#352	stats = store.get_stats()
#353	print(f"\nStorage Stats:")
#354	print(f" Total vectors: {stats['total_vectors']}")
#355	print(f" Bytes per vector: {stats['avg_bytes_per_vector']}")
#356	print(f" Compression ratio: {stats['compression_ratio']:.2%}")
#357	print(f" Theoretical size: {stats['theoretical_size_mb']:.4f} MB")
#358
#359	# Cleanup
#360	store.close()
#361	os.unlink(db_path)
#362
#363	print("\n" + "=" * 60)
#364	print("Binary vector tests passed!")
#365

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public