repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources16d ago| #1 | """ |
| #2 | Mnemosyne Information-Theoretic Binary Vectors |
| #3 | ============================================== |
| #4 | Building on Moorcheh ITS (arXiv:2601.11557) |
| #5 | |
| #6 | Three core innovations: |
| #7 | 1. Maximally Informative Binarization (MIB) - 32x compression |
| #8 | 2. Efficient Distance Metric (EDM) - Hamming distance via bitwise ops |
| #9 | 3. Information-Theoretic Score (ITS) - deterministic ranking |
| #10 | |
| #11 | Replaces: float32 embeddings + HNSW index + cosine similarity |
| #12 | With: binary vectors + exhaustive scan + Hamming distance |
| #13 | |
| #14 | Benefits: |
| #15 | - 32x memory reduction |
| #16 | - Deterministic retrieval (same query = same results) |
| #17 | - No ANN index needed (no HNSW, no IVF, no PQ) |
| #18 | - SQLite-native storage |
| #19 | - CPU-efficient (bitwise XOR + popcount) |
| #20 | """ |
| #21 | |
| #22 | import numpy as np |
| #23 | import json |
| #24 | import sqlite3 |
| #25 | from typing import List, Dict, Tuple, Optional |
| #26 | from pathlib import Path |
| #27 | |
| #28 | |
| #29 | # --- Configuration --- |
| #30 | EMBEDDING_DIM = 384 # bge-small-en-v1.5 dimension |
| #31 | BITS_PER_BYTE = 8 |
| #32 | BYTES_PER_VECTOR = EMBEDDING_DIM // BITS_PER_BYTE # 48 bytes for 384 bits |
| #33 | |
| #34 | |
| #35 | # --- Module-level function aliases for beam.py compatibility --- |
| #36 | # beam.py imports these as: from mnemosyne.core.binary_vectors import maximally_informative_binarization as _mib, hamming_distance as _hamming |
| #37 | # but they were only defined as class static methods, causing ImportError silently setting them to None. |
| #38 | def maximally_informative_binarization(embedding) -> bytes: |
| #39 | """Module-level alias for BinaryVectorStore.maximally_informative_binarization.""" |
| #40 | return BinaryVectorStore.maximally_informative_binarization(embedding) |
| #41 | |
| #42 | |
| #43 | def hamming_distance(binary_a: bytes, binary_b: bytes) -> int: |
| #44 | """Module-level alias for BinaryVectorStore.hamming_distance.""" |
| #45 | return BinaryVectorStore.hamming_distance(binary_a, binary_b) |
| #46 | |
| #47 | |
| #48 | class BinaryVectorStore: |
| #49 | """ |
| #50 | SQLite-native binary vector storage with deterministic retrieval. |
| #51 | |
| #52 | No external vector DB needed. No ANN index. Just SQLite + numpy. |
| #53 | """ |
| #54 | |
| #55 | def __init__(self, db_path: Path = None, table_name: str = "binary_vectors", conn=None): |
| #56 | if conn is not None: |
| #57 | self.conn = conn |
| #58 | self.db_path = db_path or Path(":memory:") |
| #59 | else: |
| #60 | self.db_path = db_path or Path.home() / ".hermes" / "mnemosyne" / "data" / "mnemosyne.db" |
| #61 | self.conn = sqlite3.connect(str(self.db_path), check_same_thread=False) |
| #62 | self.table_name = table_name |
| #63 | self.conn.row_factory = sqlite3.Row |
| #64 | self._owns_connection = conn is None |
| #65 | self._init_table() |
| #66 | |
| #67 | def _init_table(self): |
| #68 | """Create binary vector table.""" |
| #69 | cursor = self.conn.cursor() |
| #70 | cursor.execute(f""" |
| #71 | CREATE TABLE IF NOT EXISTS {self.table_name} ( |
| #72 | memory_id TEXT PRIMARY KEY, |
| #73 | binary_vector BLOB NOT NULL, |
| #74 | original_dim INTEGER DEFAULT {EMBEDDING_DIM}, |
| #75 | magnitude REAL DEFAULT 1.0, |
| #76 | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP |
| #77 | ) |
| #78 | """) |
| #79 | self.conn.commit() |
| #80 | |
| #81 | @staticmethod |
| #82 | def maximally_informative_binarization(embedding: np.ndarray) -> bytes: |
| #83 | """ |
| #84 | Convert float32 embedding to binary representation. |
| #85 | |
| #86 | Algorithm: For each dimension, if value > 0, bit = 1, else bit = 0. |
| #87 | This preserves the sign information which carries the most signal. |
| #88 | |
| #89 | Args: |
| #90 | embedding: float32 numpy array of shape (EMBEDDING_DIM,) |
| #91 | |
| #92 | Returns: |
| #93 | bytes: Binary representation (48 bytes for 384 dims) |
| #94 | """ |
| #95 | # Ensure correct shape |
| #96 | embedding = embedding.flatten()[:EMBEDDING_DIM] |
| #97 | |
| #98 | # Binarize: positive = 1, negative/zero = 0 |
| #99 | binary_bits = (embedding > 0).astype(np.uint8) |
| #100 | |
| #101 | # Pack bits into bytes |
| #102 | # Reshape to (48, 8) for 384 bits |
| #103 | n_bytes = (len(binary_bits) + 7) // 8 |
| #104 | padded = np.pad(binary_bits, (0, n_bytes * 8 - len(binary_bits)), mode='constant') |
| #105 | |
| #106 | # Pack bits into bytes |
| #107 | bytes_array = np.packbits(padded) |
| #108 | return bytes(bytes_array) |
| #109 | |
| #110 | @staticmethod |
| #111 | def hamming_distance(binary_a: bytes, binary_b: bytes) -> int: |
| #112 | """ |
| #113 | Compute Hamming distance between two binary vectors. |
| #114 | |
| #115 | Uses XOR + popcount for efficiency. |
| #116 | Distance = number of differing bits. |
| #117 | |
| #118 | Args: |
| #119 | binary_a: First binary vector |
| #120 | binary_b: Second binary vector |
| #121 | |
| #122 | Returns: |
| #123 | int: Hamming distance (0 = identical, max = EMBEDDING_DIM) |
| #124 | """ |
| #125 | # Convert to numpy arrays |
| #126 | arr_a = np.frombuffer(binary_a, dtype=np.uint8) |
| #127 | arr_b = np.frombuffer(binary_b, dtype=np.uint8) |
| #128 | |
| #129 | # XOR to find differing bits |
| #130 | xor_result = np.bitwise_xor(arr_a, arr_b) |
| #131 | |
| #132 | # Popcount (count set bits) |
| #133 | # Use lookup table for speed |
| #134 | popcount_table = np.array([ |
| #135 | bin(i).count('1') for i in range(256) |
| #136 | ], dtype=np.uint8) |
| #137 | |
| #138 | distance = np.sum(popcount_table[xor_result]) |
| #139 | return int(distance) |
| #140 | |
| #141 | @staticmethod |
| #142 | def information_theoretic_score(distance: int, dim: int = EMBEDDING_DIM) -> float: |
| #143 | """ |
| #144 | Convert Hamming distance to normalized relevance score. |
| #145 | |
| #146 | ITS = 1.0 - (distance / dim) |
| #147 | |
| #148 | Args: |
| #149 | distance: Hamming distance |
| #150 | dim: Total dimensions |
| #151 | |
| #152 | Returns: |
| #153 | float: Score in [0, 1], higher = more similar |
| #154 | """ |
| #155 | return 1.0 - (distance / dim) |
| #156 | |
| #157 | def store_vector(self, memory_id: str, embedding: np.ndarray): |
| #158 | """ |
| #159 | Store a binary vector for a memory. |
| #160 | |
| #161 | Args: |
| #162 | memory_id: Unique memory identifier |
| #163 | embedding: float32 embedding vector |
| #164 | """ |
| #165 | binary = self.maximally_informative_binarization(embedding) |
| #166 | magnitude = float(np.linalg.norm(embedding)) |
| #167 | |
| #168 | cursor = self.conn.cursor() |
| #169 | cursor.execute(f""" |
| #170 | INSERT OR REPLACE INTO {self.table_name} |
| #171 | (memory_id, binary_vector, original_dim, magnitude) |
| #172 | VALUES (?, ?, ?, ?) |
| #173 | """, (memory_id, binary, EMBEDDING_DIM, magnitude)) |
| #174 | self.conn.commit() |
| #175 | |
| #176 | def search(self, query_embedding: np.ndarray, top_k: int = 10) -> List[Dict]: |
| #177 | """ |
| #178 | Deterministic exhaustive search over all binary vectors. |
| #179 | |
| #180 | Args: |
| #181 | query_embedding: float32 query vector |
| #182 | top_k: Number of results to return |
| #183 | |
| #184 | Returns: |
| #185 | List of dicts: {memory_id, distance, score} |
| #186 | """ |
| #187 | query_binary = self.maximally_informative_binarization(query_embedding) |
| #188 | |
| #189 | # Fetch all vectors |
| #190 | cursor = self.conn.cursor() |
| #191 | cursor.execute(f""" |
| #192 | SELECT memory_id, binary_vector, magnitude |
| #193 | FROM {self.table_name} |
| #194 | """) |
| #195 | |
| #196 | results = [] |
| #197 | for row in cursor.fetchall(): |
| #198 | memory_id = row["memory_id"] |
| #199 | binary_vector = row["binary_vector"] |
| #200 | |
| #201 | # Compute Hamming distance |
| #202 | distance = self.hamming_distance(query_binary, binary_vector) |
| #203 | |
| #204 | # Convert to ITS score |
| #205 | score = self.information_theoretic_score(distance) |
| #206 | |
| #207 | results.append({ |
| #208 | "memory_id": memory_id, |
| #209 | "distance": distance, |
| #210 | "score": score |
| #211 | }) |
| #212 | |
| #213 | # Sort by score descending |
| #214 | results.sort(key=lambda x: x["score"], reverse=True) |
| #215 | |
| #216 | return results[:top_k] |
| #217 | |
| #218 | def search_batch(self, query_embeddings: List[np.ndarray], top_k: int = 10) -> List[List[Dict]]: |
| #219 | """Search multiple queries efficiently.""" |
| #220 | return [self.search(q, top_k) for q in query_embeddings] |
| #221 | |
| #222 | def delete_vector(self, memory_id: str): |
| #223 | """Remove a vector from storage.""" |
| #224 | cursor = self.conn.cursor() |
| #225 | cursor.execute(f""" |
| #226 | DELETE FROM {self.table_name} WHERE memory_id = ? |
| #227 | """, (memory_id,)) |
| #228 | self.conn.commit() |
| #229 | |
| #230 | def get_stats(self) -> Dict: |
| #231 | """Get storage statistics.""" |
| #232 | cursor = self.conn.cursor() |
| #233 | cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}") |
| #234 | count = cursor.fetchone()[0] |
| #235 | |
| #236 | cursor.execute(f""" |
| #237 | SELECT |
| #238 | COUNT(*) as count, |
| #239 | AVG(LENGTH(binary_vector)) as avg_bytes, |
| #240 | MAX(LENGTH(binary_vector)) as max_bytes, |
| #241 | MIN(LENGTH(binary_vector)) as min_bytes |
| #242 | FROM {self.table_name} |
| #243 | """) |
| #244 | row = cursor.fetchone() |
| #245 | |
| #246 | return { |
| #247 | "total_vectors": count, |
| #248 | "avg_bytes_per_vector": row["avg_bytes"] if row else 0, |
| #249 | "max_bytes": row["max_bytes"] if row else 0, |
| #250 | "min_bytes": row["min_bytes"] if row else 0, |
| #251 | "compression_ratio": 48.0 / (EMBEDDING_DIM * 4), # 48 bytes vs 1536 bytes float32 |
| #252 | "theoretical_size_mb": (count * 48) / (1024 * 1024) |
| #253 | } |
| #254 | |
| #255 | def close(self): |
| #256 | """Close database connection.""" |
| #257 | self.conn.close() |
| #258 | |
| #259 | |
| #260 | class FastBinarySearch: |
| #261 | """ |
| #262 | Optimized binary vector search with numpy batching. |
| #263 | For high-throughput scenarios. |
| #264 | """ |
| #265 | |
| #266 | def __init__(self, binary_vectors: Dict[str, bytes]): |
| #267 | """ |
| #268 | Initialize with pre-loaded binary vectors. |
| #269 | |
| #270 | Args: |
| #271 | binary_vectors: Dict mapping memory_id -> binary bytes |
| #272 | """ |
| #273 | self.memory_ids = list(binary_vectors.keys()) |
| #274 | self.vectors = np.array([ |
| #275 | np.frombuffer(v, dtype=np.uint8) |
| #276 | for v in binary_vectors.values() |
| #277 | ]) |
| #278 | self.popcount_table = np.array([ |
| #279 | bin(i).count('1') for i in range(256) |
| #280 | ], dtype=np.uint32) |
| #281 | |
| #282 | def search(self, query_binary: bytes, top_k: int = 10) -> List[Dict]: |
| #283 | """ |
| #284 | Fast batch Hamming distance computation. |
| #285 | |
| #286 | Args: |
| #287 | query_binary: Binary query vector |
| #288 | top_k: Number of results |
| #289 | |
| #290 | Returns: |
| #291 | List of {memory_id, distance, score} |
| #292 | """ |
| #293 | query_arr = np.frombuffer(query_binary, dtype=np.uint8) |
| #294 | |
| #295 | # Broadcast XOR across all vectors |
| #296 | xor_results = np.bitwise_xor(self.vectors, query_arr) |
| #297 | |
| #298 | # Vectorized popcount |
| #299 | distances = np.sum(self.popcount_table[xor_results], axis=1) |
| #300 | |
| #301 | # Get top-k indices |
| #302 | top_indices = np.argsort(distances)[:top_k] |
| #303 | |
| #304 | results = [] |
| #305 | for idx in top_indices: |
| #306 | distance = int(distances[idx]) |
| #307 | score = 1.0 - (distance / EMBEDDING_DIM) |
| #308 | results.append({ |
| #309 | "memory_id": self.memory_ids[idx], |
| #310 | "distance": distance, |
| #311 | "score": score |
| #312 | }) |
| #313 | |
| #314 | return results |
| #315 | |
| #316 | |
| #317 | # --- Testing --- |
| #318 | if __name__ == "__main__": |
| #319 | import tempfile |
| #320 | import os |
| #321 | |
| #322 | print("Binary Vector Store Tests") |
| #323 | print("=" * 60) |
| #324 | |
| #325 | # Create temp database |
| #326 | with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: |
| #327 | db_path = f.name |
| #328 | |
| #329 | store = BinaryVectorStore(db_path=Path(db_path)) |
| #330 | |
| #331 | # Generate test embeddings |
| #332 | np.random.seed(42) |
| #333 | test_embeddings = [ |
| #334 | np.random.randn(EMBEDDING_DIM).astype(np.float32), |
| #335 | np.random.randn(EMBEDDING_DIM).astype(np.float32), |
| #336 | np.random.randn(EMBEDDING_DIM).astype(np.float32), |
| #337 | ] |
| #338 | |
| #339 | # Store vectors |
| #340 | for i, emb in enumerate(test_embeddings): |
| #341 | store.store_vector(f"mem_{i}", emb) |
| #342 | |
| #343 | # Search |
| #344 | query = test_embeddings[0] |
| #345 | results = store.search(query, top_k=3) |
| #346 | |
| #347 | print(f"Query vector matches itself:") |
| #348 | print(f" Top result: {results[0]['memory_id']} (score: {results[0]['score']:.4f})") |
| #349 | print(f" Distance: {results[0]['distance']} / {EMBEDDING_DIM}") |
| #350 | |
| #351 | # Stats |
| #352 | stats = store.get_stats() |
| #353 | print(f"\nStorage Stats:") |
| #354 | print(f" Total vectors: {stats['total_vectors']}") |
| #355 | print(f" Bytes per vector: {stats['avg_bytes_per_vector']}") |
| #356 | print(f" Compression ratio: {stats['compression_ratio']:.2%}") |
| #357 | print(f" Theoretical size: {stats['theoretical_size_mb']:.4f} MB") |
| #358 | |
| #359 | # Cleanup |
| #360 | store.close() |
| #361 | os.unlink(db_path) |
| #362 | |
| #363 | print("\n" + "=" * 60) |
| #364 | print("Binary vector tests passed!") |
| #365 |