repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | --- |
| #2 | title: Performance Optimization |
| #3 | --- |
| #4 | |
| #5 | Optimizing reranker performance is crucial for maintaining fast search response times while improving result quality. This guide covers best practices for different reranker types. |
| #6 | |
| #7 | ## General Optimization Principles |
| #8 | |
| #9 | ### Candidate Set Size |
| #10 | The number of candidates sent to the reranker significantly impacts performance: |
| #11 | |
| #12 | ```python |
| #13 | # Optimal candidate sizes for different rerankers |
| #14 | config_map = { |
| #15 | "cohere": {"initial_candidates": 100, "top_n": 10}, |
| #16 | "sentence_transformer": {"initial_candidates": 50, "top_n": 10}, |
| #17 | "huggingface": {"initial_candidates": 30, "top_n": 5}, |
| #18 | "llm_reranker": {"initial_candidates": 20, "top_n": 5} |
| #19 | } |
| #20 | ``` |
| #21 | |
| #22 | ### Batching Strategy |
| #23 | Process multiple queries efficiently: |
| #24 | |
| #25 | ```python |
| #26 | # Configure for batch processing |
| #27 | config = { |
| #28 | "reranker": { |
| #29 | "provider": "sentence_transformer", |
| #30 | "config": { |
| #31 | "model": "cross-encoder/ms-marco-MiniLM-L-6-v2", |
| #32 | "batch_size": 16, # Process multiple candidates at once |
| #33 | "top_n": 10 |
| #34 | } |
| #35 | } |
| #36 | } |
| #37 | ``` |
| #38 | |
| #39 | ## Provider-Specific Optimizations |
| #40 | |
| #41 | ### Cohere Optimization |
| #42 | |
| #43 | ```python |
| #44 | # Optimized Cohere configuration |
| #45 | config = { |
| #46 | "reranker": { |
| #47 | "provider": "cohere", |
| #48 | "config": { |
| #49 | "model": "rerank-english-v3.0", |
| #50 | "top_n": 10, |
| #51 | "max_chunks_per_doc": 10, # Limit chunk processing |
| #52 | "return_documents": False # Reduce response size |
| #53 | } |
| #54 | } |
| #55 | } |
| #56 | ``` |
| #57 | |
| #58 | **Best Practices:** |
| #59 | - Use v3.0 models for better speed/accuracy balance |
| #60 | - Limit candidates to 100 or fewer |
| #61 | - Cache API responses when possible |
| #62 | - Monitor API rate limits |
| #63 | |
| #64 | ### Sentence Transformer Optimization |
| #65 | |
| #66 | ```python |
| #67 | # Performance-optimized configuration |
| #68 | config = { |
| #69 | "reranker": { |
| #70 | "provider": "sentence_transformer", |
| #71 | "config": { |
| #72 | "model": "cross-encoder/ms-marco-MiniLM-L-6-v2", |
| #73 | "device": "cuda", # Use GPU when available |
| #74 | "batch_size": 32, |
| #75 | "top_n": 10, |
| #76 | "max_length": 512 # Limit input length |
| #77 | } |
| #78 | } |
| #79 | } |
| #80 | ``` |
| #81 | |
| #82 | **Device Optimization:** |
| #83 | ```python |
| #84 | import torch |
| #85 | |
| #86 | # Auto-detect best device |
| #87 | device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" |
| #88 | |
| #89 | config = { |
| #90 | "reranker": { |
| #91 | "provider": "sentence_transformer", |
| #92 | "config": { |
| #93 | "device": device, |
| #94 | "model": "cross-encoder/ms-marco-MiniLM-L-6-v2" |
| #95 | } |
| #96 | } |
| #97 | } |
| #98 | ``` |
| #99 | |
| #100 | ### Hugging Face Optimization |
| #101 | |
| #102 | ```python |
| #103 | # Optimized for Hugging Face models |
| #104 | config = { |
| #105 | "reranker": { |
| #106 | "provider": "huggingface", |
| #107 | "config": { |
| #108 | "model": "BAAI/bge-reranker-base", |
| #109 | "use_fp16": True, # Half precision for speed |
| #110 | "max_length": 512, |
| #111 | "batch_size": 8, |
| #112 | "top_n": 10 |
| #113 | } |
| #114 | } |
| #115 | } |
| #116 | ``` |
| #117 | |
| #118 | ### LLM Reranker Optimization |
| #119 | |
| #120 | ```python |
| #121 | # Optimized LLM reranker configuration |
| #122 | config = { |
| #123 | "reranker": { |
| #124 | "provider": "llm_reranker", |
| #125 | "config": { |
| #126 | "llm": { |
| #127 | "provider": "openai", |
| #128 | "config": { |
| #129 | "model": "gpt-3.5-turbo", # Faster than gpt-4 |
| #130 | "temperature": 0, # Deterministic results |
| #131 | "max_tokens": 500 # Limit response length |
| #132 | } |
| #133 | }, |
| #134 | "batch_ranking": True, # Rank multiple at once |
| #135 | "top_n": 5, # Fewer results for faster processing |
| #136 | "timeout": 10 # Request timeout |
| #137 | } |
| #138 | } |
| #139 | } |
| #140 | ``` |
| #141 | |
| #142 | ## Performance Monitoring |
| #143 | |
| #144 | ### Latency Tracking |
| #145 | ```python |
| #146 | import time |
| #147 | from mem0 import Memory |
| #148 | |
def measure_reranker_performance(config, queries, user_id):
    """Time one ``memory.search`` call per query and summarise the latencies.

    Args:
        config: Configuration dict handed to ``Memory.from_config``.
        queries: Iterable of queries; each one is searched exactly once.
        user_id: Forwarded as ``user_id`` to every ``memory.search`` call.

    Returns:
        A dict with ``avg_latency``, ``max_latency`` and ``min_latency``,
        all in seconds. When ``queries`` is empty all three are ``0.0`` and
        no ``Memory`` instance is created.
    """
    # Materialise once so one-shot iterables work and emptiness can be
    # checked before any expensive setup.
    queries = list(queries)
    if not queries:
        # Nothing to time: avoids dividing by zero and calling max()/min()
        # on an empty list.
        return {"avg_latency": 0.0, "max_latency": 0.0, "min_latency": 0.0}

    memory = Memory.from_config(config)

    latencies = []
    for query in queries:
        # perf_counter is monotonic and high-resolution, so an interval
        # cannot be skewed by wall-clock adjustments.
        started = time.perf_counter()
        memory.search(query, user_id=user_id)  # result is irrelevant here
        latencies.append(time.perf_counter() - started)

    return {
        "avg_latency": sum(latencies) / len(latencies),
        "max_latency": max(latencies),
        "min_latency": min(latencies),
    }
| #164 | ``` |
| #165 | |
| #166 | ### Memory Usage Monitoring |
| #167 | ```python |
| #168 | import psutil |
| #169 | import os |
| #170 | |
def monitor_memory_usage():
    """Report the current process's memory footprint.

    Returns:
        A dict with ``memory_mb`` (resident set size divided by 1024 twice,
        i.e. bytes expressed in MiB) and ``memory_percent`` (the value
        returned by ``psutil.Process.memory_percent``).
    """
    current = psutil.Process(os.getpid())
    rss_bytes = current.memory_info().rss
    usage = {"memory_mb": rss_bytes / 1024 / 1024}
    usage["memory_percent"] = current.memory_percent()
    return usage
| #177 | ``` |
| #178 | |
| #179 | ## Caching Strategies |
| #180 | |
| #181 | ### Result Caching |
| #182 | ```python |
| #183 | from functools import lru_cache |
| #184 | import hashlib |
| #185 | |
class CachedReranker:
    """Memoise ``Memory.search`` results per ``(query, user_id)`` pair.

    The cache belongs to the instance, so separate ``CachedReranker``
    objects never share entries and a discarded instance can be garbage
    collected together with its cache.

    Cached results are returned as the same object on every hit; callers
    that mutate a result should copy it first.
    """

    def __init__(self, config, cache_size=1000):
        """Build the underlying memory and an empty cache.

        Args:
            config: Configuration dict handed to ``Memory.from_config``.
            cache_size: Maximum number of distinct ``(query, user_id)``
                pairs kept; least recently used entries are evicted first.
        """
        self.memory = Memory.from_config(config)
        self.cache_size = cache_size
        self.clear_cache()

    def clear_cache(self):
        """Drop every cached result (e.g. after the memory store changed).

        Also (re)creates ``self.search_cached`` sized by ``self.cache_size``.
        """
        # Wrapping the bound method here, rather than decorating a method on
        # the class, keeps the cache per instance and honours cache_size.
        self.search_cached = lru_cache(maxsize=self.cache_size)(
            self._search_uncached
        )

    def _search_uncached(self, query, user_id):
        """Run the real search; only reached on a cache miss."""
        return self.memory.search(query, user_id=user_id)

    def search(self, query, user_id):
        """Return search results for ``query``, reusing a cached result.

        Args:
            query: Query to search for; must be hashable (e.g. ``str``).
            user_id: Forwarded to ``memory.search``; must be hashable.

        Returns:
            Whatever ``memory.search(query, user_id=user_id)`` returned the
            first time this exact pair was requested.
        """
        # The (query, user_id) pair itself is the cache key. A digest of
        # f"{query}_{user_id}" would be ambiguous ("a_b" + "c" collides with
        # "a" + "b_c") and would lose the query text needed on a miss.
        return self.search_cached(query, user_id)
| #198 | ``` |
| #199 | |
| #200 | ### Model Caching |
| #201 | ```python |
| #202 | # Pre-load models to avoid initialization overhead |
| #203 | config = { |
| #204 | "reranker": { |
| #205 | "provider": "sentence_transformer", |
| #206 | "config": { |
| #207 | "model": "cross-encoder/ms-marco-MiniLM-L-6-v2", |
| #208 | "cache_folder": "/path/to/model/cache", |
| #209 | "device": "cuda" |
| #210 | } |
| #211 | } |
| #212 | } |
| #213 | ``` |
| #214 | |
| #215 | ## Parallel Processing |
| #216 | |
| #217 | ### Async Configuration |
| #218 | ```python |
| #219 | import asyncio |
| #220 | from mem0 import Memory |
| #221 | |
async def parallel_search(config, queries, user_id):
    """Run one memory search per query concurrently.

    Args:
        config: Configuration dict handed to ``Memory.from_config``.
        queries: Iterable of queries to search for.
        user_id: Forwarded as ``user_id`` to every ``memory.search`` call.

    Returns:
        A list of search results in the same order as ``queries``.
    """
    memory = Memory.from_config(config)

    # ``memory.search`` is used as a plain blocking call elsewhere in this
    # guide, so each call is pushed to a worker thread instead of relying on
    # a ``search_async`` method.
    # NOTE(review): assumes concurrent ``memory.search`` calls on one Memory
    # instance are safe - confirm, or switch to the library's async client.
    pending = [
        asyncio.to_thread(memory.search, query, user_id=user_id)
        for query in queries
    ]

    return await asyncio.gather(*pending)
| #233 | ``` |
| #234 | |
| #235 | ## Hardware Optimization |
| #236 | |
| #237 | ### GPU Configuration |
| #238 | ```python |
| #239 | # Optimize for GPU usage |
| #240 | import torch |
| #241 | |
| #242 | if torch.cuda.is_available(): |
| #243 | torch.cuda.set_per_process_memory_fraction(0.8) # Cap this process at 80% of GPU memory |
| #244 | |
| #245 | config = { |
| #246 | "reranker": { |
| #247 | "provider": "sentence_transformer", |
| #248 | "config": { |
| #249 | "device": "cuda", |
| #250 | "model": "cross-encoder/ms-marco-electra-base", |
| #251 | "batch_size": 64, # Larger batch for GPU |
| #252 | "fp16": True # Half precision |
| #253 | } |
| #254 | } |
| #255 | } |
| #256 | ``` |
| #257 | |
| #258 | ### CPU Optimization |
| #259 | ```python |
| #260 | import torch |
| #261 | |
| #262 | # Optimize CPU threading |
| #263 | torch.set_num_threads(4) # Adjust based on your CPU |
| #264 | |
| #265 | config = { |
| #266 | "reranker": { |
| #267 | "provider": "sentence_transformer", |
| #268 | "config": { |
| #269 | "device": "cpu", |
| #270 | "model": "cross-encoder/ms-marco-MiniLM-L-6-v2", |
| #271 | "num_workers": 4 # Parallel processing |
| #272 | } |
| #273 | } |
| #274 | } |
| #275 | ``` |
| #276 | |
| #277 | ## Benchmarking Different Configurations |
| #278 | |
| #279 | ```python |
def benchmark_rerankers(configs=None, test_queries=None, user_id="test_user"):
    """Measure search latency for several reranker configurations.

    Args:
        configs: Iterable of flat dicts, each with a ``"provider"`` key plus
            that provider's options (e.g. ``"model"``). Defaults to one
            entry each for cohere, sentence_transformer and huggingface.
        test_queries: Queries timed against every configuration. Defaults
            to three sample queries.
        user_id: Passed through to ``measure_reranker_performance``.

    Returns:
        A dict mapping provider name to the latency summary returned by
        ``measure_reranker_performance``. If two entries share a provider,
        the later one overwrites the earlier.
    """
    if configs is None:
        configs = [
            {"provider": "cohere", "model": "rerank-english-v3.0"},
            {"provider": "sentence_transformer", "model": "cross-encoder/ms-marco-MiniLM-L-6-v2"},
            {"provider": "huggingface", "model": "BAAI/bge-reranker-base"},
        ]
    if test_queries is None:
        test_queries = ["sample query 1", "sample query 2", "sample query 3"]

    results = {}
    for entry in configs:
        provider = entry["provider"]
        # Only provider options belong in the nested "config", matching the
        # shape of every other reranker config in this guide; "provider"
        # lives one level up.
        options = {key: value for key, value in entry.items() if key != "provider"}
        results[provider] = measure_reranker_performance(
            {"reranker": {"provider": provider, "config": options}},
            test_queries,
            user_id,
        )

    return results
| #300 | ``` |
| #301 | |
| #302 | ## Production Best Practices |
| #303 | |
| #304 | 1. **Model Selection**: Choose the right balance of speed vs. accuracy |
| #305 | 2. **Resource Allocation**: Monitor CPU/GPU usage and memory consumption |
| #306 | 3. **Error Handling**: Implement fallbacks for reranker failures |
| #307 | 4. **Load Balancing**: Distribute reranking load across multiple instances |
| #308 | 5. **Monitoring**: Track latency, throughput, and error rates |
| #309 | 6. **Caching**: Cache frequent queries and model predictions |
| #310 | 7. **Batch Processing**: Group similar queries for efficient processing |