my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	---
#2	title: Performance Optimization
#3	---
#4
#5	Optimizing reranker performance is crucial for maintaining fast search response times while improving result quality. This guide covers best practices for different reranker types.
#6
#7	## General Optimization Principles
#8
#9	### Candidate Set Size
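#10	The number of candidates sent to the reranker significantly impacts performance: every extra candidate adds scoring work, so slower rerankers should be given a smaller candidate set: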
#11
#12	```python
#13	# Optimal candidate sizes for different rerankers
#14	config_map = {
#15	"cohere": {"initial_candidates": 100, "top_n": 10},
#16	"sentence_transformer": {"initial_candidates": 50, "top_n": 10},
#17	"huggingface": {"initial_candidates": 30, "top_n": 5},
#18	"llm_reranker": {"initial_candidates": 20, "top_n": 5}
#19	}
#20	```
#21
#22	### Batching Strategy
#23	Score candidates in batches instead of one at a time:
#24
#25	```python
#26	# Configure for batch processing
#27	config = {
#28	"reranker": {
#29	"provider": "sentence_transformer",
#30	"config": {
#31	"model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
#32	"batch_size": 16, # Process multiple candidates at once
#33	"top_n": 10
#34	}
#35	}
#36	}
#37	```
#38
#39	## Provider-Specific Optimizations
#40
#41	### Cohere Optimization
#42
#43	```python
#44	# Optimized Cohere configuration
#45	config = {
#46	"reranker": {
#47	"provider": "cohere",
#48	"config": {
#49	"model": "rerank-english-v3.0",
#50	"top_n": 10,
#51	"max_chunks_per_doc": 10, # Limit chunk processing
#52	"return_documents": False # Reduce response size
#53	}
#54	}
#55	}
#56	```
#57
#58	Best Practices:
#59	- Use v3.0 models for better speed/accuracy balance
#60	- Limit candidates to 100 or fewer
#61	- Cache API responses when possible
#62	- Monitor API rate limits
#63
#64	### Sentence Transformer Optimization
#65
#66	```python
#67	# Performance-optimized configuration
#68	config = {
#69	"reranker": {
#70	"provider": "sentence_transformer",
#71	"config": {
#72	"model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
#73	"device": "cuda", # Use GPU when available
#74	"batch_size": 32,
#75	"top_n": 10,
#76	"max_length": 512 # Limit input length
#77	}
#78	}
#79	}
#80	```
#81
#82	Device Optimization:
#83	```python
#84	import torch
#85
#86	# Auto-detect best device
#87	device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
#88
#89	config = {
#90	"reranker": {
#91	"provider": "sentence_transformer",
#92	"config": {
#93	"device": device,
#94	"model": "cross-encoder/ms-marco-MiniLM-L-6-v2"
#95	}
#96	}
#97	}
#98	```
#99
#100	### Hugging Face Optimization
#101
#102	```python
#103	# Optimized for Hugging Face models
#104	config = {
#105	"reranker": {
#106	"provider": "huggingface",
#107	"config": {
#108	"model": "BAAI/bge-reranker-base",
#109	"use_fp16": True, # Half precision for speed
#110	"max_length": 512,
#111	"batch_size": 8,
#112	"top_n": 10
#113	}
#114	}
#115	}
#116	```
#117
#118	### LLM Reranker Optimization
#119
#120	```python
#121	# Optimized LLM reranker configuration
#122	config = {
#123	"reranker": {
#124	"provider": "llm_reranker",
#125	"config": {
#126	"llm": {
#127	"provider": "openai",
#128	"config": {
#129	"model": "gpt-3.5-turbo", # Faster than gpt-4
#130	"temperature": 0, # Deterministic results
#131	"max_tokens": 500 # Limit response length
#132	}
#133	},
#134	"batch_ranking": True, # Rank multiple at once
#135	"top_n": 5, # Fewer results for faster processing
#136	"timeout": 10 # Request timeout
#137	}
#138	}
#139	}
#140	```
#141
#142	## Performance Monitoring
#143
#144	### Latency Tracking
#145	```python
#146	import time
#147	from mem0 import Memory
#148
def measure_reranker_performance(config, queries, user_id):
    """Time ``Memory.search`` for every query and summarize the latencies.

    Args:
        config: Configuration dict handed to ``Memory.from_config``.
        queries: Iterable of query strings; must yield at least one query.
        user_id: Identifier forwarded to ``Memory.search`` as ``user_id``.

    Returns:
        Dict with ``avg_latency``, ``max_latency`` and ``min_latency``,
        each in seconds.

    Raises:
        ValueError: If ``queries`` is empty.
    """
    queries = list(queries)
    if not queries:
        # Fail with a clear message before building the Memory instance,
        # instead of a ZeroDivisionError when averaging an empty list.
        raise ValueError("queries must contain at least one query")

    memory = Memory.from_config(config)

    latencies = []
    for query in queries:
        # perf_counter() is monotonic, so a system clock adjustment cannot
        # distort (or make negative) the measured interval.
        start_time = time.perf_counter()
        memory.search(query, user_id=user_id)
        latencies.append(time.perf_counter() - start_time)

    return {
        "avg_latency": sum(latencies) / len(latencies),
        "max_latency": max(latencies),
        "min_latency": min(latencies),
    }
#164	```
#165
#166	### Memory Usage Monitoring
#167	```python
#168	import psutil
#169	import os
#170
def monitor_memory_usage():
    """Report the memory footprint of the current process.

    Returns:
        Dict with ``memory_mb`` (the ``rss`` value from psutil's
        ``memory_info()`` divided by 1024 twice) and ``memory_percent``
        (the value returned by psutil's ``Process.memory_percent()``).
    """
    current = psutil.Process(os.getpid())
    rss_bytes = current.memory_info().rss
    usage = {"memory_mb": rss_bytes / 1024 / 1024}
    usage["memory_percent"] = current.memory_percent()
    return usage
#177	```
#178
#179	## Caching Strategies
#180
#181	### Result Caching
#182	```python
#183	from functools import lru_cache
#184	import hashlib
#185
class CachedReranker:
    """Wrap a ``Memory`` instance with a per-instance LRU cache of searches.

    Results are cached by ``(query, user_id)``. Both values must be
    hashable. A cache hit returns the very same result object as the first
    call, so callers should not mutate it. Cached results are not refreshed
    when the underlying memories change; call ``clear_cache()`` after
    writes.
    """

    def __init__(self, config):
        """Build the wrapped ``Memory`` from ``config`` and an empty cache."""
        self.memory = Memory.from_config(config)
        self.cache_size = 1000
        self.clear_cache()

    def clear_cache(self):
        """Drop every cached result and apply the current ``cache_size``."""
        # The cache wraps a bound method and lives on the instance, so it is
        # not shared between instances and is released together with this
        # object (a class-level @lru_cache would keep every instance alive).
        self._cached_search = lru_cache(maxsize=self.cache_size)(
            self._search_uncached
        )

    def _search_uncached(self, query, user_id):
        """Run the search against the wrapped ``Memory`` without caching."""
        return self.memory.search(query, user_id=user_id)

    def search_cached(self, query, user_id):
        """Return the cached result for ``(query, user_id)``, searching on a miss.

        The first argument is the query text itself: the cache must be able
        to run the search on a miss, which is impossible from a digest.
        """
        return self._cached_search(query, user_id)

    def search(self, query, user_id):
        """Search memories for ``query`` on behalf of ``user_id``, using the cache."""
        return self.search_cached(query, user_id)
#198	```
#199
#200	### Model Caching
#201	```python
#202	# Pre-load models to avoid initialization overhead
#203	config = {
#204	"reranker": {
#205	"provider": "sentence_transformer",
#206	"config": {
#207	"model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
#208	"cache_folder": "/path/to/model/cache",
#209	"device": "cuda"
#210	}
#211	}
#212	}
#213	```
#214
#215	## Parallel Processing
#216
#217	### Async Configuration
#218	```python
#219	import asyncio
#220	from mem0 import Memory
#221
async def parallel_search(config, queries, user_id):
    """Run one search per query concurrently and return all results.

    Args:
        config: Configuration dict handed to ``Memory.from_config``.
        queries: Iterable of query strings.
        user_id: Identifier forwarded to each search as ``user_id``.

    Returns:
        The list produced by ``asyncio.gather``: one entry per query, in the
        same order as ``queries``.
    """
    memory = Memory.from_config(config)

    # NOTE(review): relies on Memory exposing an awaitable ``search_async``;
    # confirm this against the installed mem0 version.
    pending = []
    for query in queries:
        pending.append(memory.search_async(query, user_id=user_id))

    # Await every search together so they overlap instead of running serially.
    return await asyncio.gather(*pending)
#233	```
#234
#235	## Hardware Optimization
#236
#237	### GPU Configuration
#238	```python
# Optimize for GPU usage
import torch

# Decide once whether CUDA can be used, so the guard below and the
# configuration agree instead of hard-coding "cuda" on a machine without it.
use_gpu = torch.cuda.is_available()

if use_gpu:
    # Cap (not reserve) this process's CUDA caching allocator at 80% of the
    # device memory; allocations beyond the cap raise an out-of-memory error.
    torch.cuda.set_per_process_memory_fraction(0.8)

config = {
    "reranker": {
        "provider": "sentence_transformer",
        "config": {
            "device": "cuda" if use_gpu else "cpu",
            "model": "cross-encoder/ms-marco-electra-base",
            "batch_size": 64,  # Larger batch for GPU
            "fp16": use_gpu,  # Half precision only when running on the GPU
        },
    }
}
#256	```
#257
#258	### CPU Optimization
#259	```python
#260	import torch
#261
#262	# Optimize CPU threading
#263	torch.set_num_threads(4) # Adjust based on your CPU
#264
#265	config = {
#266	"reranker": {
#267	"provider": "sentence_transformer",
#268	"config": {
#269	"device": "cpu",
#270	"model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
#271	"num_workers": 4 # Parallel processing
#272	}
#273	}
#274	}
#275	```
#276
#277	## Benchmarking Different Configurations
#278
#279	```python
def benchmark_rerankers():
    """Measure search latency for a fixed set of reranker configurations.

    Each candidate is wrapped as a ``{"reranker": ...}`` configuration and
    timed with ``measure_reranker_performance`` over the same three sample
    queries for the user ``"test_user"``.

    Returns:
        Dict mapping each provider name to the latency summary returned by
        ``measure_reranker_performance``.
    """
    candidates = [
        {"provider": "cohere", "model": "rerank-english-v3.0"},
        {
            "provider": "sentence_transformer",
            "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        },
        {"provider": "huggingface", "model": "BAAI/bge-reranker-base"},
    ]
    test_queries = ["sample query 1", "sample query 2", "sample query 3"]

    # One measurement per candidate, keyed by provider name. The candidate
    # dict itself is passed through as the provider's "config".
    return {
        candidate["provider"]: measure_reranker_performance(
            {
                "reranker": {
                    "provider": candidate["provider"],
                    "config": candidate,
                }
            },
            test_queries,
            "test_user",
        )
        for candidate in candidates
    }
#300	```
#301
#302	## Production Best Practices
#303
#304	1. Model Selection: Choose the right balance of speed vs. accuracy
#305	2. Resource Allocation: Monitor CPU/GPU usage and memory consumption
#306	3. Error Handling: Implement fallbacks for reranker failures
#307	4. Load Balancing: Distribute reranking load across multiple instances
#308	5. Monitoring: Track latency, throughput, and error rates
#309	6. Caching: Cache frequent queries and model predictions
#310	7. Batch Processing: Group similar queries for efficient processing

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public