my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	import logging
#2	import os
#3	from typing import Any, Dict, List, Optional, Union
#4
#5	from pydantic import BaseModel
#6
#7	try:
#8	from pinecone import Pinecone, PodSpec, ServerlessSpec, Vector
#9	except ImportError:
#10	raise ImportError(
#11	"Pinecone requires extra dependencies. Install with `pip install pinecone pinecone-text`"
#12	) from None
#13
#14	from mem0.vector_stores.base import VectorStoreBase
#15
#16	logger = logging.getLogger(__name__)
#17
#18
class OutputData(BaseModel):
    """Single record handed back by the vector store (fetch or query hit)."""

    # Identifier of the stored memory.
    id: Optional[str]
    # Score attached to the record (the original inline note calls it a distance).
    score: Optional[float]
    # Metadata stored alongside the vector.
    payload: Optional[Dict]
#23
#24
class PineconeDB(VectorStoreBase):
    """Vector store implementation backed by a Pinecone index."""

    def __init__(
        self,
        collection_name: str,
        embedding_model_dims: int,
        client: Optional["Pinecone"] = None,
        api_key: Optional[str] = None,
        environment: Optional[str] = None,
        serverless_config: Optional[Dict[str, Any]] = None,
        pod_config: Optional[Dict[str, Any]] = None,
        hybrid_search: bool = False,
        metric: str = "cosine",
        batch_size: int = 100,
        extra_params: Optional[Dict[str, Any]] = None,
        namespace: Optional[str] = None,
    ):
        """
        Initialize the Pinecone vector store.

        Connects to Pinecone (or reuses ``client``), optionally sets up a BM25
        sparse encoder for hybrid search, and makes sure the index exists.

        Args:
            collection_name (str): Name of the index/collection.
            embedding_model_dims (int): Dimensions of the embedding model.
            client (Pinecone, optional): Existing Pinecone client instance. Defaults to None.
            api_key (str, optional): API key for Pinecone. Falls back to the
                ``PINECONE_API_KEY`` environment variable. Defaults to None.
            environment (str, optional): Pinecone environment. Stored on the instance
                only; nothing in this class reads it. Defaults to None.
            serverless_config (Dict, optional): Keyword arguments for ``ServerlessSpec``.
                Defaults to None.
            pod_config (Dict, optional): Keyword arguments for ``PodSpec``; only used
                when ``serverless_config`` is empty. Defaults to None.
            hybrid_search (bool, optional): Whether to enable hybrid search. Defaults to False.
            metric (str, optional): Distance metric for vector similarity. Defaults to "cosine".
            batch_size (int, optional): Number of records per upsert call. Defaults to 100.
            extra_params (Dict, optional): Additional keyword arguments for the Pinecone
                client constructor. Defaults to None.
            namespace (str, optional): Namespace for the collection. Defaults to None.

        Raises:
            ValueError: If no client is given and no API key can be found.
        """
        if client:
            self.client = client
        else:
            api_key = api_key or os.environ.get("PINECONE_API_KEY")
            if not api_key:
                raise ValueError(
                    "Pinecone API key must be provided either as a parameter or as an environment variable"
                )

            params = extra_params or {}
            self.client = Pinecone(api_key=api_key, **params)

        self.collection_name = collection_name
        self.embedding_model_dims = embedding_model_dims
        self.environment = environment
        self.serverless_config = serverless_config
        self.pod_config = pod_config
        self.hybrid_search = hybrid_search
        self.metric = metric
        self.batch_size = batch_size
        self.namespace = namespace

        self.sparse_encoder = None
        if self.hybrid_search:
            try:
                # Imported lazily so pinecone-text stays an optional dependency.
                from pinecone_text.sparse import BM25Encoder

                logger.info("Initializing BM25Encoder for sparse vectors...")
                self.sparse_encoder = BM25Encoder.default()
            except ImportError:
                logger.warning("pinecone-text not installed. Hybrid search will be disabled.")
                self.hybrid_search = False

        self.create_col(embedding_model_dims, metric)

    def create_col(self, vector_size: int, metric: str = "cosine"):
        """
        Create the index/collection if it does not exist and bind ``self.index`` to it.

        Args:
            vector_size (int): Size of the vectors to be stored.
            metric (str, optional): Distance metric for vector similarity. Defaults to "cosine".
        """
        existing_indexes = self.list_cols().names()

        if self.collection_name in existing_indexes:
            logger.debug("Index %s already exists. Skipping creation.", self.collection_name)
            self.index = self.client.Index(self.collection_name)
            return

        # Spec precedence: explicit serverless config, then pod config, then a
        # hard-coded serverless default.
        if self.serverless_config:
            spec = ServerlessSpec(**self.serverless_config)
        elif self.pod_config:
            spec = PodSpec(**self.pod_config)
        else:
            spec = ServerlessSpec(cloud="aws", region="us-west-2")

        self.client.create_index(
            name=self.collection_name,
            dimension=vector_size,
            metric=metric,
            spec=spec,
        )

        self.index = self.client.Index(self.collection_name)

    def insert(
        self,
        vectors: List[List[float]],
        payloads: Optional[List[Dict]] = None,
        ids: Optional[List[Union[str, int]]] = None,
    ):
        """
        Insert vectors into an index, upserting in batches of ``self.batch_size``.

        Args:
            vectors (list): List of vectors to insert.
            payloads (list, optional): List of payloads corresponding to vectors. Defaults to None.
            ids (list, optional): List of IDs corresponding to vectors. When omitted,
                the position of each vector in ``vectors`` is used as its ID.
                Defaults to None.
        """
        logger.info("Inserting %s vectors into index %s", len(vectors), self.collection_name)
        items = []

        for idx, vector in enumerate(vectors):
            item_id = str(ids[idx]) if ids is not None else str(idx)
            # Normalise a missing or None payload entry to {} so the "text"
            # membership test below cannot fail on None.
            payload = (payloads[idx] if payloads else None) or {}

            vector_record = {"id": item_id, "values": vector, "metadata": payload}

            if self.hybrid_search and self.sparse_encoder and "text" in payload:
                vector_record["sparse_values"] = self.sparse_encoder.encode_documents(payload["text"])

            items.append(vector_record)

            if len(items) >= self.batch_size:
                self.index.upsert(vectors=items, namespace=self.namespace)
                items = []

        # Flush the final partial batch.
        if items:
            self.index.upsert(vectors=items, namespace=self.namespace)

    def _parse_output(self, data: Any) -> Union[OutputData, List[OutputData]]:
        """
        Convert Pinecone results into ``OutputData``.

        Args:
            data: Either a single ``Vector`` (as returned by fetch) or an iterable of
                matches exposing ``.get("id" / "score" / "metadata")``.

        Returns:
            OutputData for a single ``Vector`` (score fixed at 0.0, since a fetch
            carries no score), otherwise a list of OutputData, one per match.
        """
        if isinstance(data, Vector):
            return OutputData(
                id=data.id,
                score=0.0,
                payload=data.metadata,
            )

        return [
            OutputData(
                id=match.get("id"),
                score=match.get("score"),
                payload=match.get("metadata"),
            )
            for match in data
        ]

    def _create_filter(self, filters: Optional[Dict]) -> Dict:
        """
        Translate a simple filter mapping into a Pinecone metadata filter.

        A dict value containing ``gte`` and/or ``lte`` becomes a range condition
        (``$gte`` / ``$lte``); any other value becomes an ``$eq`` condition.

        Args:
            filters (dict, optional): Mapping of metadata key to value or range dict.

        Returns:
            dict: Pinecone filter; empty when ``filters`` is empty or None.
        """
        if not filters:
            return {}

        pinecone_filter = {}

        for key, value in filters.items():
            if isinstance(value, dict) and ("gte" in value or "lte" in value):
                # Emit only the bounds that were supplied, so a one-sided range
                # is not wrapped in "$eq" with a dict operand.
                pinecone_filter[key] = {f"${op}": value[op] for op in ("gte", "lte") if op in value}
            else:
                pinecone_filter[key] = {"$eq": value}

        return pinecone_filter

    def search(
        self, query: str, vectors: List[float], limit: int = 5, filters: Optional[Dict] = None
    ) -> List[OutputData]:
        """
        Search for similar vectors.

        Args:
            query (str): Query. Not used by this implementation.
            vectors (list): Query embedding.
            limit (int, optional): Number of results to return. Defaults to 5.
            filters (dict, optional): Filters to apply to the search. In hybrid mode
                the ``text`` entry, if present, is also encoded as the sparse query.
                Defaults to None.

        Returns:
            list: Search results.
        """
        filter_dict = self._create_filter(filters) if filters else None

        query_params = {
            "vector": vectors,
            "top_k": limit,
            "include_metadata": True,
            "include_values": False,
        }

        if filter_dict:
            query_params["filter"] = filter_dict

        # NOTE(review): the "text" entry is used both as a metadata equality filter
        # (above) and as the sparse query text (below) - confirm this is intended.
        # `filters` may be None, so guard before the lookup.
        query_text = filters.get("text") if filters else None
        if self.hybrid_search and self.sparse_encoder and query_text:
            query_params["sparse_vector"] = self.sparse_encoder.encode_queries(query_text)

        response = self.index.query(**query_params, namespace=self.namespace)

        return self._parse_output(response.matches)

    def delete(self, vector_id: Union[str, int]):
        """
        Delete a vector by ID.

        Args:
            vector_id (Union[str, int]): ID of the vector to delete.
        """
        self.index.delete(ids=[str(vector_id)], namespace=self.namespace)

    def update(self, vector_id: Union[str, int], vector: Optional[List[float]] = None, payload: Optional[Dict] = None):
        """
        Update a vector and/or its payload.

        When ``vector`` is given the record is upserted (an upsert without
        ``metadata`` replaces the whole record). When only ``payload`` is given the
        metadata is changed through Pinecone's update operation, because an upsert
        needs vector values. When neither is given nothing is sent.

        Args:
            vector_id (Union[str, int]): ID of the vector to update.
            vector (list, optional): Updated vector. Defaults to None.
            payload (dict, optional): Updated payload. Defaults to None.
        """
        item_id = str(vector_id)

        if vector is None and payload is None:
            logger.debug("Nothing to update for vector %s", item_id)
            return

        # `payload` may be None, so check it before the "text" lookup.
        sparse_vector = None
        if payload and self.hybrid_search and self.sparse_encoder and "text" in payload:
            sparse_vector = self.sparse_encoder.encode_documents(payload["text"])

        if vector is None:
            # Metadata-only change: upsert would need "values", so use update.
            update_kwargs = {"id": item_id, "set_metadata": payload, "namespace": self.namespace}
            if sparse_vector is not None:
                update_kwargs["sparse_values"] = sparse_vector
            self.index.update(**update_kwargs)
            return

        item = {"id": item_id, "values": vector}
        if payload is not None:
            item["metadata"] = payload
        if sparse_vector is not None:
            item["sparse_values"] = sparse_vector

        self.index.upsert(vectors=[item], namespace=self.namespace)

    def get(self, vector_id: Union[str, int]) -> Optional[OutputData]:
        """
        Retrieve a vector by ID.

        Args:
            vector_id (Union[str, int]): ID of the vector to retrieve.

        Returns:
            OutputData: Retrieved vector, or None if it is not found or the fetch
            raises (the error is logged, not propagated).
        """
        key = str(vector_id)
        try:
            response = self.index.fetch(ids=[key], namespace=self.namespace)
            if key in response.vectors:
                return self._parse_output(response.vectors[key])
            return None
        except Exception as e:
            # Best-effort lookup: callers get None on any failure.
            logger.error(f"Error retrieving vector {vector_id}: {e}")
            return None

    def list_cols(self):
        """
        List all indexes/collections.

        Returns:
            The result of the client's ``list_indexes()`` call.
        """
        return self.client.list_indexes()

    def delete_col(self):
        """Delete the index/collection; failures are logged, not raised."""
        try:
            self.client.delete_index(self.collection_name)
            logger.info(f"Index {self.collection_name} deleted successfully")
        except Exception as e:
            logger.error(f"Error deleting index {self.collection_name}: {e}")

    def col_info(self) -> Dict:
        """
        Get information about an index/collection.

        Returns:
            dict: Index information as returned by ``describe_index``.
        """
        return self.client.describe_index(self.collection_name)

    def list(self, filters: Optional[Dict] = None, limit: int = 100) -> List[OutputData]:
        """
        List vectors in an index with optional filtering.

        Implemented as a query with an all-zero vector of the index's dimension.

        Args:
            filters (dict, optional): Filters to apply to the list. Defaults to None.
            limit (int, optional): Number of vectors to return. Defaults to 100.

        Returns:
            list: A one-element list whose only item is the list of OutputData.
            On a query error the inner list is empty, so both paths have the
            same shape.
        """
        filter_dict = self._create_filter(filters) if filters else None

        stats = self.index.describe_index_stats()
        zero_vector = [0.0] * stats.dimension

        query_params = {
            "vector": zero_vector,
            "top_k": limit,
            "include_metadata": True,
            "include_values": True,
        }

        if filter_dict:
            query_params["filter"] = filter_dict

        try:
            response = self.index.query(**query_params, namespace=self.namespace)
            response = response.to_dict()
            results = self._parse_output(response["matches"])
            return [results]
        except Exception as e:
            logger.error(f"Error listing vectors: {e}")
            # Same wrapper shape as the success path (previously a dict).
            return [[]]

    def count(self) -> int:
        """
        Count number of vectors in the index.

        Returns:
            int: Vector count of the configured namespace when one is set
            (0 if the namespace is absent from the stats), otherwise the
            index-wide total.
        """
        stats = self.index.describe_index_stats()
        if self.namespace:
            # Safely get the namespace stats and return vector_count, defaulting to 0 if not found
            namespace_summary = (stats.namespaces or {}).get(self.namespace)
            if namespace_summary:
                return namespace_summary.vector_count or 0
            return 0
        return stats.total_vector_count or 0

    def reset(self):
        """
        Reset the index by deleting and recreating it.

        ``delete_col`` swallows errors, so if deletion fails ``create_col`` will
        find the index still present and simply rebind to it.
        """
        logger.warning(f"Resetting index {self.collection_name}...")
        self.delete_col()
        self.create_col(self.embedding_model_dims, self.metric)

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public