repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources16d ago| #1 | import logging |
| #2 | import os |
| #3 | from typing import Any, Dict, List, Optional, Union |
| #4 | |
| #5 | from pydantic import BaseModel |
| #6 | |
| #7 | try: |
| #8 | from pinecone import Pinecone, PodSpec, ServerlessSpec, Vector |
| #9 | except ImportError: |
| #10 | raise ImportError( |
| #11 | "Pinecone requires extra dependencies. Install with `pip install pinecone pinecone-text`" |
| #12 | ) from None |
| #13 | |
| #14 | from mem0.vector_stores.base import VectorStoreBase |
| #15 | |
| #16 | logger = logging.getLogger(__name__) |
| #17 | |
| #18 | |
class OutputData(BaseModel):
    """Normalized record built from Pinecone fetch/query results in this module."""

    id: Optional[str]  # memory id
    score: Optional[float]  # distance; match score for query results, 0.0 for records fetched by id
    payload: Optional[Dict]  # metadata stored alongside the vector
| #23 | |
| #24 | |
class PineconeDB(VectorStoreBase):
    """Vector store implementation backed by a single Pinecone index.

    The index named ``collection_name`` is created on construction when it does
    not exist yet. All data operations are scoped to ``namespace`` when set.
    """

    def __init__(
        self,
        collection_name: str,
        embedding_model_dims: int,
        client: Optional["Pinecone"] = None,
        api_key: Optional[str] = None,
        environment: Optional[str] = None,
        serverless_config: Optional[Dict[str, Any]] = None,
        pod_config: Optional[Dict[str, Any]] = None,
        hybrid_search: bool = False,
        metric: str = "cosine",
        batch_size: int = 100,
        extra_params: Optional[Dict[str, Any]] = None,
        namespace: Optional[str] = None,
    ):
        """
        Initialize the Pinecone vector store.

        Args:
            collection_name (str): Name of the index/collection.
            embedding_model_dims (int): Dimensions of the embedding model.
            client (Pinecone, optional): Existing Pinecone client instance. Defaults to None.
            api_key (str, optional): API key for Pinecone. Falls back to the
                ``PINECONE_API_KEY`` environment variable. Defaults to None.
            environment (str, optional): Pinecone environment. Stored on the instance only.
                Defaults to None.
            serverless_config (Dict, optional): Keyword arguments for ``ServerlessSpec``.
                Defaults to None.
            pod_config (Dict, optional): Keyword arguments for ``PodSpec``; used only when
                ``serverless_config`` is not given. Defaults to None.
            hybrid_search (bool, optional): Whether to enable hybrid search. Defaults to False.
            metric (str, optional): Distance metric for vector similarity. Defaults to "cosine".
            batch_size (int, optional): Batch size for upsert operations. Defaults to 100.
            extra_params (Dict, optional): Additional keyword arguments for the Pinecone
                client constructor. Defaults to None.
            namespace (str, optional): Namespace for the collection. Defaults to None.

        Raises:
            ValueError: If no client is supplied and no API key can be resolved.
        """
        if client is not None:
            self.client = client
        else:
            api_key = api_key or os.environ.get("PINECONE_API_KEY")
            if not api_key:
                raise ValueError(
                    "Pinecone API key must be provided either as a parameter or as an environment variable"
                )

            params = extra_params or {}
            self.client = Pinecone(api_key=api_key, **params)

        self.collection_name = collection_name
        self.embedding_model_dims = embedding_model_dims
        self.environment = environment
        self.serverless_config = serverless_config
        self.pod_config = pod_config
        self.hybrid_search = hybrid_search
        self.metric = metric
        self.batch_size = batch_size
        self.namespace = namespace

        self.sparse_encoder = None
        if self.hybrid_search:
            # pinecone-text is an optional dependency: degrade to dense-only search
            # instead of failing construction when it is missing.
            try:
                from pinecone_text.sparse import BM25Encoder

                logger.info("Initializing BM25Encoder for sparse vectors...")
                self.sparse_encoder = BM25Encoder.default()
            except ImportError:
                logger.warning("pinecone-text not installed. Hybrid search will be disabled.")
                self.hybrid_search = False

        self.create_col(embedding_model_dims, metric)

    def create_col(self, vector_size: int, metric: str = "cosine"):
        """
        Create the index/collection if it does not exist and bind ``self.index`` to it.

        Args:
            vector_size (int): Size of the vectors to be stored.
            metric (str, optional): Distance metric for vector similarity. Defaults to "cosine".
        """
        # NOTE(review): Pinecone's hybrid (sparse-dense) search is documented as requiring
        # the "dotproduct" metric - confirm callers enabling hybrid_search set it.
        existing_indexes = self.list_cols().names()

        if self.collection_name in existing_indexes:
            logger.debug(f"Index {self.collection_name} already exists. Skipping creation.")
            self.index = self.client.Index(self.collection_name)
            return

        # Spec precedence: explicit serverless config, then pod config, then a
        # hard-coded serverless default.
        if self.serverless_config:
            spec = ServerlessSpec(**self.serverless_config)
        elif self.pod_config:
            spec = PodSpec(**self.pod_config)
        else:
            spec = ServerlessSpec(cloud="aws", region="us-west-2")

        self.client.create_index(
            name=self.collection_name,
            dimension=vector_size,
            metric=metric,
            spec=spec,
        )

        self.index = self.client.Index(self.collection_name)

    def _sparse_values(self, payload: Optional[Dict]) -> Optional[Dict]:
        """Return BM25 sparse values for ``payload["text"]``, or None when hybrid search is
        off, no encoder is available, or the payload carries no "text" entry."""
        if not (self.hybrid_search and self.sparse_encoder and payload and "text" in payload):
            return None
        return self.sparse_encoder.encode_documents(payload["text"])

    def insert(
        self,
        vectors: List[List[float]],
        payloads: Optional[List[Dict]] = None,
        ids: Optional[List[Union[str, int]]] = None,
    ):
        """
        Insert vectors into the index, upserting in batches of ``self.batch_size``.

        Args:
            vectors (list): List of vectors to insert.
            payloads (list, optional): List of payloads corresponding to vectors. Defaults to None.
            ids (list, optional): List of IDs corresponding to vectors. When omitted, the
                position of each vector in ``vectors`` is used as its ID. Defaults to None.
        """
        logger.info(f"Inserting {len(vectors)} vectors into index {self.collection_name}")
        items = []

        for idx, vector in enumerate(vectors):
            # NOTE: positional fallback IDs ("0", "1", ...) repeat across calls, so
            # records inserted without explicit ids overwrite earlier ones.
            item_id = str(ids[idx]) if ids is not None else str(idx)
            payload = payloads[idx] if payloads else {}

            vector_record = {"id": item_id, "values": vector, "metadata": payload}

            sparse_vector = self._sparse_values(payload)
            if sparse_vector is not None:
                vector_record["sparse_values"] = sparse_vector

            items.append(vector_record)

            # Flush a full batch as soon as it is reached.
            if len(items) >= self.batch_size:
                self.index.upsert(vectors=items, namespace=self.namespace)
                items = []

        # Flush the final partial batch.
        if items:
            self.index.upsert(vectors=items, namespace=self.namespace)

    def _parse_output(self, data: Any) -> Union[OutputData, List[OutputData]]:
        """
        Parse the output data from Pinecone fetch/query results.

        Args:
            data: Either a single fetched ``Vector`` or an iterable of matches that
                expose ``get("id")``, ``get("score")`` and ``get("metadata")``.

        Returns:
            OutputData for a single ``Vector`` (score fixed at 0.0, since a fetch has no
            similarity score), otherwise a list of OutputData, one per match.
        """
        if isinstance(data, Vector):
            return OutputData(
                id=data.id,
                score=0.0,
                payload=data.metadata,
            )

        return [
            OutputData(
                id=match.get("id"),
                score=match.get("score"),
                payload=match.get("metadata"),
            )
            for match in data
        ]

    def _create_filter(self, filters: Optional[Dict]) -> Dict:
        """
        Translate a simple filter mapping into a Pinecone metadata filter.

        A dict value carrying "gte" and/or "lte" becomes a ``$gte``/``$lte`` range;
        every other value becomes an ``$eq`` comparison.

        Args:
            filters (dict, optional): Mapping of metadata key to value or range dict.

        Returns:
            dict: Pinecone filter, or an empty dict when ``filters`` is empty/None.
        """
        if not filters:
            return {}

        pinecone_filter: Dict[str, Any] = {}

        for key, value in filters.items():
            if isinstance(value, dict) and ("gte" in value or "lte" in value):
                # Accept one-sided ranges too; wrapping them in $eq would compare
                # the metadata field against a dict and never be a valid filter.
                bounds = {}
                if "gte" in value:
                    bounds["$gte"] = value["gte"]
                if "lte" in value:
                    bounds["$lte"] = value["lte"]
                pinecone_filter[key] = bounds
            else:
                pinecone_filter[key] = {"$eq": value}

        return pinecone_filter

    def search(
        self, query: str, vectors: List[float], limit: int = 5, filters: Optional[Dict] = None
    ) -> List[OutputData]:
        """
        Search for similar vectors.

        Args:
            query (str): Query text. Not used by this implementation; the sparse query is
                built from ``filters["text"]`` when hybrid search is enabled.
            vectors (list): Dense query vector.
            limit (int, optional): Number of results to return. Defaults to 5.
            filters (dict, optional): Filters to apply to the search. Defaults to None.

        Returns:
            list: Search results.
        """
        filter_dict = self._create_filter(filters)

        query_params = {
            "vector": vectors,
            "top_k": limit,
            "include_metadata": True,
            "include_values": False,
        }

        if filter_dict:
            query_params["filter"] = filter_dict

        # `filters` may be None; guard before the membership test so hybrid search
        # without filters does not raise TypeError.
        if self.hybrid_search and self.sparse_encoder and filters and "text" in filters:
            query_text = filters.get("text")
            if query_text:
                sparse_vector = self.sparse_encoder.encode_queries(query_text)
                query_params["sparse_vector"] = sparse_vector

        response = self.index.query(**query_params, namespace=self.namespace)

        return self._parse_output(response.matches)

    def delete(self, vector_id: Union[str, int]):
        """
        Delete a vector by ID.

        Args:
            vector_id (Union[str, int]): ID of the vector to delete.
        """
        self.index.delete(ids=[str(vector_id)], namespace=self.namespace)

    def update(self, vector_id: Union[str, int], vector: Optional[List[float]] = None, payload: Optional[Dict] = None):
        """
        Update a vector and/or its payload.

        When ``vector`` is given the record is upserted (values, and metadata if given).
        When only ``payload`` is given, the record's metadata is updated in place via
        the index ``update`` operation, because an upsert requires vector values.

        Args:
            vector_id (Union[str, int]): ID of the vector to update.
            vector (list, optional): Updated vector. Defaults to None.
            payload (dict, optional): Updated payload. Defaults to None.
        """
        item_id = str(vector_id)
        sparse_vector = self._sparse_values(payload)

        if vector is None:
            if payload is None:
                logger.warning(f"Nothing to update for vector {item_id}: no vector or payload given")
                return

            # Metadata-only change. NOTE(review): Pinecone documents set_metadata as
            # overwriting the given fields and leaving others untouched - confirm this
            # merge behaviour is acceptable to callers.
            update_kwargs: Dict[str, Any] = {
                "id": item_id,
                "set_metadata": payload,
                "namespace": self.namespace,
            }
            if sparse_vector is not None:
                update_kwargs["sparse_values"] = sparse_vector
            self.index.update(**update_kwargs)
            return

        item: Dict[str, Any] = {"id": item_id, "values": vector}

        if payload is not None:
            item["metadata"] = payload

        if sparse_vector is not None:
            item["sparse_values"] = sparse_vector

        self.index.upsert(vectors=[item], namespace=self.namespace)

    def get(self, vector_id: Union[str, int]) -> Optional[OutputData]:
        """
        Retrieve a vector by ID.

        Args:
            vector_id (Union[str, int]): ID of the vector to retrieve.

        Returns:
            OutputData: Retrieved vector, or None if it is not found or the fetch fails
            (failures are logged, not raised).
        """
        item_id = str(vector_id)
        try:
            response = self.index.fetch(ids=[item_id], namespace=self.namespace)
            if item_id in response.vectors:
                return self._parse_output(response.vectors[item_id])
            return None
        except Exception as e:
            logger.error(f"Error retrieving vector {vector_id}: {e}")
            return None

    def list_cols(self):
        """
        List all indexes/collections.

        Returns:
            The result of the client's ``list_indexes()`` call.
        """
        return self.client.list_indexes()

    def delete_col(self):
        """Delete the index/collection; failures are logged and not raised."""
        try:
            self.client.delete_index(self.collection_name)
            logger.info(f"Index {self.collection_name} deleted successfully")
        except Exception as e:
            logger.error(f"Error deleting index {self.collection_name}: {e}")

    def col_info(self) -> Dict:
        """
        Get information about the index/collection.

        Returns:
            dict: Index information as returned by the client's ``describe_index``.
        """
        return self.client.describe_index(self.collection_name)

    def list(self, filters: Optional[Dict] = None, limit: int = 100) -> List[OutputData]:
        """
        List vectors in the index with optional filtering.

        Implemented as a query with an all-zero vector, so at most ``limit`` records
        are returned.

        Args:
            filters (dict, optional): Filters to apply to the list. Defaults to None.
            limit (int, optional): Number of vectors to return. Defaults to 100.

        Returns:
            list: A one-element list wrapping the list of OutputData records. On error
            the failure is logged and ``[[]]`` is returned, keeping the same shape.
        """
        filter_dict = self._create_filter(filters)

        stats = self.index.describe_index_stats()
        dimension = stats.dimension

        zero_vector = [0.0] * dimension

        query_params = {
            "vector": zero_vector,
            "top_k": limit,
            "include_metadata": True,
            "include_values": True,
        }

        if filter_dict:
            query_params["filter"] = filter_dict

        try:
            response = self.index.query(**query_params, namespace=self.namespace)
            response = response.to_dict()
            results = self._parse_output(response["matches"])
            return [results]
        except Exception as e:
            logger.error(f"Error listing vectors: {e}")
            # Same shape as the success path so callers indexing [0] keep working.
            return [[]]

    def count(self) -> int:
        """
        Count the number of vectors in the index.

        Returns:
            int: Vector count of ``self.namespace`` when a namespace is set (0 if the
            namespace has no stats), otherwise the total vector count of the index.
        """
        stats = self.index.describe_index_stats()
        if self.namespace:
            # Safely get the namespace stats and return vector_count, defaulting to 0 if not found
            namespace_summary = (stats.namespaces or {}).get(self.namespace)
            if namespace_summary:
                return namespace_summary.vector_count or 0
            return 0
        return stats.total_vector_count or 0

    def reset(self):
        """
        Reset the index by deleting and recreating it.
        """
        logger.warning(f"Resetting index {self.collection_name}...")
        self.delete_col()
        self.create_col(self.embedding_model_dims, self.metric)
| #383 |