repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources (16d ago)
| #1 | import ast |
| #2 | import concurrent.futures |
| #3 | import json |
| #4 | import logging |
| #5 | import os |
| #6 | from typing import Any, Optional, Union |
| #7 | |
| #8 | import requests |
| #9 | import yaml |
| #10 | from tqdm import tqdm |
| #11 | |
| #12 | from embedchain.cache import ( |
| #13 | Config, |
| #14 | ExactMatchEvaluation, |
| #15 | SearchDistanceEvaluation, |
| #16 | cache, |
| #17 | gptcache_data_manager, |
| #18 | gptcache_pre_function, |
| #19 | ) |
| #20 | from embedchain.client import Client |
| #21 | from embedchain.config import AppConfig, CacheConfig, ChunkerConfig, Mem0Config |
| #22 | from embedchain.core.db.database import get_session |
| #23 | from embedchain.core.db.models import DataSource |
| #24 | from embedchain.embedchain import EmbedChain |
| #25 | from embedchain.embedder.base import BaseEmbedder |
| #26 | from embedchain.embedder.openai import OpenAIEmbedder |
| #27 | from embedchain.evaluation.base import BaseMetric |
| #28 | from embedchain.evaluation.metrics import ( |
| #29 | AnswerRelevance, |
| #30 | ContextRelevance, |
| #31 | Groundedness, |
| #32 | ) |
| #33 | from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory |
| #34 | from embedchain.helpers.json_serializable import register_deserializable |
| #35 | from embedchain.llm.base import BaseLlm |
| #36 | from embedchain.llm.openai import OpenAILlm |
| #37 | from embedchain.telemetry.posthog import AnonymousTelemetry |
| #38 | from embedchain.utils.evaluation import EvalData, EvalMetric |
| #39 | from embedchain.utils.misc import validate_config |
| #40 | from embedchain.vectordb.base import BaseVectorDB |
| #41 | from embedchain.vectordb.chroma import ChromaDB |
| #42 | from mem0 import Memory |
| #43 | |
| #44 | logger = logging.getLogger(__name__) |
| #45 | |
| #46 | |
@register_deserializable
class App(EmbedChain):
    """
    EmbedChain App lets you create a LLM powered app for your unstructured
    data by defining your chosen data source, embedding model,
    and vector database.
    """

    def __init__(
        self,
        id: str = None,
        name: str = None,
        config: AppConfig = None,
        db: BaseVectorDB = None,
        embedding_model: BaseEmbedder = None,
        llm: BaseLlm = None,
        config_data: dict = None,
        auto_deploy: bool = False,
        chunker: ChunkerConfig = None,
        cache_config: CacheConfig = None,
        memory_config: Mem0Config = None,
        log_level: int = logging.WARN,
    ):
        """
        Initialize a new `App` instance.

        :param id: Id of an existing pipeline on the platform. When given, the pipeline
            details are fetched and its `local_id` is adopted, defaults to None
        :type id: str, optional
        :param name: Name of the app; overrides the name from `config`, defaults to None
        :type name: str, optional
        :param config: Configuration for the pipeline, defaults to None
        :type config: AppConfig, optional
        :param db: The database to use for storing and retrieving embeddings, defaults to None
        :type db: BaseVectorDB, optional
        :param embedding_model: The embedding model used to calculate embeddings, defaults to None
        :type embedding_model: BaseEmbedder, optional
        :param llm: The LLM used by the app, defaults to None
        :type llm: BaseLlm, optional
        :param config_data: Config dictionary, defaults to None
        :type config_data: dict, optional
        :param auto_deploy: Whether to deploy the pipeline automatically, defaults to False
        :type auto_deploy: bool, optional
        :param chunker: Chunker settings, either a mapping of `ChunkerConfig` keyword
            arguments or a ready `ChunkerConfig` instance, defaults to None
        :type chunker: Union[dict, ChunkerConfig], optional
        :param cache_config: When provided, the cache is initialized, defaults to None
        :type cache_config: CacheConfig, optional
        :param memory_config: When provided, a mem0 `Memory` is created, defaults to None
        :type memory_config: Mem0Config, optional
        :param log_level: Accepted for interface compatibility; not read by this constructor
        :type log_level: int, optional
        :raises Exception: If mutually exclusive arguments are combined or an error occurs
            while fetching the pipeline
        """
        if id and config_data:
            raise Exception("Cannot provide both id and config. Please provide only one of them.")

        if id and name:
            raise Exception("Cannot provide both id and name. Please provide only one of them.")

        if name and config:
            raise Exception("Cannot provide both name and config. Please provide only one of them.")

        self.auto_deploy = auto_deploy
        # Store the dict config as an attribute to be able to send it
        self.config_data = config_data if (config_data and validate_config(config_data)) else None
        self.client = None
        # pipeline_id from the backend
        self.id = None

        # Accept both the mapping form (what `from_config` passes) and an
        # already-built ChunkerConfig, which the annotation advertises.
        if not chunker:
            self.chunker = None
        elif isinstance(chunker, ChunkerConfig):
            self.chunker = chunker
        else:
            self.chunker = ChunkerConfig(**chunker)

        self.cache_config = cache_config
        self.memory_config = memory_config

        self.config = config or AppConfig()
        self.name = self.config.name
        self.config.id = self.local_id = "default-app-id" if self.config.id is None else self.config.id

        if id is not None:
            # Init client first since user is trying to fetch the pipeline
            # details from the platform
            self._init_client()
            pipeline_details = self._get_pipeline(id)
            self.config.id = self.local_id = pipeline_details["metadata"]["local_id"]
            self.id = id

        if name is not None:
            self.name = name

        self.embedding_model = embedding_model or OpenAIEmbedder()
        self.db = db or ChromaDB()
        self.llm = llm or OpenAILlm()
        self._init_db()

        # Session for the metadata db
        self.db_session = get_session()

        # If cache_config is provided, initializing the cache ...
        if self.cache_config is not None:
            self._init_cache()

        # If memory_config is provided, initializing the memory ...
        # NOTE(review): `memory_config` only toggles creation here; its contents are
        # not passed to `Memory()` - confirm this is intended.
        self.mem0_memory = None
        if self.memory_config is not None:
            self.mem0_memory = Memory()

        # Send anonymous telemetry
        self._telemetry_props = {"class": self.__class__.__name__}
        self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
        self.telemetry.capture(event_name="init", properties=self._telemetry_props)

        self.user_asks = []
        if self.auto_deploy:
            self.deploy()

    def _init_db(self):
        """
        Initialize the database.

        Attaches the embedding model to the vector database, initializes it and
        re-applies the collection name taken from the database config.
        """
        self.db._set_embedder(self.embedding_model)
        self.db._initialize()
        self.db.set_collection_name(self.db.config.collection_name)

    def _init_cache(self):
        """
        Initialize the response cache from `self.cache_config`.

        The "exact" strategy uses exact-match evaluation; every other strategy uses
        distance-based evaluation configured from the similarity eval config.
        """
        similarity_config = self.cache_config.similarity_eval_config
        if similarity_config.strategy == "exact":
            similarity_eval_func = ExactMatchEvaluation()
        else:
            similarity_eval_func = SearchDistanceEvaluation(
                max_distance=similarity_config.max_distance,
                positive=similarity_config.positive,
            )

        cache.init(
            pre_embedding_func=gptcache_pre_function,
            embedding_func=self.embedding_model.to_embeddings,
            data_manager=gptcache_data_manager(vector_dimension=self.embedding_model.vector_dimension),
            similarity_evaluation=similarity_eval_func,
            config=Config(**self.cache_config.init_config.as_dict()),
        )

    def _init_client(self):
        """
        Initialize the client.

        Uses the API key from the stored client config when present; otherwise
        prompts the user for one on standard input.
        """
        config = Client.load_config()
        if config.get("api_key"):
            self.client = Client()
        else:
            api_key = input(
                "🔑 Enter your Embedchain API key. You can find the API key at https://app.embedchain.ai/settings/keys/ \n"  # noqa: E501
            )
            self.client = Client(api_key=api_key)

    def _get_pipeline(self, id):
        """
        Get existing pipeline

        :param id: Id of the pipeline on the platform
        :return: The decoded JSON body of the pipeline details response
        :raises Exception: If the pipeline does not exist (HTTP 404)
        :raises requests.HTTPError: For any other unsuccessful response
        """
        print("🛠️ Fetching pipeline details from the platform...")
        url = f"{self.client.host}/api/v1/pipelines/{id}/cli/"
        r = requests.get(
            url,
            headers={"Authorization": f"Token {self.client.api_key}"},
        )
        if r.status_code == 404:
            raise Exception(f"❌ Pipeline with id {id} not found!")
        # Fail with the real HTTP error (auth, server, ...) instead of a confusing
        # KeyError/JSON error when reading the body below.
        r.raise_for_status()

        pipeline_details = r.json()
        print(
            f"🎉 Pipeline loaded successfully! Pipeline url: https://app.embedchain.ai/pipelines/{pipeline_details['id']}\n"  # noqa: E501
        )
        return pipeline_details

    def _create_pipeline(self):
        """
        Create a pipeline on the platform.

        :return: The decoded JSON body of the create response
        :raises Exception: If the platform answers with a status other than 200 or 201
        """
        print("🛠️ Creating pipeline on the platform...")
        # self.config_data is a dict. Pass it inside the key 'yaml_config' to the backend
        payload = {
            "yaml_config": json.dumps(self.config_data),
            "name": self.name,
            "local_id": self.local_id,
        }
        url = f"{self.client.host}/api/v1/pipelines/cli/create/"
        r = requests.post(
            url,
            json=payload,
            headers={"Authorization": f"Token {self.client.api_key}"},
        )
        if r.status_code not in (200, 201):
            raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}")

        pipeline_data = r.json()
        if r.status_code == 200:
            print(
                f"🎉🎉🎉 Existing pipeline found! View your pipeline: https://app.embedchain.ai/pipelines/{pipeline_data['id']}\n"  # noqa: E501
            )
        else:
            print(
                f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{pipeline_data['id']}\n"  # noqa: E501
            )
        return pipeline_data

    def _get_presigned_url(self, data_type, data_value):
        """
        Request a presigned upload URL for a data source.

        :param data_type: Type of the data source
        :param data_value: Value of the data source
        :return: The decoded JSON body of the response
        :raises requests.HTTPError: If the request is unsuccessful
        """
        r = requests.post(
            f"{self.client.host}/api/v1/pipelines/{self.id}/cli/presigned_url/",
            json={"data_type": data_type, "data_value": data_value},
            headers={"Authorization": f"Token {self.client.api_key}"},
        )
        r.raise_for_status()
        return r.json()

    def _upload_file_to_presigned_url(self, presigned_url, file_path):
        """
        Upload a local file to a presigned URL with an HTTP PUT.

        :param presigned_url: URL to upload to
        :param file_path: Path of the local file to upload
        :return: True if the upload answered with status 200, False otherwise
            (including when any error occurred, which is logged)
        :rtype: bool
        """
        try:
            with open(file_path, "rb") as file:
                response = requests.put(presigned_url, data=file)
                response.raise_for_status()
                return response.status_code == 200
        except Exception as e:
            logger.exception(f"Error occurred during file upload: {str(e)}")
            print("❌ Error occurred during file upload!")
            return False

    def _upload_data_to_pipeline(self, data_type, data_value, metadata=None):
        """
        Send a data source to the pipeline on the platform.

        Errors are reported on standard output rather than raised.

        :param data_type: Type of the data source
        :param data_value: Value of the data source
        :param metadata: Optional metadata sent along; its `file_path` entry, when
            present, is shown to the user instead of `data_value`
        :return: True if the data was added, False if the request failed
        :rtype: bool
        """
        payload = {
            "data_type": data_type,
            "data_value": data_value,
            "metadata": metadata,
        }
        try:
            self._send_api_request(f"/api/v1/pipelines/{self.id}/cli/add/", payload)
        except Exception as e:
            print(f"❌ Error occurred during data upload for type {data_type}!. Error: {str(e)}")
            return False

        # print the local file path if user tries to upload a local file
        # (`metadata` may legitimately be None, so guard before reading it)
        printed_value = (metadata or {}).get("file_path") or data_value
        print(f"✅ Data of type: {data_type}, value: {printed_value} added successfully.")
        return True

    def _send_api_request(self, endpoint, payload):
        """
        POST a JSON payload to a platform endpoint.

        :param endpoint: Path appended to the client host
        :param payload: JSON-serializable request body
        :return: The response object
        :raises requests.HTTPError: If the request is unsuccessful
        """
        response = requests.post(
            f"{self.client.host}{endpoint}",
            json=payload,
            headers={"Authorization": f"Token {self.client.api_key}"},
        )
        response.raise_for_status()
        return response

    def _process_and_upload_data(self, data_hash, data_type, data_value):
        """
        Upload one data source to the pipeline and mark it as uploaded.

        Absolute paths are treated as local files: the file is pushed to a presigned
        URL first and that URL is sent as the data value. Other values are sent
        as-is, except `qna_pair` values which are parsed from their literal form.

        :param data_hash: Hash identifying the data source
        :param data_type: Type of the data source
        :param data_value: Value of the data source
        :return: True if the data was uploaded and marked, False otherwise
        :rtype: bool
        """
        if os.path.isabs(data_value):
            presigned_url_data = self._get_presigned_url(data_type, data_value)
            presigned_url = presigned_url_data["presigned_url"]
            s3_key = presigned_url_data["s3_key"]
            if not self._upload_file_to_presigned_url(presigned_url, file_path=data_value):
                logger.error(f"File upload failed for hash: {data_hash}")
                return False
            metadata = {"file_path": data_value, "s3_key": s3_key}
            data_value = presigned_url
        else:
            if data_type == "qna_pair":
                data_value = list(ast.literal_eval(data_value))
            metadata = {}

        try:
            # The upload helper reports failure through its return value; only an
            # explicit False counts as failure so overrides returning None still work.
            if self._upload_data_to_pipeline(data_type, data_value, metadata) is False:
                return False
            self._mark_data_as_uploaded(data_hash)
            return True
        except Exception:
            print(f"❌ Error occurred during data upload for hash {data_hash}!")
            return False

    def _mark_data_as_uploaded(self, data_hash):
        """
        Flag the data source with the given hash (for this app) as uploaded.

        :param data_hash: Hash identifying the data source
        """
        # NOTE(review): no commit is issued here - confirm the session persists the
        # update (e.g. autocommit or a later commit elsewhere).
        query = self.db_session.query(DataSource).filter_by(hash=data_hash, app_id=self.local_id)
        query.update({"is_uploaded": 1})

    def get_data_sources(self):
        """
        List the data sources recorded for this app.

        :return: One dict per data source with the keys `data_type`, `data_value`
            and `metadata`
        :rtype: list[dict]
        """
        data_sources = self.db_session.query(DataSource).filter_by(app_id=self.local_id).all()
        return [
            {"data_type": row.type, "data_value": row.value, "metadata": row.meta_data} for row in data_sources
        ]

    def deploy(self):
        """
        Create (or find) the pipeline on the platform and upload pending data.

        Initializes the client if needed, stores the pipeline id on `self.id` and
        uploads every data source of this app that is not yet marked as uploaded.
        """
        if self.client is None:
            self._init_client()

        pipeline_data = self._create_pipeline()
        self.id = pipeline_data["id"]

        results = self.db_session.query(DataSource).filter_by(app_id=self.local_id, is_uploaded=0).all()
        if results:
            print("🛠️ Adding data to your pipeline...")
        for result in results:
            # Same row attributes as `get_data_sources` reads (`type` / `value`).
            self._process_and_upload_data(result.hash, result.type, result.value)

        # Send anonymous telemetry
        self.telemetry.capture(event_name="deploy", properties=self._telemetry_props)

    @classmethod
    def from_config(
        cls,
        config_path: Optional[str] = None,
        config: Optional[dict[str, Any]] = None,
        auto_deploy: bool = False,
        yaml_path: Optional[str] = None,
    ):
        """
        Instantiate a App object from a configuration.

        :param config_path: Path to the YAML or JSON configuration file.
        :type config_path: Optional[str]
        :param config: A dictionary containing the configuration.
        :type config: Optional[dict[str, Any]]
        :param auto_deploy: Whether to deploy the app automatically, defaults to False
        :type auto_deploy: bool, optional
        :param yaml_path: (Deprecated) Path to the YAML configuration file. Use config_path instead.
        :type yaml_path: Optional[str]
        :raises ValueError: If both `config_path` and `config` are given, or the file
            is neither YAML nor JSON
        :return: An instance of the App class.
        :rtype: App
        """
        # Backward compatibility for yaml_path
        if yaml_path and not config_path:
            config_path = yaml_path

        if config_path and config:
            raise ValueError("Please provide only one of config_path or config.")

        config_data = None

        if config_path:
            # Compare case-insensitively so e.g. ".YAML" / ".JSON" are accepted too.
            file_extension = os.path.splitext(config_path)[1].lower()
            with open(config_path, "r", encoding="UTF-8") as file:
                if file_extension in (".yaml", ".yml"):
                    config_data = yaml.safe_load(file)
                elif file_extension == ".json":
                    config_data = json.load(file)
                else:
                    raise ValueError("config_path must be a path to a YAML or JSON file.")
            # An empty document loads as None; treat it like "no config" (defaults)
            # instead of failing on `.get` below.
            if config_data is None:
                config_data = {}
        elif config and isinstance(config, dict):
            config_data = config
        else:
            logger.error(
                "Please provide either a config file path (YAML or JSON) or a config dictionary. Falling back to defaults because no config is provided.",  # noqa: E501
            )
            config_data = {}

        # Validate the config
        validate_config(config_data)

        app_config_data = config_data.get("app", {}).get("config", {})
        vector_db_config_data = config_data.get("vectordb", {})
        # "embedder" is accepted as a fallback key for "embedding_model"
        embedding_model_config_data = config_data.get("embedding_model", config_data.get("embedder", {}))
        memory_config_data = config_data.get("memory", {})
        llm_config_data = config_data.get("llm", {})
        chunker_config_data = config_data.get("chunker", {})
        cache_config_data = config_data.get("cache", None)

        app_config = AppConfig(**app_config_data)
        memory_config = Mem0Config(**memory_config_data) if memory_config_data else None

        vector_db_provider = vector_db_config_data.get("provider", "chroma")
        vector_db = VectorDBFactory.create(vector_db_provider, vector_db_config_data.get("config", {}))

        llm = None
        if llm_config_data:
            llm_provider = llm_config_data.get("provider", "openai")
            llm = LlmFactory.create(llm_provider, llm_config_data.get("config", {}))

        embedding_model_provider = embedding_model_config_data.get("provider", "openai")
        embedding_model = EmbedderFactory.create(
            embedding_model_provider, embedding_model_config_data.get("config", {})
        )

        cache_config = CacheConfig.from_config(cache_config_data) if cache_config_data is not None else None

        return cls(
            config=app_config,
            llm=llm,
            db=vector_db,
            embedding_model=embedding_model,
            config_data=config_data,
            auto_deploy=auto_deploy,
            chunker=chunker_config_data,
            cache_config=cache_config,
            memory_config=memory_config,
        )

    def _eval(self, dataset: list[EvalData], metric: Union[BaseMetric, str]):
        """
        Evaluate the app on a dataset for a given metric.

        :param dataset: Data points to evaluate
        :type dataset: list[EvalData]
        :param metric: A built-in metric name or a `BaseMetric` instance. A metric whose
            name matches a built-in one is evaluated with a fresh built-in instance.
        :type metric: Union[BaseMetric, str]
        :raises ValueError: If `metric` is a string that is not a built-in metric name
        :return: Whatever the metric's `evaluate` returns
        """
        metric_str = metric.name if isinstance(metric, BaseMetric) else metric
        eval_class_map = {
            EvalMetric.CONTEXT_RELEVANCY.value: ContextRelevance,
            EvalMetric.ANSWER_RELEVANCY.value: AnswerRelevance,
            EvalMetric.GROUNDEDNESS.value: Groundedness,
        }

        if metric_str in eval_class_map:
            return eval_class_map[metric_str]().evaluate(dataset)

        # Handle the case for custom metrics
        if isinstance(metric, BaseMetric):
            return metric.evaluate(dataset)
        raise ValueError(f"Invalid metric: {metric}")

    def evaluate(
        self,
        questions: Union[str, list[str]],
        metrics: Optional[list[Union[BaseMetric, str]]] = None,
        num_workers: int = 4,
    ):
        """
        Evaluate the app on a question.

        :param questions: A question or a list of questions to evaluate.
        :type questions: Union[str, list[str]]
        :param metrics: A list of metrics to evaluate. Defaults to all metrics.
        :type metrics: Optional[list[Union[BaseMetric, str]]]
        :param num_workers: Number of workers to use for parallel processing.
        :type num_workers: int
        :raises ValueError: If the OPENAI_API_KEY environment variable is not set
        :return: A dictionary containing the evaluation results, keyed by metric name.
        :rtype: dict
        """
        if "OPENAI_API_KEY" not in os.environ:
            raise ValueError("Please set the OPENAI_API_KEY environment variable with permission to use `gpt4` model.")

        queries, answers, contexts = [], [], []
        if isinstance(questions, list):
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                future_to_data = {executor.submit(self.query, q, citations=True): q for q in questions}
                # Results are collected in completion order; the three lists stay
                # aligned because they are appended together per future.
                for future in tqdm(
                    concurrent.futures.as_completed(future_to_data),
                    total=len(future_to_data),
                    desc="Getting answer and contexts for questions",
                ):
                    queries.append(future_to_data[future])
                    answer, context = future.result()
                    answers.append(answer)
                    contexts.append([item[0] for item in context])
        else:
            answer, context = self.query(questions, citations=True)
            queries = [questions]
            answers = [answer]
            contexts = [[item[0] for item in context]]

        metrics = metrics or [
            EvalMetric.CONTEXT_RELEVANCY.value,
            EvalMetric.ANSWER_RELEVANCY.value,
            EvalMetric.GROUNDEDNESS.value,
        ]

        logger.info(f"Collecting data from {len(queries)} questions for evaluation...")
        dataset = [EvalData(question=q, answer=a, contexts=c) for q, a, c in zip(queries, answers, contexts)]

        logger.info(f"Evaluating {len(dataset)} data points...")
        result = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            future_to_metric = {executor.submit(self._eval, dataset, metric): metric for metric in metrics}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_metric),
                total=len(future_to_metric),
                desc="Evaluating metrics",
            ):
                metric = future_to_metric[future]
                metric_name = metric.name if isinstance(metric, BaseMetric) else metric
                result[metric_name] = future.result()

        if self.config.collect_metrics:
            metrics_names = [metric.name if isinstance(metric, BaseMetric) else metric for metric in metrics]
            # Build a fresh dict so "metrics" does not leak into the shared
            # `_telemetry_props` used by every other telemetry event.
            telemetry_props = {**self._telemetry_props, "metrics": metrics_names}
            self.telemetry.capture(event_name="evaluate", properties=telemetry_props)

        return result
| #518 |