my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	import ast
#2	import concurrent.futures
#3	import json
#4	import logging
#5	import os
#6	from typing import Any, Optional, Union
#7
#8	import requests
#9	import yaml
#10	from tqdm import tqdm
#11
#12	from embedchain.cache import (
#13	Config,
#14	ExactMatchEvaluation,
#15	SearchDistanceEvaluation,
#16	cache,
#17	gptcache_data_manager,
#18	gptcache_pre_function,
#19	)
#20	from embedchain.client import Client
#21	from embedchain.config import AppConfig, CacheConfig, ChunkerConfig, Mem0Config
#22	from embedchain.core.db.database import get_session
#23	from embedchain.core.db.models import DataSource
#24	from embedchain.embedchain import EmbedChain
#25	from embedchain.embedder.base import BaseEmbedder
#26	from embedchain.embedder.openai import OpenAIEmbedder
#27	from embedchain.evaluation.base import BaseMetric
#28	from embedchain.evaluation.metrics import (
#29	AnswerRelevance,
#30	ContextRelevance,
#31	Groundedness,
#32	)
#33	from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory
#34	from embedchain.helpers.json_serializable import register_deserializable
#35	from embedchain.llm.base import BaseLlm
#36	from embedchain.llm.openai import OpenAILlm
#37	from embedchain.telemetry.posthog import AnonymousTelemetry
#38	from embedchain.utils.evaluation import EvalData, EvalMetric
#39	from embedchain.utils.misc import validate_config
#40	from embedchain.vectordb.base import BaseVectorDB
#41	from embedchain.vectordb.chroma import ChromaDB
#42	from mem0 import Memory
#43
#44	logger = logging.getLogger(__name__)
#45
#46
@register_deserializable
class App(EmbedChain):
    """
    EmbedChain App lets you create a LLM powered app for your unstructured
    data by defining your chosen data source, embedding model,
    and vector database.
    """

    def __init__(
        self,
        id: Optional[str] = None,
        name: Optional[str] = None,
        config: Optional[AppConfig] = None,
        db: Optional[BaseVectorDB] = None,
        embedding_model: Optional[BaseEmbedder] = None,
        llm: Optional[BaseLlm] = None,
        config_data: Optional[dict] = None,
        auto_deploy: bool = False,
        chunker: Optional[Union[ChunkerConfig, dict]] = None,
        cache_config: Optional[CacheConfig] = None,
        memory_config: Optional[Mem0Config] = None,
        log_level: int = logging.WARN,
    ):
        """
        Initialize a new `App` instance.

        :param id: Id of an existing pipeline on the platform. When given, the client is
            initialized and the pipeline details are fetched. Defaults to None
        :type id: str, optional
        :param name: Name of the app; overrides the name taken from `config`, defaults to None
        :type name: str, optional
        :param config: Configuration for the pipeline, defaults to None
        :type config: AppConfig, optional
        :param db: The database to use for storing and retrieving embeddings, defaults to None
        :type db: BaseVectorDB, optional
        :param embedding_model: The embedding model used to calculate embeddings, defaults to None
        :type embedding_model: BaseEmbedder, optional
        :param llm: The LLM to use, defaults to None (an `OpenAILlm` is created)
        :type llm: BaseLlm, optional
        :param config_data: Config dictionary, defaults to None
        :type config_data: dict, optional
        :param auto_deploy: Whether to deploy the pipeline automatically, defaults to False
        :type auto_deploy: bool, optional
        :param chunker: Chunker settings, either a ready `ChunkerConfig` or a mapping of its
            keyword arguments, defaults to None
        :type chunker: Union[ChunkerConfig, dict], optional
        :param cache_config: When provided, the cache is initialized from it, defaults to None
        :type cache_config: CacheConfig, optional
        :param memory_config: When provided, a default `Memory()` is created. The config object
            is stored on the instance but is not passed to `Memory` here. Defaults to None
        :type memory_config: Mem0Config, optional
        :param log_level: Accepted for compatibility; not used by this constructor
        :type log_level: int, optional
        :raises Exception: If mutually exclusive arguments are combined, or if an error occurs
            while creating the pipeline
        """
        if id and config_data:
            raise Exception("Cannot provide both id and config. Please provide only one of them.")

        if id and name:
            raise Exception("Cannot provide both id and name. Please provide only one of them.")

        if name and config:
            raise Exception("Cannot provide both name and config. Please provide only one of them.")

        self.auto_deploy = auto_deploy
        # Store the dict config as an attribute to be able to send it
        self.config_data = config_data if (config_data and validate_config(config_data)) else None
        self.client = None
        # pipeline_id from the backend
        self.id = None
        # `from_config` passes a plain dict, while the signature advertises ChunkerConfig:
        # accept both instead of failing on `**chunker` for an already-built config.
        if isinstance(chunker, ChunkerConfig):
            self.chunker = chunker
        else:
            self.chunker = ChunkerConfig(**chunker) if chunker else None
        self.cache_config = cache_config
        self.memory_config = memory_config

        self.config = config or AppConfig()
        self.name = self.config.name
        self.config.id = self.local_id = "default-app-id" if self.config.id is None else self.config.id

        if id is not None:
            # Init client first since user is trying to fetch the pipeline
            # details from the platform
            self._init_client()
            pipeline_details = self._get_pipeline(id)
            self.config.id = self.local_id = pipeline_details["metadata"]["local_id"]
            self.id = id

        if name is not None:
            self.name = name

        self.embedding_model = embedding_model or OpenAIEmbedder()
        self.db = db or ChromaDB()
        self.llm = llm or OpenAILlm()
        self._init_db()

        # Session for the metadata db
        self.db_session = get_session()

        # If cache_config is provided, initializing the cache ...
        if self.cache_config is not None:
            self._init_cache()

        # If memory_config is provided, initializing the memory ...
        self.mem0_memory = None
        if self.memory_config is not None:
            self.mem0_memory = Memory()

        # Send anonymous telemetry
        self._telemetry_props = {"class": self.__class__.__name__}
        self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
        self.telemetry.capture(event_name="init", properties=self._telemetry_props)

        self.user_asks = []
        if self.auto_deploy:
            self.deploy()

    def _init_db(self):
        """
        Initialize the database.

        Hands the embedding model to the vector database, initializes it and
        (re)applies the collection name from the database's own config.
        """
        self.db._set_embedder(self.embedding_model)
        self.db._initialize()
        self.db.set_collection_name(self.db.config.collection_name)

    def _init_cache(self):
        """
        Initialize the cache from `self.cache_config`.

        Uses exact-match evaluation when the configured strategy is "exact",
        otherwise a distance-based evaluation built from the configured
        `max_distance` and `positive` values.
        """
        similarity_config = self.cache_config.similarity_eval_config
        if similarity_config.strategy == "exact":
            similarity_eval_func = ExactMatchEvaluation()
        else:
            similarity_eval_func = SearchDistanceEvaluation(
                max_distance=similarity_config.max_distance,
                positive=similarity_config.positive,
            )

        cache.init(
            pre_embedding_func=gptcache_pre_function,
            embedding_func=self.embedding_model.to_embeddings,
            data_manager=gptcache_data_manager(vector_dimension=self.embedding_model.vector_dimension),
            similarity_evaluation=similarity_eval_func,
            config=Config(**self.cache_config.init_config.as_dict()),
        )

    def _init_client(self):
        """
        Initialize the client.

        Reuses the API key from the loaded client config when one is present;
        otherwise prompts for it on stdin.
        """
        config = Client.load_config()
        if config.get("api_key"):
            self.client = Client()
        else:
            api_key = input(
                "🔑 Enter your Embedchain API key. You can find the API key at https://app.embedchain.ai/settings/keys/ \n"  # noqa: E501
            )
            self.client = Client(api_key=api_key)

    def _get_pipeline(self, id):
        """
        Get existing pipeline

        :param id: Id of the pipeline on the platform
        :return: The decoded JSON response describing the pipeline
        :raises Exception: If the platform answers with 404
        :raises requests.HTTPError: For any other 4xx/5xx response
        """
        print("🛠️ Fetching pipeline details from the platform...")
        url = f"{self.client.host}/api/v1/pipelines/{id}/cli/"
        r = requests.get(
            url,
            headers={"Authorization": f"Token {self.client.api_key}"},
        )
        if r.status_code == 404:
            raise Exception(f"❌ Pipeline with id {id} not found!")
        # Surface auth/server errors as an HTTP error rather than a confusing
        # KeyError/JSON error when reading the body below.
        r.raise_for_status()

        pipeline_details = r.json()
        print(
            f"🎉 Pipeline loaded successfully! Pipeline url: https://app.embedchain.ai/pipelines/{pipeline_details['id']}\n"  # noqa: E501
        )
        return pipeline_details

    def _create_pipeline(self):
        """
        Create a pipeline on the platform.

        :return: The decoded JSON response of the create call
        :raises Exception: If the response status is neither 200 nor 201
        """
        print("🛠️ Creating pipeline on the platform...")
        # self.config_data is a dict. Pass it inside the key 'yaml_config' to the backend
        payload = {
            "yaml_config": json.dumps(self.config_data),
            "name": self.name,
            "local_id": self.local_id,
        }
        url = f"{self.client.host}/api/v1/pipelines/cli/create/"
        r = requests.post(
            url,
            json=payload,
            headers={"Authorization": f"Token {self.client.api_key}"},
        )
        if r.status_code not in [200, 201]:
            raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}")

        pipeline_data = r.json()
        if r.status_code == 200:
            print(
                f"🎉🎉🎉 Existing pipeline found! View your pipeline: https://app.embedchain.ai/pipelines/{pipeline_data['id']}\n"  # noqa: E501
            )
        else:
            # Only 201 can reach this branch because of the status check above.
            print(
                f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{pipeline_data['id']}\n"  # noqa: E501
            )
        return pipeline_data

    def _get_presigned_url(self, data_type, data_value):
        """
        Request a presigned upload URL for a data item of the current pipeline.

        :return: The decoded JSON response (callers read "presigned_url" and "s3_key")
        :raises requests.HTTPError: If the platform answers with an error status
        """
        payload = {"data_type": data_type, "data_value": data_value}
        # Same authenticated POST + raise_for_status as every other pipeline call.
        response = self._send_api_request(f"/api/v1/pipelines/{self.id}/cli/presigned_url/", payload)
        return response.json()

    def _upload_file_to_presigned_url(self, presigned_url, file_path):
        """
        PUT the content of a local file to a presigned URL.

        :return: True if the upload answered with status 200, False on any error
            (errors are logged and reported on stdout, never raised)
        """
        try:
            with open(file_path, "rb") as file:
                response = requests.put(presigned_url, data=file)
                response.raise_for_status()
                return response.status_code == 200
        except Exception as e:
            logger.exception(f"Error occurred during file upload: {str(e)}")
            print("❌ Error occurred during file upload!")
            return False

    def _upload_data_to_pipeline(self, data_type, data_value, metadata=None):
        """
        Add one data item to the pipeline on the platform.

        Failures are reported on stdout instead of being raised.

        :param metadata: Optional metadata sent with the item; its "file_path" entry, when
            present, is shown to the user instead of `data_value`
        :return: True if the item was added, False if the request failed
        """
        payload = {
            "data_type": data_type,
            "data_value": data_value,
            "metadata": metadata,
        }
        try:
            self._send_api_request(f"/api/v1/pipelines/{self.id}/cli/add/", payload)
        except Exception as e:
            print(f"❌ Error occurred during data upload for type {data_type}!. Error: {str(e)}")
            return False

        # print the local file path if user tries to upload a local file
        # (`metadata` may be None, so guard before looking it up)
        printed_value = (metadata or {}).get("file_path") or data_value
        print(f"✅ Data of type: {data_type}, value: {printed_value} added successfully.")
        return True

    def _send_api_request(self, endpoint, payload):
        """
        POST a JSON payload to a platform endpoint with the client's API token.

        :param endpoint: Path appended to the client's host
        :return: The response object
        :raises requests.HTTPError: If the platform answers with an error status
        """
        url = f"{self.client.host}{endpoint}"
        headers = {"Authorization": f"Token {self.client.api_key}"}
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()
        return response

    def _process_and_upload_data(self, data_hash, data_type, data_value):
        """
        Upload a single stored data source to the pipeline.

        Absolute paths are treated as local files: the file is pushed to a presigned URL
        and that URL is sent to the pipeline. Anything else is sent as-is, except
        "qna_pair" values which are parsed from their literal string form into a list.

        :return: True if the item was uploaded and marked as uploaded, False otherwise
        """
        if os.path.isabs(data_value):
            presigned_url_data = self._get_presigned_url(data_type, data_value)
            presigned_url = presigned_url_data["presigned_url"]
            s3_key = presigned_url_data["s3_key"]
            if not self._upload_file_to_presigned_url(presigned_url, file_path=data_value):
                logger.error(f"File upload failed for hash: {data_hash}")
                return False
            metadata = {"file_path": data_value, "s3_key": s3_key}
            data_value = presigned_url
        else:
            if data_type == "qna_pair":
                data_value = list(ast.literal_eval(data_value))
            metadata = {}

        try:
            uploaded = self._upload_data_to_pipeline(data_type, data_value, metadata)
            # The upload helper reports failure through its return value rather than by
            # raising; do not mark the row as uploaded in that case. `is False` keeps
            # overrides that still return None working as before.
            if uploaded is False:
                return False
            self._mark_data_as_uploaded(data_hash)
            return True
        except Exception:
            print(f"❌ Error occurred during data upload for hash {data_hash}!")
            return False

    def _mark_data_as_uploaded(self, data_hash):
        """
        Flag the data source with the given hash (for this app) as uploaded.
        """
        # NOTE(review): no commit is issued here; whether the update is persisted depends
        # on how `get_session()` configures the session — confirm.
        self.db_session.query(DataSource).filter_by(hash=data_hash, app_id=self.local_id).update({"is_uploaded": 1})

    def get_data_sources(self):
        """
        List the data sources stored for this app.

        :return: One dict per data source with "data_type", "data_value" and "metadata" keys
        :rtype: list[dict]
        """
        data_sources = self.db_session.query(DataSource).filter_by(app_id=self.local_id).all()
        return [
            {"data_type": row.type, "data_value": row.value, "metadata": row.meta_data}
            for row in data_sources
        ]

    def deploy(self):
        """
        Create (or find) the pipeline on the platform and upload all data sources of
        this app that are not yet marked as uploaded.
        """
        if self.client is None:
            self._init_client()

        pipeline_data = self._create_pipeline()
        self.id = pipeline_data["id"]

        results = self.db_session.query(DataSource).filter_by(app_id=self.local_id, is_uploaded=0).all()
        if results:
            print("🛠️ Adding data to your pipeline...")
        for result in results:
            # Same DataSource attributes that `get_data_sources` reads (`type` / `value`).
            self._process_and_upload_data(result.hash, result.type, result.value)

        # Send anonymous telemetry
        self.telemetry.capture(event_name="deploy", properties=self._telemetry_props)

    @classmethod
    def from_config(
        cls,
        config_path: Optional[str] = None,
        config: Optional[dict[str, Any]] = None,
        auto_deploy: bool = False,
        yaml_path: Optional[str] = None,
    ):
        """
        Instantiate a App object from a configuration.

        :param config_path: Path to the YAML or JSON configuration file.
        :type config_path: Optional[str]
        :param config: A dictionary containing the configuration.
        :type config: Optional[dict[str, Any]]
        :param auto_deploy: Whether to deploy the app automatically, defaults to False
        :type auto_deploy: bool, optional
        :param yaml_path: (Deprecated) Path to the YAML configuration file. Use config_path instead.
        :type yaml_path: Optional[str]
        :raises ValueError: If both `config_path` and `config` are given, or if the file
            extension is not .yaml, .yml or .json
        :return: An instance of the App class.
        :rtype: App
        """
        # Backward compatibility for yaml_path
        if yaml_path and not config_path:
            config_path = yaml_path

        if config_path and config:
            raise ValueError("Please provide only one of config_path or config.")

        config_data = None

        if config_path:
            # Compare case-insensitively so e.g. "app.YAML" or "app.JSON" are accepted too.
            file_extension = os.path.splitext(config_path)[1].lower()
            with open(config_path, "r", encoding="UTF-8") as file:
                if file_extension in [".yaml", ".yml"]:
                    config_data = yaml.safe_load(file)
                elif file_extension == ".json":
                    config_data = json.load(file)
                else:
                    raise ValueError("config_path must be a path to a YAML or JSON file.")
        elif config and isinstance(config, dict):
            config_data = config
        else:
            logger.error(
                "Please provide either a config file path (YAML or JSON) or a config dictionary. Falling back to defaults because no config is provided.",  # noqa: E501
            )
            config_data = {}

        # Validate the config
        validate_config(config_data)

        app_config_data = config_data.get("app", {}).get("config", {})
        vector_db_config_data = config_data.get("vectordb", {})
        # "embedder" is accepted as an alternative key for the embedding model section.
        embedding_model_config_data = config_data.get("embedding_model", config_data.get("embedder", {}))
        memory_config_data = config_data.get("memory", {})
        llm_config_data = config_data.get("llm", {})
        chunker_config_data = config_data.get("chunker", {})
        cache_config_data = config_data.get("cache", None)

        app_config = AppConfig(**app_config_data)
        memory_config = Mem0Config(**memory_config_data) if memory_config_data else None

        vector_db_provider = vector_db_config_data.get("provider", "chroma")
        vector_db = VectorDBFactory.create(vector_db_provider, vector_db_config_data.get("config", {}))

        if llm_config_data:
            llm_provider = llm_config_data.get("provider", "openai")
            llm = LlmFactory.create(llm_provider, llm_config_data.get("config", {}))
        else:
            # Let the constructor pick its default LLM.
            llm = None

        embedding_model_provider = embedding_model_config_data.get("provider", "openai")
        embedding_model = EmbedderFactory.create(
            embedding_model_provider, embedding_model_config_data.get("config", {})
        )

        if cache_config_data is not None:
            cache_config = CacheConfig.from_config(cache_config_data)
        else:
            cache_config = None

        return cls(
            config=app_config,
            llm=llm,
            db=vector_db,
            embedding_model=embedding_model,
            config_data=config_data,
            auto_deploy=auto_deploy,
            chunker=chunker_config_data,
            cache_config=cache_config,
            memory_config=memory_config,
        )

    def _eval(self, dataset: list[EvalData], metric: Union[BaseMetric, str]):
        """
        Evaluate the app on a dataset for a given metric.

        :param dataset: The data points to evaluate
        :param metric: A metric instance, or the name of one of the built-in metrics
        :raises ValueError: If `metric` is a string that names no built-in metric
        :return: Whatever the metric's `evaluate` returns for the dataset
        """
        # A metric instance is used as passed, so any configuration it carries is
        # honoured even when its name matches a built-in metric.
        if isinstance(metric, BaseMetric):
            return metric.evaluate(dataset)

        eval_class_map = {
            EvalMetric.CONTEXT_RELEVANCY.value: ContextRelevance,
            EvalMetric.ANSWER_RELEVANCY.value: AnswerRelevance,
            EvalMetric.GROUNDEDNESS.value: Groundedness,
        }
        if metric in eval_class_map:
            return eval_class_map[metric]().evaluate(dataset)

        raise ValueError(f"Invalid metric: {metric}")

    def evaluate(
        self,
        questions: Union[str, list[str]],
        metrics: Optional[list[Union[BaseMetric, str]]] = None,
        num_workers: int = 4,
    ):
        """
        Evaluate the app on a question.

        :param questions: A question or a list of questions to evaluate.
        :type questions: Union[str, list[str]]
        :param metrics: A list of metrics to evaluate. Defaults to all metrics.
        :type metrics: Optional[list[Union[BaseMetric, str]]]
        :param num_workers: Number of workers to use for parallel processing.
        :type num_workers: int
        :raises ValueError: If the OPENAI_API_KEY environment variable is not set
        :return: A dictionary containing the evaluation results, keyed by metric name.
        :rtype: dict
        """
        if "OPENAI_API_KEY" not in os.environ:
            raise ValueError("Please set the OPENAI_API_KEY environment variable with permission to use `gpt4` model.")

        queries, answers, contexts = [], [], []
        if isinstance(questions, list):
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                future_to_data = {executor.submit(self.query, q, citations=True): q for q in questions}
                # Results arrive in completion order; the three lists stay aligned with
                # each other because they are filled together per finished future.
                for future in tqdm(
                    concurrent.futures.as_completed(future_to_data),
                    total=len(future_to_data),
                    desc="Getting answer and contexts for questions",
                ):
                    question = future_to_data[future]
                    queries.append(question)
                    answer, context = future.result()
                    answers.append(answer)
                    # Keep only the first element of every returned context entry.
                    contexts.append([entry[0] for entry in context])
        else:
            answer, context = self.query(questions, citations=True)
            queries = [questions]
            answers = [answer]
            contexts = [[entry[0] for entry in context]]

        metrics = metrics or [
            EvalMetric.CONTEXT_RELEVANCY.value,
            EvalMetric.ANSWER_RELEVANCY.value,
            EvalMetric.GROUNDEDNESS.value,
        ]

        logger.info(f"Collecting data from {len(queries)} questions for evaluation...")
        dataset = [EvalData(question=q, answer=a, contexts=c) for q, a, c in zip(queries, answers, contexts)]

        logger.info(f"Evaluating {len(dataset)} data points...")
        result = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            future_to_metric = {executor.submit(self._eval, dataset, metric): metric for metric in metrics}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_metric),
                total=len(future_to_metric),
                desc="Evaluating metrics",
            ):
                metric = future_to_metric[future]
                metric_name = metric.name if isinstance(metric, BaseMetric) else metric
                result[metric_name] = future.result()

        if self.config.collect_metrics:
            metrics_names = [m.name if isinstance(m, BaseMetric) else m for m in metrics]
            # Build a fresh dict so the "metrics" entry does not leak into the shared
            # properties sent with later telemetry events.
            telemetry_props = {**self._telemetry_props, "metrics": metrics_names}
            self.telemetry.capture(event_name="evaluate", properties=telemetry_props)

        return result

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public