repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | import logging |
| #3 | from pathlib import Path |
| #4 | from typing import Any, Optional |
| #5 | |
| #6 | from embedchain.config import AddConfig |
| #7 | from embedchain.data_formatter.data_formatter import DataFormatter |
| #8 | from embedchain.helpers.json_serializable import register_deserializable |
| #9 | from embedchain.loaders.base_loader import BaseLoader |
| #10 | from embedchain.loaders.text_file import TextFileLoader |
| #11 | from embedchain.utils.misc import detect_datatype |
| #12 | |
| #13 | logger = logging.getLogger(__name__) |
| #14 | |
| #15 | |
@register_deserializable
class DirectoryLoader(BaseLoader):
    """Load data from the files contained in a directory.

    Each eligible file is handed to a loader chosen by ``_predict_loader`` and
    the ``"data"`` entries returned by those loaders are concatenated.

    Config keys (all optional):
        recursive: when true (the default) sub-directories are traversed too.
        extensions: iterable of file suffixes to include. Entries may be
            written with or without the leading dot (``".txt"`` or ``"txt"``);
            a single string is treated as one suffix. ``None`` or an empty
            value means "every file".
    """

    def __init__(self, config: Optional[dict[str, Any]] = None):
        super().__init__()
        config = config or {}
        self.recursive = config.get("recursive", True)
        self.extensions = config.get("extensions", None)
        # Messages collected by _predict_loader during the latest load_data call.
        self.errors = []

    def load_data(self, path: str) -> dict[str, Any]:
        """Load every eligible file below ``path``.

        Args:
            path: Path of the directory to load.

        Returns:
            A dict with ``"doc_id"`` (SHA-256 hex digest of the loaded data
            plus the directory path) and ``"data"`` (the concatenated loader
            output).

        Raises:
            ValueError: If ``path`` is not an existing directory.
        """
        directory_path = Path(path)
        if not directory_path.is_dir():
            raise ValueError(f"Invalid path: {path}")

        # Start from a clean slate so warnings from a previous call on this
        # instance are not reported again.
        self.errors = []

        logger.info("Loading data from directory: %s", path)
        data_list = self._process_directory(directory_path)
        doc_id = hashlib.sha256((str(data_list) + str(directory_path)).encode()).hexdigest()

        for error in self.errors:
            logger.warning(error)

        return {"doc_id": doc_id, "data": data_list}

    def _normalized_extensions(self) -> Optional[tuple]:
        """Return the configured suffixes in ``Path.suffix`` form, or ``None`` for "no filter"."""
        if not self.extensions:
            return None
        # A bare string would otherwise be iterated character by character.
        raw = (self.extensions,) if isinstance(self.extensions, str) else self.extensions

        def with_dot(ext):
            # "" (files without a suffix) and already-dotted entries pass through.
            if isinstance(ext, str) and ext and not ext.startswith("."):
                return f".{ext}"
            return ext

        return tuple(with_dot(ext) for ext in raw)

    def _process_directory(self, directory_path: Path) -> list[Any]:
        """Collect the loader output of every eligible file in ``directory_path``.

        Hidden entries (any path component below ``directory_path`` starting
        with ``"."``) are skipped, as are files whose suffix is not in the
        configured extensions.
        """
        wanted = self._normalized_extensions()
        entries = directory_path.rglob("*") if self.recursive else directory_path.glob("*")

        data_list = []
        # Sorted so the result, and the doc_id hashed from it, does not depend
        # on the order in which the filesystem happens to list entries.
        for file_path in sorted(entries):
            # Don't include dotfiles, nor anything inside a dot-directory
            # (e.g. ".git/config"). Only components below the root are checked
            # so a hidden root directory can still be loaded explicitly.
            relative_parts = file_path.relative_to(directory_path).parts
            if any(part.startswith(".") for part in relative_parts):
                continue

            if file_path.is_file() and (wanted is None or file_path.suffix in wanted):
                loader = self._predict_loader(file_path)
                data_list.extend(loader.load_data(str(file_path))["data"])
            elif file_path.is_dir():
                logger.info("Loading data from directory: %s", file_path)
        return data_list

    def _predict_loader(self, file_path: Path) -> BaseLoader:
        """Pick a loader for ``file_path`` based on its detected data type.

        If detection or loader construction fails for any reason, the failure
        is recorded in ``self.errors`` and a ``TextFileLoader`` is returned
        instead.
        """
        try:
            data_type = detect_datatype(str(file_path))
            config = AddConfig()
            return DataFormatter(data_type=data_type, config=config)._get_loader(
                data_type=data_type, config=config.loader, loader=None
            )
        except Exception as e:
            # Deliberate best-effort: one undetectable file must not abort the
            # whole directory, so record the problem and fall back to plain text.
            self.errors.append(f"Error processing {file_path}: {e}")
            return TextFileLoader()
| #64 |