repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources16d ago| #1 | import hashlib |
| #2 | import json |
| #3 | import os |
| #4 | import re |
| #5 | from typing import Union |
| #6 | |
| #7 | import requests |
| #8 | |
| #9 | from embedchain.loaders.base_loader import BaseLoader |
| #10 | from embedchain.utils.misc import clean_string, is_valid_json_string |
| #11 | |
| #12 | |
class JSONReader:
    """Turn a JSON document into a single block of line-oriented text."""

    def __init__(self) -> None:
        """Create a reader; no state is kept on the instance."""

    @staticmethod
    def load_data(json_data: Union[dict, str]) -> list[str]:
        """Flatten a JSON structure into its meaningful text lines.

        The data is re-serialised with ``indent=0`` so that every key/value
        and every array element lands on its own line. Lines made up solely
        of structural punctuation (braces, brackets, commas) or nothing at
        all are then discarded.

        Args:
            json_data (Union[dict, str]): Already-parsed JSON data, or a
                string holding a JSON document (parsed with ``json.loads``).

        Returns:
            list[str]: A one-element list whose item is the remaining lines
                joined with newlines.
        """
        parsed = json.loads(json_data) if isinstance(json_data, str) else json_data

        # Matches lines that carry no content: empty, or only { } [ ] , characters.
        structural_only = re.compile(r"^[{}\[\],]*$")

        kept = []
        for row in json.dumps(parsed, indent=0).split("\n"):
            if structural_only.match(row) is None:
                kept.append(row)
        return ["\n".join(kept)]
| #37 | |
| #38 | |
# Regex for http(s) URLs whose final path segment ends in ".json".
# The host may be a dotted-quad address or a name built from letters, digits,
# dots and hyphens, optionally prefixed with "www." and followed by ":port".
# Written as a raw string so that "\.", "\d" and "\s" reach the regex engine
# unchanged instead of being treated as (invalid) string escape sequences.
VALID_URL_PATTERN = (
    r"^https?://(?:www\.)?(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-zA-Z0-9.-]+)(?::\d+)?/(?:[^/\s]+/)*[^/\s]+\.json$"
)
| #42 | |
| #43 | |
class JSONLoader(BaseLoader):
    """Load JSON from a local file, a ``.json`` URL, or a raw JSON string."""

    @staticmethod
    def _check_content(content):
        """Reject any input that is not a string.

        Args:
            content: The value handed to ``load_data``.

        Raises:
            ValueError: If ``content`` is not a ``str``.
        """
        if not isinstance(content, str):
            # Implicit string concatenation keeps source indentation out of
            # the message; `json.dumps` is the call that returns a string.
            raise ValueError(
                "Invalid content input. "
                "If you want to upload (list, dict, etc.), do "
                "`json.dumps(data, indent=0)` and add the stringified JSON. "
                "Check - `https://docs.embedchain.ai/data-sources/json`"
            )

    @staticmethod
    def load_data(content):
        """Load a json file. Each data point is a key value pair.

        ``content`` is resolved in this order: an existing file path, a URL
        matching ``VALID_URL_PATTERN``, then a raw JSON string.

        Args:
            content (str): File path, URL, or JSON string.

        Returns:
            dict: ``{"doc_id": <sha256 hex>, "data": [...]}`` where every
                entry of ``data`` is
                ``{"content": <text>, "meta_data": {"url": <source>}}``.
                ``<source>`` is ``content`` itself for files and URLs, and
                the SHA-256 hex digest of ``content`` for raw JSON strings.

        Raises:
            ValueError: If ``content`` is not a string, if the URL does not
                answer with status 200, or if ``content`` is none of the
                supported input forms.
        """
        JSONLoader._check_content(content)
        loader = JSONReader()

        # Seconds to wait on the remote host; without a timeout an
        # unresponsive server would block this call indefinitely.
        request_timeout = 30

        data = []
        data_content = []

        content_url_str = content

        if os.path.isfile(content):
            with open(content, "r", encoding="utf-8") as json_file:
                json_data = json.load(json_file)
        elif re.match(VALID_URL_PATTERN, content):
            response = requests.get(content, timeout=request_timeout)
            if response.status_code == 200:
                json_data = response.json()
            else:
                raise ValueError(
                    f"Loading data from the given url: {content} failed. "
                    "Make sure the url is working."
                )
        elif is_valid_json_string(content):
            # The reader parses JSON strings itself, so pass the text through.
            json_data = content
            # A raw JSON string has no location; use its digest as the source id.
            content_url_str = hashlib.sha256((content).encode("utf-8")).hexdigest()
        else:
            raise ValueError(f"Invalid content to load json data from: {content}")

        docs = loader.load_data(json_data)
        for doc in docs:
            text = doc if isinstance(doc, str) else doc["text"]
            doc_content = clean_string(text)
            data.append({"content": doc_content, "meta_data": {"url": content_url_str}})
            data_content.append(doc_content)

        # The id is derived from both the source identifier and the cleaned text.
        doc_id = hashlib.sha256((content_url_str + ", ".join(data_content)).encode()).hexdigest()
        return {"doc_id": doc_id, "data": data}
| #94 |