repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | import logging |
| #3 | import os |
| #4 | from typing import Any, Optional |
| #5 | |
| #6 | import requests |
| #7 | |
| #8 | from embedchain.helpers.json_serializable import register_deserializable |
| #9 | from embedchain.loaders.base_loader import BaseLoader |
| #10 | from embedchain.utils.misc import clean_string |
| #11 | |
| #12 | logger = logging.getLogger(__name__) |
| #13 | |
| #14 | |
class NotionDocument:
    """
    Minimal container pairing a page's text with a dict of additional information.
    """

    def __init__(self, text: str, extra_info: dict[str, Any]):
        # Both values are stored as given; the dict is kept by reference, not copied.
        self.text, self.extra_info = text, extra_info
| #23 | |
| #24 | |
class NotionPageLoader:
    """
    Notion Page Loader.
    Reads a set of Notion pages.

    Text is collected by walking each page's block tree through the
    "retrieve block children" endpoint, following pagination cursors and
    recursing into blocks that report children.
    """

    BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, integration_token: Optional[str] = None) -> None:
        """Initialize with Notion integration token.

        Args:
            integration_token: Notion integration token. When None, the
                `NOTION_INTEGRATION_TOKEN` environment variable is used.

        Raises:
            ValueError: If no token is passed and the environment variable is unset.
        """
        if integration_token is None:
            integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
            if integration_token is None:
                raise ValueError(
                    "Must specify `integration_token` or set environment variable `NOTION_INTEGRATION_TOKEN`."
                )
        self.token = integration_token
        self.headers = {
            "Authorization": "Bearer " + self.token,
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }

    def _fetch_children(self, block_id: str, start_cursor: Optional[str] = None) -> dict[str, Any]:
        """Fetch one page of children of `block_id` and return the decoded JSON body.

        Args:
            block_id: Id of the block (or page) whose children are requested.
            start_cursor: Pagination cursor from a previous response, or None
                for the first page.

        Raises:
            requests.HTTPError: If Notion answers with an error status.
        """
        # The cursor is a query parameter of the SAME block's endpoint; it must
        # never replace the block id in the URL.
        params = None if start_cursor is None else {"start_cursor": start_cursor}
        res = requests.get(
            self.BLOCK_CHILD_URL_TMPL.format(block_id=block_id),
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail with the HTTP status instead of a later KeyError on "results".
        res.raise_for_status()
        return res.json()

    def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
        """Read a block from Notion.

        Args:
            block_id: Id of the block (or page) whose children are read.
            num_tabs: Nesting depth; every text fragment found at this level
                is prefixed with this many tab characters.

        Returns:
            One newline-joined entry per child block. Each entry consists of
            the block's rich-text fragments (one per line) followed by the
            recursively read text of its own children, if any.
        """
        prefix = "\t" * num_tabs
        result_lines_arr = []
        cursor = None
        while True:
            data = self._fetch_children(block_id, start_cursor=cursor)

            for result in data["results"]:
                # The payload of a block lives under a key named after its type.
                result_obj = result[result["type"]]

                cur_result_text_arr = []
                if "rich_text" in result_obj:
                    # Only fragments carrying a "text" object contribute content.
                    cur_result_text_arr.extend(
                        prefix + rich_text["text"]["content"]
                        for rich_text in result_obj["rich_text"]
                        if "text" in rich_text
                    )

                if result["has_children"]:
                    cur_result_text_arr.append(self._read_block(result["id"], num_tabs=num_tabs + 1))

                result_lines_arr.append("\n".join(cur_result_text_arr))

            cursor = data.get("next_cursor")
            if cursor is None:
                break

        return "\n".join(result_lines_arr)

    def load_data(self, page_ids: list[str]) -> "list[NotionDocument]":
        """Load data from the given list of page IDs.

        Args:
            page_ids: Notion page ids to read.

        Returns:
            One NotionDocument per page id, in input order, whose `extra_info`
            holds the originating `page_id`.
        """
        return [
            NotionDocument(text=self._read_block(page_id), extra_info={"page_id": page_id})
            for page_id in page_ids
        ]
| #94 | |
| #95 | |
@register_deserializable
class NotionLoader(BaseLoader):
    """Loader that reads one Notion page and returns its cleaned text as a document payload."""

    def load_data(self, source):
        """Load data from a Notion URL.

        Args:
            source: Notion page URL (or bare page id). The page id is taken
                from the last 32 characters of the URL path once any query
                string, fragment, trailing slash and dashes are removed.

        Returns:
            A dict with `doc_id` (SHA-256 hex digest of the cleaned text
            concatenated with `source`) and `data`, a one-element list holding
            the cleaned `content` and a `meta_data` dict with a
            `notion-<page id>` url.
        """
        # Shared links commonly end in "?pvs=4" or "#<block>"; slicing the raw
        # string would pick those characters up instead of the page id.
        location = source.split("?", 1)[0].split("#", 1)[0].rstrip("/")
        # Dropping dashes lets both "Title-<32 hex>" slugs and dashed UUIDs
        # resolve to the same 32-character id.
        page_id = location.replace("-", "")[-32:]
        formatted_id = f"{page_id[:8]}-{page_id[8:12]}-{page_id[12:16]}-{page_id[16:20]}-{page_id[20:]}"
        logger.debug("Extracted notion page id as: %s", formatted_id)

        integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
        reader = NotionPageLoader(integration_token=integration_token)
        documents = reader.load_data(page_ids=[formatted_id])

        raw_text = documents[0].text

        text = clean_string(raw_text)
        # The unmodified source stays part of the hash so existing doc ids do not change.
        doc_id = hashlib.sha256((text + source).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": [
                {
                    "content": text,
                    "meta_data": {"url": f"notion-{formatted_id}"},
                }
            ],
        }
| #122 |