repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources (16d ago)
| #1 | import hashlib |
| #2 | import logging |
| #3 | import time |
| #4 | from xml.etree import ElementTree |
| #5 | |
| #6 | import requests |
| #7 | |
| #8 | from embedchain.helpers.json_serializable import register_deserializable |
| #9 | from embedchain.loaders.base_loader import BaseLoader |
| #10 | from embedchain.utils.misc import is_readable |
| #11 | |
| #12 | logger = logging.getLogger(__name__) |
| #13 | |
| #14 | |
@register_deserializable
class BeehiivLoader(BaseLoader):
    """
    This loader is used to load data from Beehiiv URLs.

    It fetches the publication's ``sitemap.xml``, keeps every ``<loc>`` entry whose URL contains
    ``/p/``, downloads each of those pages and extracts its title, meta description and the text
    of the ``content-blocks`` element.
    """

    def load_data(self, url: str):
        """
        Load all posts listed in a Beehiiv sitemap.

        Args:
            url: Root URL of the publication, or the URL of its ``sitemap.xml``.

        Returns:
            A dict with two keys:
            ``doc_id`` - SHA-256 hex digest of the space-joined post links followed by the sitemap URL;
            ``data`` - a list of ``{"content": <str>, "meta_data": {"url": <post url>}}`` entries,
            one per post that could be parsed and passed the ``is_readable`` check.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
            ValueError: If the sitemap request returns an HTTP error status or the response is not
                well-formed XML.
            requests.exceptions.RequestException: Connection errors and timeouts, and HTTP error
                statuses of individual post pages, are not caught here and propagate to the caller.
        """
        try:
            from bs4 import BeautifulSoup
            from bs4.builder import ParserRejectedMarkup
        except ImportError:
            raise ImportError(
                "Beehiiv requires extra dependencies. Install with `pip install beautifulsoup4==4.12.3`"
            ) from None

        # Strip trailing slashes first so that "https://host/" does not become "https://host//sitemap.xml".
        if not url.endswith("sitemap.xml"):
            url = url.rstrip("/") + "/sitemap.xml"

        output = []
        # we need to set this as a header to avoid 403
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 "
                "Safari/537.36"
            ),
        }
        # Without a timeout, requests waits forever on a server that stops responding.
        timeout = 30  # seconds

        response = requests.get(url, headers=headers, timeout=timeout)
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            raise ValueError(
                f"Failed to load {url}: {e}. Please use the root Beehiiv URL. "
                "For example, https://example.beehiiv.com"
            ) from e

        # The parse result is discarded: this is only a well-formedness check on the sitemap.
        # NOTE(review): xml.etree is documented as not hardened against maliciously constructed
        # XML; consider defusedxml if sitemaps from untrusted hosts are a concern.
        try:
            ElementTree.fromstring(response.content)
        except ElementTree.ParseError as e:
            raise ValueError(
                f"Failed to parse {url}. Please use the root Beehiiv URL. "
                "For example, https://example.beehiiv.com"
            ) from e

        sitemap = BeautifulSoup(response.text, "xml")
        locs = sitemap.find_all("loc")
        # Prefer <loc> elements that sit directly inside a <url> entry; if none match,
        # fall back to any <loc> whose URL looks like a post.
        links = [loc.text for loc in locs if loc.parent.name == "url" and "/p/" in loc.text]
        if not links:
            links = [loc.text for loc in locs if "/p/" in loc.text]

        doc_id = hashlib.sha256((" ".join(links) + url).encode()).hexdigest()

        def serialize_response(page: BeautifulSoup):
            """Extract title, description and content from a parsed post page; absent parts are omitted."""
            data = {}

            h1_el = page.find("h1")
            if h1_el is not None:
                data["title"] = h1_el.text

            description_el = page.find("meta", {"name": "description"})
            # A description meta tag without a ``content`` attribute is skipped instead of raising KeyError.
            if description_el is not None and description_el.get("content") is not None:
                data["description"] = description_el["content"]

            content_el = page.find("div", {"id": "content-blocks"})
            if content_el is not None:
                data["content"] = content_el.text

            return data

        def load_link(link: str):
            """Download one post and return its serialized text, or None if it is unparsable or unreadable."""
            try:
                beehiiv_data = requests.get(link, headers=headers, timeout=timeout)
                beehiiv_data.raise_for_status()

                page = BeautifulSoup(beehiiv_data.text, "html.parser")
                data = str(serialize_response(page))
                if is_readable(data):
                    return data
                logger.warning("Page is not readable (too many invalid characters): %s", link)
            except ParserRejectedMarkup as e:
                logger.error("Failed to parse %s: %s", link, e)
            return None

        for link in links:
            data = load_link(link)
            if data:
                output.append({"content": data, "meta_data": {"url": link}})
            # TODO: allow users to configure this
            time.sleep(1.0)  # added to avoid rate limiting

        return {"doc_id": doc_id, "data": output}
| #108 |