repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | import logging |
| #3 | from urllib.parse import urljoin, urlparse |
| #4 | |
| #5 | import requests |
| #6 | |
| #7 | try: |
| #8 | from bs4 import BeautifulSoup |
| #9 | except ImportError: |
| #10 | raise ImportError( |
| #11 | "DocsSite requires extra dependencies. Install with `pip install beautifulsoup4==4.12.3`" |
| #12 | ) from None |
| #13 | |
| #14 | |
| #15 | from embedchain.helpers.json_serializable import register_deserializable |
| #16 | from embedchain.loaders.base_loader import BaseLoader |
| #17 | |
| #18 | logger = logging.getLogger(__name__) |
| #19 | |
| #20 | |
@register_deserializable
class DocsSiteLoader(BaseLoader):
    """Crawl a documentation site and extract the text of each page found.

    Starting from a root URL, the loader follows anchors whose ``href``
    starts with the path of the page they appear on, then downloads every
    discovered page on the root's host and reduces it to plain text.
    """

    def __init__(self):
        # Absolute URLs discovered by the most recent crawl.
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        """Collect links reachable from ``url`` into ``self.visited_links``.

        A link counts as a child of a page when its ``href`` starts with that
        page's URL path and is not equal to it. Each newly discovered link on
        the starting host is fetched in turn and scanned the same way.

        The traversal uses an explicit work list rather than call recursion,
        so the depth of the site is not limited by the interpreter's
        recursion limit.

        Args:
            url: Absolute URL of the page to start crawling from.
        """
        if url in self.visited_links:
            return

        # Only pages on the starting host are fetched; links to other hosts
        # are still recorded (as before) but never crawled.
        root_netloc = urlparse(url).netloc
        pending = [url]

        while pending:
            page_url = pending.pop()
            parsed_url = urlparse(page_url)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
            current_path = parsed_url.path

            response = requests.get(page_url)
            if response.status_code != 200:
                logger.info(f"Failed to fetch the website: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            hrefs = (anchor.get("href") for anchor in soup.find_all("a", href=True))
            absolute_paths = {
                urljoin(base_url, href)
                for href in hrefs
                if href.startswith(current_path) and href != current_path
            }

            # Work out which links are new BEFORE marking them visited;
            # marking first would leave nothing to descend into.
            new_links = absolute_paths - self.visited_links
            self.visited_links.update(new_links)
            pending.extend(link for link in new_links if urlparse(link).netloc == root_netloc)

    def _get_all_urls(self, url):
        """Return every discovered URL on the same host as ``url``.

        The result is sorted so that repeated crawls of an unchanged site
        yield the URLs in the same order (set iteration order is not stable
        between interpreter runs).

        Args:
            url: Absolute URL of the documentation root.

        Returns:
            Sorted list of absolute URLs sharing ``url``'s network location.
        """
        self.visited_links = set()
        self._get_child_links_recursive(url)
        site_netloc = urlparse(url).netloc
        return sorted(link for link in self.visited_links if urlparse(link).netloc == site_netloc)

    @staticmethod
    def _load_data_from_url(url: str) -> list:
        """Download ``url`` and extract its main textual content.

        Args:
            url: Absolute URL of the page to load.

        Returns:
            A one-element list holding a dict with ``content`` (the page text)
            and ``meta_data`` (the source URL), or an empty list when the
            response status is not 200.
        """
        response = requests.get(url)
        if response.status_code != 200:
            logger.info(f"Failed to fetch the website: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Candidate containers for the main content, tried in this order.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            # No known container matched: fall back to the whole page's text.
            content = soup.get_text()

        # Re-parse the selected fragment so non-content tags can be removed.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()

        content = " ".join(soup.stripped_strings)
        return [
            {
                "content": content,
                "meta_data": {"url": url},
            }
        ]

    def load_data(self, url):
        """Crawl the site rooted at ``url`` and load every discovered page.

        Args:
            url: Absolute URL of the documentation root.

        Returns:
            A dict with ``doc_id`` (SHA-256 hex digest of the space-joined
            discovered URLs followed by ``url``) and ``data`` (the list of
            per-page records produced by ``_load_data_from_url``).
        """
        all_urls = self._get_all_urls(url)
        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))
        doc_id = hashlib.sha256((" ".join(all_urls) + url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": output,
        }
| #120 |