repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | import logging |
| #3 | from typing import Any, Optional |
| #4 | |
| #5 | import requests |
| #6 | |
| #7 | try: |
| #8 | from bs4 import BeautifulSoup |
| #9 | except ImportError: |
| #10 | raise ImportError( |
| #11 | "Webpage requires extra dependencies. Install with `pip install beautifulsoup4==4.12.3`" |
| #12 | ) from None |
| #13 | |
| #14 | from embedchain.helpers.json_serializable import register_deserializable |
| #15 | from embedchain.loaders.base_loader import BaseLoader |
| #16 | from embedchain.utils.misc import clean_string |
| #17 | |
| #18 | logger = logging.getLogger(__name__) |
| #19 | |
| #20 | |
@register_deserializable
class WebPageLoader(BaseLoader):
    """Loader that downloads a web page and returns its cleaned text.

    All instances share a single ``requests.Session`` (``_session``); call
    ``close_session`` to release it.
    """

    # Shared session for all instances
    _session = requests.Session()

    def load_data(self, url, **kwargs: Optional[dict[str, Any]]):
        """Load data from a web page using a shared requests' session.

        Args:
            url: Address of the page to download.
            **kwargs: Optional settings. Only ``all_references`` is read; when
                truthy, every absolute ``http`` link found on the page is
                downloaded as well and its raw body is appended to the page
                body before cleaning.

        Returns:
            A dict with ``doc_id`` (SHA-256 hex digest of the cleaned content
            concatenated with ``url``) and ``data``, a one-element list holding
            the cleaned ``content`` and ``meta_data`` (``{"url": url}``).

        Raises:
            requests.HTTPError: If the main page answers with an error status
                (via ``raise_for_status``). Failures on reference links are
                logged and skipped instead.
        """
        all_references = kwargs.get("all_references", False)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
        }
        response = self._session.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.content

        if all_references:
            # Links are only extracted when they are actually needed, which
            # avoids a second full parse of the page in the common case.
            pages = [data]
            for link in self.fetch_reference_links(response):
                try:
                    linked = self._session.get(link, headers=headers, timeout=30)
                    linked.raise_for_status()
                except Exception as e:
                    # Best effort: one broken reference must not abort the load.
                    # Report the link that failed, not the parent page.
                    logger.error(f"Failed to add URL {link}: {e}")
                    continue
                pages.append(linked.content)
            data = b"".join(pages)

        content = self._get_clean_content(data, url)
        metadata = {"url": url}
        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": [
                {
                    "content": content,
                    "meta_data": metadata,
                }
            ],
        }

    @staticmethod
    def _get_clean_content(html, url) -> str:
        """Strip boilerplate elements from ``html`` and return its text.

        Elements are removed by tag name, by ``id`` and by CSS class before the
        remaining text is extracted and passed through ``clean_string``.

        Args:
            html: Markup to parse (``load_data`` passes the raw response bytes).
            url: Source address, used only in the log message.

        Returns:
            The cleaned text content.
        """
        soup = BeautifulSoup(html, "html.parser")
        original_size = len(soup.get_text())

        tags_to_exclude = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(tags_to_exclude):
            tag.decompose()

        ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
        for id_ in ids_to_exclude:
            for tag in soup.find_all(id=id_):
                tag.decompose()

        classes_to_exclude = [
            "elementor-location-header",
            "navbar-header",
            "nav",
            "header-sidebar-wrapper",
            "blog-sidebar-wrapper",
            "related-posts",
        ]
        for class_name in classes_to_exclude:
            for tag in soup.find_all(class_=class_name):
                tag.decompose()

        content = clean_string(soup.get_text())

        cleaned_size = len(content)
        # Guard against division by zero for pages without any text.
        if original_size != 0:
            logger.info(
                f"[{url}] Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)"  # noqa:E501
            )

        return content

    @classmethod
    def close_session(cls):
        """Close the ``requests.Session`` shared by all instances."""
        cls._session.close()

    def fetch_reference_links(self, response):
        """Return the absolute ``http``/``https`` links found in a response.

        Args:
            response: A ``requests`` response whose body is HTML.

        Returns:
            The ``href`` values of all ``<a>`` tags that start with ``"http"``,
            in document order, or an empty list when the status code is not 200.
        """
        if response.status_code != 200:
            logger.warning(f"Failed to retrieve the page. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        return [a["href"] for a in soup.find_all("a", href=True) if a["href"].startswith("http")]
| #127 |