my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	import hashlib
#2	import logging
#3	from urllib.parse import urljoin, urlparse
#4
#5	import requests
#6
#7	try:
#8	from bs4 import BeautifulSoup
#9	except ImportError:
#10	raise ImportError(
#11	"DocsSite requires extra dependencies. Install with `pip install beautifulsoup4==4.12.3`"
#12	) from None
#13
#14
#15	from embedchain.helpers.json_serializable import register_deserializable
#16	from embedchain.loaders.base_loader import BaseLoader
#17
#18	logger = logging.getLogger(__name__)
#19
#20
# Seconds to wait on a single HTTP request. Without a timeout, `requests`
# can block indefinitely when a host accepts the connection but never answers.
_REQUEST_TIMEOUT_SECONDS = 30


@register_deserializable
class DocsSiteLoader(BaseLoader):
    """Load the text content of a documentation site.

    Starting from one URL, the loader collects the links found under that
    URL's path, fetches every collected page on the same host, and extracts
    the readable text of each page.
    """

    def __init__(self):
        # Links discovered by the most recent crawl (absolute URLs).
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        """Discover child links reachable from ``url`` and record them.

        A link counts as a child of a page when its ``href`` starts with that
        page's path and is not exactly that path. Every discovered link is
        added to ``self.visited_links``; links on the same host as ``url`` are
        then fetched in turn so that their own children are discovered too.

        The traversal is iterative (an explicit worklist) rather than
        recursive, so deep sites cannot exhaust the interpreter's recursion
        limit. The start URL itself is only recorded if some crawled page
        links to it.

        Pages answering with a non-200 status are logged and skipped. Network
        errors raised by ``requests`` propagate to the caller.
        """
        if url in self.visited_links:
            return

        start_netloc = urlparse(url).netloc
        pending = [url]
        # Pages already fetched. Kept separate from `visited_links`, which
        # holds *discovered* links: a link must be recorded as discovered
        # before it is fetched, so the two sets cannot be the same one.
        fetched = set()

        while pending:
            page_url = pending.pop()
            if page_url in fetched:
                continue
            fetched.add(page_url)

            parsed_url = urlparse(page_url)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
            current_path = parsed_url.path

            response = requests.get(page_url, timeout=_REQUEST_TIMEOUT_SECONDS)
            if response.status_code != 200:
                logger.info("Failed to fetch the website: %s", response.status_code)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            hrefs = (anchor.get("href") for anchor in soup.find_all("a", href=True))
            absolute_paths = {
                urljoin(base_url, href)
                for href in hrefs
                if href.startswith(current_path) and href != current_path
            }

            # Work out which links are new BEFORE recording them; checking
            # after the update would always find them present and the crawl
            # would never descend past the first page.
            new_links = absolute_paths - self.visited_links
            self.visited_links.update(new_links)

            # Only follow links on the starting host. Off-host links stay in
            # `visited_links` but are never fetched during the crawl.
            pending.extend(link for link in new_links if urlparse(link).netloc == start_netloc)

    def _get_all_urls(self, url):
        """Return the discovered URLs that share ``url``'s host, sorted.

        Resets ``self.visited_links`` before crawling. The result is sorted
        because set iteration order for strings is not stable across
        interpreter runs, and callers derive an identifier from this order.
        """
        self.visited_links = set()
        self._get_child_links_recursive(url)
        start_netloc = urlparse(url).netloc
        return sorted(link for link in self.visited_links if urlparse(link).netloc == start_netloc)

    @staticmethod
    def _load_data_from_url(url: str) -> list:
        """Fetch ``url`` and return its text content as a one-item list.

        Returns an empty list when the response status is not 200. Otherwise
        returns ``[{"content": <text>, "meta_data": {"url": url}}]``.
        """
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            logger.info("Failed to fetch the website: %s", response.status_code)
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Candidate containers for the main content, tried in this order;
        # the first selector that matches wins.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            # No known container matched: fall back to the page's plain text.
            content = soup.get_text()

        # Re-parse the selected fragment and drop non-content elements
        # before collecting the remaining text.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()

        content = " ".join(soup.stripped_strings)
        return [
            {
                "content": content,
                "meta_data": {"url": url},
            }
        ]

    def load_data(self, url):
        """Crawl the site at ``url`` and return the content of every page found.

        Returns a dict with ``"doc_id"`` (SHA-256 hex digest of the discovered
        URLs joined by spaces, followed by ``url``) and ``"data"`` (the list
        of per-page entries produced by ``_load_data_from_url``).
        """
        all_urls = self._get_all_urls(url)
        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))
        doc_id = hashlib.sha256((" ".join(all_urls) + url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": output,
        }
#120

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public