my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

my-project-public — gitlawb

#1	import concurrent.futures
#2	import hashlib
#3	import logging
#4	import os
#5	from urllib.parse import urlparse
#6
#7	import requests
#8	from tqdm import tqdm
#9
#10	try:
#11	from bs4 import BeautifulSoup
#12	from bs4.builder import ParserRejectedMarkup
#13	except ImportError:
#14	raise ImportError(
#15	"Sitemap requires extra dependencies. Install with `pip install beautifulsoup4==4.12.3`"
#16	) from None
#17
#18	from embedchain.helpers.json_serializable import register_deserializable
#19	from embedchain.loaders.base_loader import BaseLoader
#20	from embedchain.loaders.web_page import WebPageLoader
#21
#22	logger = logging.getLogger(__name__)
#23
#24
@register_deserializable
class SitemapLoader(BaseLoader):
    """
    Loader that reads a sitemap and loads the content of every page it lists.

    The sitemap can be supplied as an http(s) URL or as a path to a local
    file. Each URL found in the sitemap is loaded with WebPageLoader and the
    per-page results are combined into a single output.
    """

    def load_data(self, sitemap_source):
        """
        Load the content of all pages referenced by a sitemap.

        :param sitemap_source: http(s) URL of the sitemap, or path to a local
            sitemap file.
        :return: dict with "doc_id" (SHA-256 hex digest of the space-joined
            links followed by the sitemap source) and "data" (the combined
            "data" entries returned by WebPageLoader for each page, in the
            order the page loads complete). Returns None when fetching the
            sitemap URL fails; the error is logged.
        :raises ValueError: if sitemap_source is neither an http(s) URL nor
            an existing file.
        """
        output = []
        web_page_loader = WebPageLoader()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
        }
        # requests applies no timeout by default; without one an unresponsive
        # server would block this call indefinitely.
        request_timeout = 30  # seconds

        if urlparse(sitemap_source).scheme in ("http", "https"):
            try:
                response = requests.get(sitemap_source, headers=headers, timeout=request_timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "xml")
            except requests.RequestException as e:
                # Timeouts are RequestException subclasses, so they land here too.
                logger.error(f"Error fetching sitemap from URL: {e}")
                return None
        elif os.path.isfile(sitemap_source):
            # Read bytes so the parser can detect the document's encoding
            # instead of decoding with the platform's locale default.
            with open(sitemap_source, "rb") as file:
                soup = BeautifulSoup(file, "xml")
        else:
            raise ValueError("Invalid sitemap source. Please provide a valid URL or local file path.")

        # Prefer <loc> entries that sit directly under <url>; if there are
        # none (e.g. a sitemap index), fall back to every <loc> in the document.
        links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
        if not links:
            links = [link.text for link in soup.find_all("loc")]

        doc_id = hashlib.sha256((" ".join(links) + sitemap_source).encode()).hexdigest()

        def load_web_page(link):
            """Load one page; return its "data" entry, or None if the markup is rejected."""
            try:
                loader_data = web_page_loader.load_data(link)
                return loader_data.get("data")
            except ParserRejectedMarkup as e:
                logger.error(f"Failed to parse {link}: {e}")
                return None

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {executor.submit(load_web_page, link): link for link in links}
            for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
                link = future_to_link[future]
                try:
                    data = future.result()
                    if data:
                        output.extend(data)
                except Exception as e:
                    # Best effort: one failing page must not abort the whole sitemap.
                    logger.error(f"Error loading page {link}: {e}")

        return {"doc_id": doc_id, "data": output}
#80

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public