my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	import hashlib
#2	import logging
#3	import time
#4	from xml.etree import ElementTree
#5
#6	import requests
#7
#8	from embedchain.helpers.json_serializable import register_deserializable
#9	from embedchain.loaders.base_loader import BaseLoader
#10	from embedchain.utils.misc import is_readable
#11
#12	logger = logging.getLogger(__name__)
#13
#14
@register_deserializable
class BeehiivLoader(BaseLoader):
    """
    This loader is used to load data from Beehiiv URLs.

    Posts are discovered through the publication's ``sitemap.xml``; every
    sitemap entry whose URL contains ``/p/`` is fetched and reduced to its
    title, description and content.
    """

    def load_data(self, url: str):
        """
        Load every post listed in a Beehiiv publication's sitemap.

        Args:
            url: Root URL of the publication, or the URL of its sitemap.
                ``/sitemap.xml`` is appended when the URL does not already
                end with ``sitemap.xml``.

        Returns:
            A dict with two keys:

            * ``doc_id``: SHA-256 hex digest of the discovered post links
              (joined by single spaces) followed by the sitemap URL.
            * ``data``: a list of ``{"content": ..., "meta_data": {"url": ...}}``
              entries, one per post that could be fetched, parsed and passed
              the ``is_readable`` check.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
            ValueError: If the sitemap request returns an HTTP error status,
                or the response body is not well-formed XML.

        Failures on individual posts (request errors, markup the parser
        rejects, unreadable text) are logged and the post is skipped, so one
        bad page does not discard the rest of the crawl. Errors other than an
        HTTP error status while fetching the sitemap itself (for example a
        connection failure) propagate unchanged.
        """
        try:
            from bs4 import BeautifulSoup
            from bs4.builder import ParserRejectedMarkup
        except ImportError:
            raise ImportError(
                "Beehiiv requires extra dependencies. Install with `pip install beautifulsoup4==4.12.3`"
            ) from None

        if not url.endswith("sitemap.xml"):
            url = url + "/sitemap.xml"

        output = []
        # we need to set this as a header to avoid 403
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 "
                "Safari/537.36"
            ),
        }
        # NOTE(review): no timeout is passed to requests here or in load_link,
        # so an unresponsive server can block indefinitely - consider adding one.
        response = requests.get(url, headers=headers)
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            raise ValueError(
                f"Failed to load {url}: {e}. Please use the root Beehiiv URL. "
                "For example, https://example.beehiiv.com"
            ) from e

        # Parse once with ElementTree purely to validate that the response is
        # well-formed XML; the parsed tree itself is not used.
        # NOTE(review): xml.etree is not hardened against maliciously
        # constructed XML (see the Python docs) - confirm sitemap sources are
        # trusted.
        try:
            ElementTree.fromstring(response.content)
        except ElementTree.ParseError as e:
            raise ValueError(
                f"Failed to parse {url}. Please use the root Beehiiv URL. "
                "For example, https://example.beehiiv.com"
            ) from e

        soup = BeautifulSoup(response.text, "xml")
        post_locs = [loc for loc in soup.find_all("loc") if "/p/" in loc.text]
        # Prefer <loc> elements that sit directly under a <url> entry; fall
        # back to every matching <loc> when none are found.
        links = [loc.text for loc in post_locs if loc.parent.name == "url"]
        if not links:
            links = [loc.text for loc in post_locs]

        doc_id = hashlib.sha256((" ".join(links) + url).encode()).hexdigest()

        def serialize_response(soup: BeautifulSoup):
            """Extract title, description and content from a parsed post page.

            Only the fields that are actually present on the page are included
            in the returned dict.
            """
            data = {}

            h1_el = soup.find("h1")
            if h1_el is not None:
                data["title"] = h1_el.text

            description_el = soup.find("meta", {"name": "description"})
            if description_el is not None:
                # A <meta name="description"> tag without a content attribute
                # must not abort the crawl with a KeyError.
                description = description_el.get("content")
                if description is not None:
                    data["description"] = description

            content_el = soup.find("div", {"id": "content-blocks"})
            if content_el is not None:
                data["content"] = content_el.text

            return data

        def load_link(link: str):
            """Fetch one post and return its serialized text, or None to skip it."""
            try:
                beehiiv_data = requests.get(link, headers=headers)
                beehiiv_data.raise_for_status()
            except requests.exceptions.RequestException as e:
                # A single unreachable post should not throw away the posts
                # that were already collected.
                logger.error("Failed to load %s: %s", link, e)
                return None

            try:
                page = BeautifulSoup(beehiiv_data.text, "html.parser")
            except ParserRejectedMarkup as e:
                logger.error("Failed to parse %s: %s", link, e)
                return None

            data = str(serialize_response(page))
            if is_readable(data):
                return data
            logger.warning("Page is not readable (too many invalid characters): %s", link)
            return None

        for link in links:
            data = load_link(link)
            if data:
                output.append({"content": data, "meta_data": {"url": link}})
            # TODO: allow users to configure this
            time.sleep(1.0)  # added to avoid rate limiting

        return {"doc_id": doc_id, "data": output}
#108

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public