my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

my-project-public — gitlawb

#1	import csv
#2	import hashlib
#3	from io import StringIO
#4	from urllib.parse import urlparse
#5
#6	import requests
#7
#8	from embedchain.loaders.base_loader import BaseLoader
#9
#10
class CsvLoader(BaseLoader):
    """Load a CSV file with a header row, producing one document per data row.

    The source may be a local file path, a ``file://`` URI, or an
    ``http``/``https`` URL.
    """

    @staticmethod
    def _detect_delimiter(first_line):
        """Return the candidate delimiter occurring most often in *first_line*.

        Candidates are comma, tab, semicolon and pipe. On a tie (including
        the case where no candidate occurs at all) the earliest candidate
        wins, so a comma is the effective default.
        """
        # Each candidate must be a single character: csv readers reject
        # multi-character delimiters. The pipe is written without a
        # backslash, because "\|" is a two-character string in Python.
        delimiters = [",", "\t", ";", "|"]
        counts = {delimiter: first_line.count(delimiter) for delimiter in delimiters}
        return max(counts, key=counts.get)

    @staticmethod
    def _get_file_content(content):
        """Open *content* and return a text stream usable as a context manager.

        Args:
            content: A local file path, a ``file://`` URI, or an
                ``http``/``https`` URL.

        Returns:
            A ``StringIO`` holding the response text for HTTP(S) URLs,
            otherwise an open text file object.

        Raises:
            ValueError: If *content* has both a scheme and a network location
                but the scheme is not ``file``, ``http`` or ``https``.
            requests.HTTPError: If the HTTP(S) request returns an error status.
            OSError: If the local file cannot be opened.
        """
        url = urlparse(content)
        if all([url.scheme, url.netloc]) and url.scheme not in ["file", "http", "https"]:
            raise ValueError("Not a valid URL.")

        if url.scheme in ["http", "https"]:
            # NOTE(review): no timeout is passed, so this call can block
            # indefinitely on an unresponsive server.
            response = requests.get(content)
            response.raise_for_status()
            return StringIO(response.text)
        elif url.scheme == "file":
            # Imported locally so the block does not depend on a new
            # file-level import.
            from urllib.request import url2pathname

            # The path component of a URI is percent-encoded (e.g. "%20" for
            # a space); decode it into a real filesystem path before opening.
            path = url2pathname(url.path)
            return open(path, newline="", encoding="utf-8")
        else:
            # No recognised scheme: treat content as a regular file path.
            return open(content, newline="", encoding="utf-8")

    @staticmethod
    def load_data(content):
        """Load a csv file with headers. Each line is a document.

        Args:
            content: Path, ``file://`` URI, or HTTP(S) URL of the CSV source.

        Returns:
            A dict with two keys:
            ``"doc_id"`` - SHA-256 hex digest of *content* concatenated with
            all rendered rows joined by single spaces;
            ``"data"`` - a list with one entry per data row, each of the form
            ``{"content": <rendered row>, "meta_data": {"url": content, "row": <1-based index>}}``.
        """
        result = []
        lines = []
        with CsvLoader._get_file_content(content) as file:
            # Sniff the delimiter from the header line, then rewind so that
            # DictReader consumes that same line as the header.
            first_line = file.readline()
            delimiter = CsvLoader._detect_delimiter(first_line)
            file.seek(0)
            reader = csv.DictReader(file, delimiter=delimiter)
            for i, row in enumerate(reader):
                # Render the row as "field: value, field: value, ...".
                line = ", ".join([f"{field}: {value}" for field, value in row.items()])
                lines.append(line)
                result.append({"content": line, "meta_data": {"url": content, "row": i + 1}})
        doc_id = hashlib.sha256((content + " ".join(lines)).encode()).hexdigest()
        return {"doc_id": doc_id, "data": result}
#50

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public