repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import csv |
| #2 | import hashlib |
| #3 | from io import StringIO |
| #4 | from urllib.parse import urlparse |
| #5 | |
| #6 | import requests |
| #7 | |
| #8 | from embedchain.loaders.base_loader import BaseLoader |
| #9 | |
| #10 | |
class CsvLoader(BaseLoader):
    """Loader that turns every row of a headered CSV source into one document.

    The source may be an ``http``/``https`` URL, a ``file://`` URI, or a plain
    local file path.
    """

    # Seconds to wait on a remote server. Without a timeout ``requests.get``
    # can block indefinitely when the host stops responding.
    REQUEST_TIMEOUT = 30

    @staticmethod
    def _detect_delimiter(first_line):
        """Return the candidate delimiter that occurs most often in *first_line*.

        Candidates are comma, tab, semicolon and pipe, checked in that order.
        Ties resolve to the earliest candidate, so a line containing none of
        them yields ``","``.
        """
        delimiters = [",", "\t", ";", "|"]
        counts = {delimiter: first_line.count(delimiter) for delimiter in delimiters}
        return max(counts, key=counts.get)

    @staticmethod
    def _get_file_content(content):
        """Open *content* and return a readable text stream for it.

        Args:
            content: An ``http``/``https`` URL, a ``file://`` URI, or a local
                file path.

        Returns:
            A ``StringIO`` holding the response text for remote URLs, otherwise
            a file object opened as UTF-8 text with ``newline=""``. Both can be
            used as context managers.

        Raises:
            ValueError: If *content* has both a scheme and a network location
                but the scheme is not ``file``, ``http`` or ``https``.
            requests.RequestException: If the download fails, times out, or
                the server answers with an error status.
            OSError: If the local file cannot be opened.
        """
        # Function-scope import so this block does not depend on a change to
        # the module's import section.
        from urllib.request import url2pathname

        url = urlparse(content)
        if url.scheme and url.netloc and url.scheme not in ("file", "http", "https"):
            raise ValueError("Not a valid URL.")

        if url.scheme in ("http", "https"):
            response = requests.get(content, timeout=CsvLoader.REQUEST_TIMEOUT)
            response.raise_for_status()
            return StringIO(response.text)

        if url.scheme == "file":
            # URI paths are percent-encoded ("%20" for a space) and, on Windows,
            # carry a leading slash before the drive letter. Convert to a real
            # filesystem path before opening.
            path = url2pathname(url.path)
            return open(path, newline="", encoding="utf-8")

        # Anything else is treated as a regular file path.
        return open(content, newline="", encoding="utf-8")

    @staticmethod
    def load_data(content):
        """Load a CSV source with a header row; each data row becomes a document.

        Args:
            content: Location of the CSV data, as accepted by
                ``_get_file_content``.

        Returns:
            A dict with two keys:

            * ``"doc_id"`` - SHA-256 hex digest of *content* concatenated with
              all rendered rows joined by single spaces.
            * ``"data"`` - list of ``{"content": ..., "meta_data": {"url": ...,
              "row": ...}}`` entries, one per data row, where ``content`` is the
              row rendered as ``"field: value, field: value"`` and ``row`` is
              its 1-based position after the header.
        """
        result = []
        lines = []
        with CsvLoader._get_file_content(content) as file:
            # Sniff the delimiter from the header line, then rewind so that
            # DictReader consumes the header itself.
            first_line = file.readline()
            delimiter = CsvLoader._detect_delimiter(first_line)
            file.seek(0)
            reader = csv.DictReader(file, delimiter=delimiter)
            for row_number, row in enumerate(reader, start=1):
                line = ", ".join(f"{field}: {value}" for field, value in row.items())
                lines.append(line)
                result.append({"content": line, "meta_data": {"url": content, "row": row_number}})
        doc_id = hashlib.sha256((content + " ".join(lines)).encode()).hexdigest()
        return {"doc_id": doc_id, "data": result}
| #50 |