repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | import json |
| #3 | import logging |
| #4 | |
| #5 | try: |
| #6 | from youtube_transcript_api import YouTubeTranscriptApi |
| #7 | except ImportError: |
| #8 | raise ImportError("YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api`") |
| #9 | try: |
| #10 | from langchain_community.document_loaders import YoutubeLoader |
| #11 | from langchain_community.document_loaders.youtube import _parse_video_id |
| #12 | except ImportError: |
| #13 | raise ImportError("YouTube video requires extra dependencies. Install with `pip install pytube==15.0.0`") from None |
| #14 | from embedchain.helpers.json_serializable import register_deserializable |
| #15 | from embedchain.loaders.base_loader import BaseLoader |
| #16 | from embedchain.utils.misc import clean_string |
| #17 | |
| #18 | |
@register_deserializable
class YoutubeVideoLoader(BaseLoader):
    """Load the text content, metadata and transcript of a YouTube video."""

    def load_data(self, url):
        """Load data from a YouTube video.

        Args:
            url: Address of the YouTube video to load.

        Returns:
            A dict with two keys: ``doc_id``, the SHA-256 hex digest of the
            cleaned content concatenated with ``url``, and ``data``, a
            one-element list holding the ``content`` / ``meta_data`` pair.

        Raises:
            ValueError: If the loader returns no documents for ``url``.
        """
        video_id = _parse_video_id(url)

        # Fallback used when the available transcript languages cannot be listed.
        languages = ["en"]
        try:
            available = YouTubeTranscriptApi.list_transcripts(video_id)
            languages = [entry.language_code for entry in available]
            raw_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
            # Serialise with ASCII escapes so no raw unicode symbols are stored.
            transcript = json.dumps(raw_transcript, ensure_ascii=True)
        except Exception:
            # Best effort: a missing transcript is logged but does not abort the load.
            logging.exception(f"Failed to fetch transcript for video {url}")
            transcript = "Unavailable"

        documents = YoutubeLoader.from_youtube_url(url, add_video_info=True, language=languages).load()
        if len(documents) == 0:
            raise ValueError(f"No data found for url: {url}")

        first = documents[0]
        content = clean_string(first.page_content)

        # The loader's own metadata dict is extended in place with the source
        # URL and the transcript gathered above.
        metadata = first.metadata
        metadata["url"] = url
        metadata["transcript"] = transcript

        return {
            "doc_id": hashlib.sha256((content + url).encode()).hexdigest(),
            "data": [{"content": content, "meta_data": metadata}],
        }
| #58 |