repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import concurrent.futures |
| #2 | import hashlib |
| #3 | import logging |
| #4 | import re |
| #5 | import shlex |
| #6 | from typing import Any, Optional |
| #7 | |
| #8 | from tqdm import tqdm |
| #9 | |
| #10 | from embedchain.loaders.base_loader import BaseLoader |
| #11 | from embedchain.utils.misc import clean_string |
| #12 | |
# Base URLs for the GitHub website and its REST API.
GITHUB_URL = "https://github.com"
GITHUB_API_URL = "https://api.github.com"

# Values accepted after `type:` in a loader search query.
VALID_SEARCH_TYPES = {"code", "repo", "pr", "issue", "discussion", "branch", "file"}
| #17 | |
| #18 | |
class GithubLoader(BaseLoader):
    """Load documents from GitHub for a search query.

    A query must contain a ``repo:`` term and at least one ``type:`` term,
    e.g. ``"repo:owner/name type:repo,issue"``. Each requested type is
    dispatched to a dedicated fetcher and the results are merged into a
    single list of ``{"content": ..., "meta_data": ...}`` records.
    """

    # Shared by the "no config" and "no token" failures in __init__.
    _MISSING_TOKEN_MESSAGE = "GithubLoader requires a personal access token to use github api. Check - `https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic`"  # noqa: E501

    def __init__(self, config: Optional[dict[str, Any]] = None):
        """Create the loader and its GitHub client.

        Args:
            config: Loader configuration; must contain a non-empty ``"token"``.

        Raises:
            ValueError: If ``config`` is empty, the token is missing, or the
                optional ``github`` dependency cannot be imported.
        """
        super().__init__()
        if not config:
            raise ValueError(self._MISSING_TOKEN_MESSAGE)

        try:
            from github import Github
        except ImportError as e:
            # Adjacent literals instead of a backslash continuation, which
            # leaked the source indentation into the message.
            raise ValueError(
                "GithubLoader requires extra dependencies. "
                "Install with `pip install gitpython==3.1.38 PyGithub==1.59.1`"
            ) from e

        self.config = config
        token = config.get("token")
        if not token:
            raise ValueError(self._MISSING_TOKEN_MESSAGE)

        try:
            self.client = Github(token)
        except Exception as e:
            # Deliberately best-effort: load_data() reports the missing client.
            logging.error(f"GithubLoader failed to initialize client: {e}")
            self.client = None

    def _github_search_code(self, query: str) -> list[dict]:
        """Search GitHub code and return one record per readable file.

        Files whose content cannot be decoded are skipped with a warning,
        mirroring the best-effort handling in ``_get_github_repo_data``.
        """
        data = []
        results = self.client.search_code(query)
        for result in tqdm(results, total=results.totalCount, desc="Loading code files from github"):
            url = result.html_url
            try:
                content = result.decoded_content.decode("utf-8")
            except Exception:
                logging.warning(f"Failed to read file: {url}")
                continue
            logging.info(f"Added data from url: {url}")
            data.append(
                {
                    "content": clean_string(content),
                    "meta_data": {"url": url},
                }
            )
        return data

    def _get_github_repo_data(self, repo_name: str, branch_name: str = None, file_path: str = None) -> list[dict]:
        """Get file contents from a repository.

        Args:
            repo_name: Repository in ``owner/name`` form.
            branch_name: Optional git ref to read from; the repository default
                is used when omitted.
            file_path: Optional path to start from instead of the repo root.

        Returns:
            One record per readable file, with its path in ``meta_data``.
            Unreadable files and directories are skipped with a warning.
        """
        from collections import deque

        data = []
        repo = self.client.get_repo(repo_name)

        # Only pass `ref` when a branch was requested, and reuse it for every
        # nested directory so the whole walk stays on that branch.
        ref_kwargs = {"ref": branch_name} if branch_name else {}

        start = repo.get_contents(file_path or "", **ref_kwargs)
        # A directory path yields a list of entries, a file path a single entry.
        pending = deque(start if isinstance(start, list) else [start])

        with tqdm(desc="Loading files:", unit="item") as progress_bar:
            while pending:
                entry = pending.popleft()
                if entry.type == "dir":
                    try:
                        pending.extend(repo.get_contents(entry.path, **ref_kwargs))
                    except Exception:
                        logging.warning(f"Failed to read directory: {entry.path}")
                else:
                    try:
                        file_text = entry.decoded_content.decode()
                    except Exception:
                        logging.warning(f"Failed to read file: {entry.path}")
                    else:
                        data.append(
                            {
                                "content": clean_string(file_text),
                                "meta_data": {
                                    "path": entry.path,
                                },
                            }
                        )
                progress_bar.update(1)

        return data

    def _github_search_repo(self, query: str) -> list[dict]:
        """Load every file of the repository named by the ``repo:`` term."""
        logging.info(f"Searching github repos with query: {query}")
        # Take the value of the `repo:` term itself; splitting on the last ":"
        # picked the wrong token whenever another qualifier followed it.
        match = re.search(r"repo:(\S+)", query)
        repo_name = match.group(1) if match else query.split(":")[-1]
        return self._get_github_repo_data(repo_name)

    def _github_search_issues_and_pr(self, query: str, type: str) -> list[dict]:
        """Search GitHub issues or pull requests.

        Args:
            query: GitHub search query without the ``is:`` qualifier.
            type: ``"issue"`` or ``"pr"``; appended to the query as ``is:<type>``.

        Returns:
            One record per result with a non-empty body. The content joins the
            title, labels, body and comments; the metadata carries the URL and
            creation timestamps.
        """
        data = []

        query = f"{query} is:{type}"
        logging.info(f"Searching github for query: {query}")

        results = self.client.search_issues(query)

        logging.info(f"Total results: {results.totalCount}")
        for result in tqdm(results, total=results.totalCount, desc=f"Loading {type} from github"):
            url = result.html_url
            body = result.body
            if not body:
                logging.warning(f"Skipping issue because empty content for: {url}")
                continue
            labels = " ".join(label.name for label in result.labels)
            issue_comments = list(result.get_comments())
            comments = [f"{comment.user.name}:{comment.body}" for comment in issue_comments]
            comments_created_at = [str(comment.created_at) for comment in issue_comments]
            content = "\n".join([result.title, labels, body, *comments])
            data.append(
                {
                    "content": clean_string(content),
                    "meta_data": {
                        "url": url,
                        "created_at": str(result.created_at),
                        "comments_created_at": " ".join(comments_created_at),
                    },
                }
            )
        return data

    # need to test more for discussion
    def _github_search_discussions(self, query: str) -> list[dict]:
        """Search GitHub team discussions (experimental).

        Not reachable through ``_search_github_data``, which rejects the
        ``discussion`` type. Discussion comments are not fetched, so
        ``comments_created_at`` is always empty.
        """
        data = []

        query = f"{query} is:discussion"
        logging.info(f"Searching github repo for query: {query}")
        repos_results = self.client.search_repositories(query)
        logging.info(f"Total repos found: {repos_results.totalCount}")
        for repo_result in tqdm(repos_results, total=repos_results.totalCount, desc="Loading discussions from github"):
            for team in repo_result.get_teams():
                for discussion in team.get_discussions():
                    url = discussion.html_url
                    body = discussion.body
                    if not body:
                        logging.warning(f"Skipping discussion because empty content for: {url}")
                        continue
                    # Logged instead of printed: library code should not write to stdout.
                    logging.info(f"Discussion comments: {discussion.comments_url}")
                    content = "\n".join([discussion.title, body])
                    data.append(
                        {
                            "content": clean_string(content),
                            "meta_data": {
                                "url": url,
                                "created_at": str(discussion.created_at),
                                "comments_created_at": "",
                            },
                        }
                    )
        return data

    def _get_github_repo_branch(self, query: str, type: str) -> list[dict]:
        """Get file contents for a specific branch.

        Args:
            query: Must contain ``repo:<owner>/<name> name:<branch>``.
            type: Search type, used only for logging.

        Raises:
            ValueError: If the query does not contain both terms.
        """
        logging.info(f"Searching github repo for query: {query} is:{type}")
        match = re.search(r"repo:(\S+) name:(\S+)", query)
        if not match:
            # The parsed names do not exist on this path, so report the query.
            raise ValueError(
                f"Repository name and Branch name not found in query: {query!r}. "
                "Expected `repo:<owner>/<repo> name:<branch>`"
            )

        repo_name, branch_name = match.groups()
        return self._get_github_repo_data(repo_name=repo_name, branch_name=branch_name)

    def _get_github_repo_file(self, query: str, type: str) -> list[dict]:
        """Get the content of a specific file (or directory) path.

        Args:
            query: Must contain ``repo:<owner>/<name> path:<path>``.
            type: Search type, used only for logging.

        Raises:
            ValueError: If the query does not contain both terms.
        """
        logging.info(f"Searching github repo for query: {query} is:{type}")
        match = re.search(r"repo:(\S+) path:(\S+)", query)
        if not match:
            # The parsed names do not exist on this path, so report the query.
            raise ValueError(
                f"Repository name and File name not found in query: {query!r}. "
                "Expected `repo:<owner>/<repo> path:<file path>`"
            )

        repo_name, file_path = match.groups()
        return self._get_github_repo_data(repo_name=repo_name, file_path=file_path)

    def _search_github_data(self, search_type: str, query: str):
        """Dispatch ``query`` to the fetcher for ``search_type``.

        Raises:
            ValueError: For the not-yet-supported ``discussion`` type.
            NotImplementedError: For any unknown search type.
        """
        if search_type == "code":
            return self._github_search_code(query)
        if search_type == "repo":
            return self._github_search_repo(query)
        if search_type in ("issue", "pr"):
            return self._github_search_issues_and_pr(query, search_type)
        if search_type == "branch":
            return self._get_github_repo_branch(query, search_type)
        if search_type == "file":
            return self._get_github_repo_file(query, search_type)
        if search_type == "discussion":
            raise ValueError("GithubLoader does not support searching discussions yet.")
        raise NotImplementedError(f"{search_type} not supported")

    @staticmethod
    def _get_valid_github_query(query: str):
        """Check that ``query`` is valid and split it into types and GitHub query.

        Returns:
            A ``(types, github_query)`` tuple: the set of requested search
            types and the query with all ``type:`` terms removed.

        Raises:
            ValueError: If the ``repo:`` term or every ``type:`` term is
                missing, or if any requested type is not in
                ``VALID_SEARCH_TYPES``.
        """
        query_terms = shlex.split(query)
        # query must provide repo to load data from
        if len(query_terms) < 1 or "repo:" not in query:
            raise ValueError(
                "GithubLoader requires a search query with `repo:` term. Refer docs - `https://docs.embedchain.ai/data-sources/github`"  # noqa: E501
            )

        github_query = []
        types = set()
        type_pattern = r"type:([a-zA-Z,]+)"
        for term in query_terms:
            term_match = re.search(type_pattern, term)
            if term_match:
                types.update(term_match.group(1).split(","))
            else:
                github_query.append(term)

        # query must provide search type
        if not types:
            raise ValueError(
                "GithubLoader requires a search query with `type:` term. Refer docs - `https://docs.embedchain.ai/data-sources/github`"  # noqa: E501
            )

        # Validate every collected type, not only those of the last `type:` term.
        invalid_types = sorted(types - VALID_SEARCH_TYPES)
        if invalid_types:
            raise ValueError(
                f"Invalid search type: {', '.join(invalid_types)}. "
                f"Valid types are: {', '.join(sorted(VALID_SEARCH_TYPES))}"
            )

        return types, " ".join(github_query)

    def load_data(self, search_query: str, max_results: int = 1000) -> dict[str, Any]:
        """Load data from a GitHub search query.

        Args:
            search_query: Query with a ``repo:`` term and one or more
                ``type:`` terms.
            max_results: Accepted for backward compatibility; currently unused.

        Returns:
            ``{"doc_id": <sha256 of the GitHub query>, "data": [records...]}``.

        Raises:
            ValueError: If the client failed to initialize or the query is invalid.
        """
        if not self.client:
            raise ValueError(
                "GithubLoader client is not initialized, data will not be loaded. Refer docs - `https://docs.embedchain.ai/data-sources/github`"  # noqa: E501
            )

        search_types, query = self._get_valid_github_query(search_query)
        # Sorted so the order of the merged results does not depend on set iteration order.
        ordered_types = sorted(search_types)
        logging.info(f"Searching github for query: {query}, with types: {', '.join(ordered_types)}")

        data = []

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            results = executor.map(self._search_github_data, ordered_types, [query] * len(ordered_types))
            for search_data in tqdm(results, total=len(ordered_types), desc="Searching data from github"):
                data.extend(search_data)

        return {
            "doc_id": hashlib.sha256(query.encode()).hexdigest(),
            "data": data,
        }
| #313 |