my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

import concurrent.futures
import hashlib
import logging
import re
import shlex
from collections import deque
from typing import Any, Optional

from tqdm import tqdm

from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string
#12
# Base URLs for the GitHub website and its REST API.
GITHUB_URL = "https://github.com"
GITHUB_API_URL = "https://api.github.com"

# Values accepted after `type:` in a loader search query.
VALID_SEARCH_TYPES = {"code", "repo", "pr", "issue", "discussion", "branch", "file"}
#17
#18
class GithubLoader(BaseLoader):
    """Load data from GitHub search query.

    A query must contain a `repo:` term and at least one `type:` term whose
    value is a comma-separated list drawn from `VALID_SEARCH_TYPES`, e.g.
    `repo:owner/name type:repo,issue`. Every loaded item is returned as a
    dict with a cleaned `content` string and a `meta_data` dict.
    """

    _TOKEN_REQUIRED_MESSAGE = "GithubLoader requires a personal access token to use github api. Check - `https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic`"  # noqa: E501

    def __init__(self, config: Optional[dict[str, Any]] = None):
        """Create the loader and its GitHub client.

        Args:
            config: Loader configuration; must contain a non-empty `token`.

        Raises:
            ValueError: If `config` or its `token` is missing, or if the
                PyGithub dependency cannot be imported.
        """
        super().__init__()
        if not config:
            raise ValueError(self._TOKEN_REQUIRED_MESSAGE)

        # Imported lazily so the dependency is only needed when this loader is used.
        try:
            from github import Github
        except ImportError as e:
            raise ValueError(
                "GithubLoader requires extra dependencies. "
                "Install with `pip install gitpython==3.1.38 PyGithub==1.59.1`"
            ) from e

        self.config = config
        token = config.get("token")
        if not token:
            raise ValueError(self._TOKEN_REQUIRED_MESSAGE)

        try:
            self.client = Github(token)
        except Exception as e:
            # Best effort: `load_data` rejects calls while the client is unset.
            logging.error(f"GithubLoader failed to initialize client: {e}")
            self.client = None

    def _github_search_code(self, query: str) -> list[dict]:
        """Search GitHub code and return the content of every matching file."""
        data = []
        results = self.client.search_code(query)
        for result in tqdm(results, total=results.totalCount, desc="Loading code files from github"):
            url = result.html_url
            try:
                content = result.decoded_content.decode("utf-8")
            except Exception:
                # Skip files that cannot be fetched/decoded instead of aborting
                # the whole search (same policy as the repo walker below).
                logging.warning(f"Failed to read file: {url}")
                continue
            logging.info(f"Added data from url: {url}")
            data.append(
                {
                    "content": clean_string(content),
                    "meta_data": {"url": url},
                }
            )
        return data

    def _get_github_repo_data(
        self, repo_name: str, branch_name: Optional[str] = None, file_path: Optional[str] = None
    ) -> list[dict]:
        """Get file contents from a repository.

        Args:
            repo_name: Repository in `owner/name` form.
            branch_name: Optional git ref to read from; the repository default
                is used when omitted.
            file_path: Optional path to start from; the repository root is
                used when omitted. May point at a file or a directory.

        Returns:
            One entry per readable file, with the file path in `meta_data`.
        """
        data = []
        repo = self.client.get_repo(repo_name)
        # Only pass `ref` when a branch was requested so the API default applies otherwise.
        ref_kwargs = {"ref": branch_name} if branch_name else {}

        def fetch(path: str) -> list:
            # `get_contents` yields a single item for a file and a list for a directory.
            contents = repo.get_contents(path, **ref_kwargs)
            return contents if isinstance(contents, list) else [contents]

        pending = deque(fetch(file_path or ""))

        with tqdm(desc="Loading files:", unit="item") as progress_bar:
            while pending:
                entry = pending.popleft()
                if entry.type == "dir":
                    try:
                        # Queue the directory's children using the same ref as the root.
                        pending.extend(fetch(entry.path))
                    except Exception:
                        logging.warning(f"Failed to read directory: {entry.path}")
                else:
                    try:
                        file_text = entry.decoded_content.decode()
                    except Exception:
                        logging.warning(f"Failed to read file: {entry.path}")
                    else:
                        data.append(
                            {
                                "content": clean_string(file_text),
                                "meta_data": {
                                    "path": entry.path,
                                },
                            }
                        )
                progress_bar.update(1)

        return data

    def _github_search_repo(self, query: str) -> list[dict]:
        """Load every file of the repository named by the `repo:` term."""
        logging.info(f"Searching github repos with query: {query}")
        # Prefer the explicit `repo:` qualifier; fall back to the text after the
        # last colon for queries that do not match.
        repo_match = re.search(r"repo:(\S+)", query)
        repo_name = repo_match.group(1) if repo_match else query.split(":")[-1]
        return self._get_github_repo_data(repo_name)

    def _github_search_issues_and_pr(self, query: str, type: str) -> list[dict]:
        """Search GitHub issues and PRs.

        Args:
            query: GitHub search query without the `is:` qualifier.
            type: Value appended as `is:<type>` (`issue` or `pr`).

        Returns:
            One entry per result with a non-empty body; content is the title,
            labels, body and comments joined by newlines.
        """
        data = []

        query = f"{query} is:{type}"
        logging.info(f"Searching github for query: {query}")

        results = self.client.search_issues(query)

        logging.info(f"Total results: {results.totalCount}")
        for result in tqdm(results, total=results.totalCount, desc=f"Loading {type} from github"):
            url = result.html_url
            title = result.title
            body = result.body
            if not body:
                logging.warning(f"Skipping issue because empty content for: {url}")
                continue
            labels = " ".join(label.name for label in result.labels)
            comments = []
            comments_created_at = []
            for comment in result.get_comments():
                comments_created_at.append(str(comment.created_at))
                comments.append(f"{comment.user.name}:{comment.body}")
            content = "\n".join([title, labels, body, *comments])
            metadata = {
                "url": url,
                "created_at": str(result.created_at),
                "comments_created_at": " ".join(comments_created_at),
            }
            data.append(
                {
                    "content": clean_string(content),
                    "meta_data": metadata,
                }
            )
        return data

    # need to test more for discussion
    def _github_search_discussions(self, query: str) -> list[dict]:
        """Search GitHub discussions.

        Not reachable through `_search_github_data`, which rejects the
        `discussion` type. Comments are not loaded, so only the title and body
        of each discussion are returned.
        """
        data = []

        query = f"{query} is:discussion"
        logging.info(f"Searching github repo for query: {query}")
        repos_results = self.client.search_repositories(query)
        logging.info(f"Total repos found: {repos_results.totalCount}")
        for repo_result in tqdm(repos_results, total=repos_results.totalCount, desc="Loading discussions from github"):
            for team in repo_result.get_teams():
                for discussion in team.get_discussions():
                    url = discussion.html_url
                    title = discussion.title
                    body = discussion.body
                    if not body:
                        logging.warning(f"Skipping discussion because empty content for: {url}")
                        continue
                    # Comment loading is not implemented; log where they live.
                    logging.debug(f"Discussion comments: {discussion.comments_url}")
                    metadata = {
                        "url": url,
                        "created_at": str(discussion.created_at),
                        "comments_created_at": "",
                    }
                    data.append(
                        {
                            "content": clean_string("\n".join([title, body])),
                            "meta_data": metadata,
                        }
                    )
        return data

    def _get_github_repo_branch(self, query: str, type: str) -> list[dict]:
        """Get file contents for specific branch.

        Args:
            query: Must contain `repo:<name> name:<branch>`.
            type: Search type, used for logging only.

        Raises:
            ValueError: If the query does not contain both terms.
        """
        logging.info(f"Searching github repo for query: {query} is:{type}")
        match = re.search(r"repo:(\S+) name:(\S+)", query)
        if not match:
            # The parsed names do not exist on this path, so report the query itself.
            raise ValueError(
                f"Repository name and Branch name not found in query: `{query}`. "
                "Expected `repo:<owner>/<repo> name:<branch>`"
            )

        repo_name, branch_name = match.groups()
        return self._get_github_repo_data(repo_name=repo_name, branch_name=branch_name)

    def _get_github_repo_file(self, query: str, type: str) -> list[dict]:
        """Get specific file content.

        Args:
            query: Must contain `repo:<name> path:<file path>`.
            type: Search type, used for logging only.

        Raises:
            ValueError: If the query does not contain both terms.
        """
        logging.info(f"Searching github repo for query: {query} is:{type}")
        match = re.search(r"repo:(\S+) path:(\S+)", query)
        if not match:
            # The parsed names do not exist on this path, so report the query itself.
            raise ValueError(
                f"Repository name and File name not found in query: `{query}`. "
                "Expected `repo:<owner>/<repo> path:<file path>`"
            )

        repo_name, file_path = match.groups()
        return self._get_github_repo_data(repo_name=repo_name, file_path=file_path)

    def _search_github_data(self, search_type: str, query: str) -> list[dict]:
        """Search github data.

        Raises:
            ValueError: For the `discussion` type, which is not supported yet.
            NotImplementedError: For any other unknown search type.
        """
        if search_type == "code":
            return self._github_search_code(query)
        if search_type == "repo":
            return self._github_search_repo(query)
        if search_type in ("issue", "pr"):
            return self._github_search_issues_and_pr(query, search_type)
        if search_type == "branch":
            return self._get_github_repo_branch(query, search_type)
        if search_type == "file":
            return self._get_github_repo_file(query, search_type)
        if search_type == "discussion":
            raise ValueError("GithubLoader does not support searching discussions yet.")
        raise NotImplementedError(f"{search_type} not supported")

    @staticmethod
    def _get_valid_github_query(query: str) -> tuple[set[str], str]:
        """Check if query is valid and return search types and valid GitHub query.

        Returns:
            The set of requested search types and the query with every
            `type:` term removed.

        Raises:
            ValueError: If the query has no `repo:` term, no `type:` term, or
                names a type outside `VALID_SEARCH_TYPES`.
        """
        query_terms = shlex.split(query)
        # query must provide repo to load data from
        if not query_terms or "repo:" not in query:
            raise ValueError(
                "GithubLoader requires a search query with `repo:` term. Refer docs - `https://docs.embedchain.ai/data-sources/github`"  # noqa: E501
            )

        github_query = []
        types = set()
        type_pattern = re.compile(r"type:([a-zA-Z,]+)")
        for term in query_terms:
            term_match = type_pattern.search(term)
            if term_match:
                types.update(term_match.group(1).split(","))
            else:
                github_query.append(term)

        # query must provide search type
        if not types:
            raise ValueError(
                "GithubLoader requires a search query with `type:` term. Refer docs - `https://docs.embedchain.ai/data-sources/github`"  # noqa: E501
            )

        # Validate every collected type, not just those from the last `type:` term.
        for search_type in types:
            if search_type not in VALID_SEARCH_TYPES:
                raise ValueError(
                    f"Invalid search type: {search_type}. Valid types are: {', '.join(sorted(VALID_SEARCH_TYPES))}"
                )

        return types, " ".join(github_query)

    def load_data(self, search_query: str, max_results: int = 1000):
        """Load data from GitHub search query.

        Args:
            search_query: Query containing `repo:` and `type:` terms.
            max_results: Accepted for interface compatibility; no limit is
                currently applied to the results.

        Returns:
            A dict with `doc_id` (SHA-256 hex digest of the query without its
            `type:` terms) and `data` (the combined results of all requested
            search types).

        Raises:
            ValueError: If the client is not initialized or the query is invalid.
        """
        if not self.client:
            raise ValueError(
                "GithubLoader client is not initialized, data will not be loaded. Refer docs - `https://docs.embedchain.ai/data-sources/github`"  # noqa: E501
            )

        search_types, query = self._get_valid_github_query(search_query)
        logging.info(f"Searching github for query: {query}, with types: {', '.join(search_types)}")

        data = []

        # Run one search per requested type on a small thread pool.
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            futures_map = executor.map(self._search_github_data, search_types, [query] * len(search_types))
            for search_data in tqdm(futures_map, total=len(search_types), desc="Searching data from github"):
                data.extend(search_data)

        return {
            "doc_id": hashlib.sha256(query.encode()).hexdigest(),
            "data": data,
        }
#313

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public