diff --git a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py index 4c2360667d1b359ca5647cc831da61f78a2d2647..749e24f0e680e3cb71e7e8e85dce603cb5f9520f 100644 --- a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py +++ b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py @@ -21,6 +21,7 @@ from llama_index.core.readers.file.base import _try_loading_included_file_format from llama_index.core.schema import Document from llama_index.readers.github.repository.github_client import ( + BaseGithubClient, GitBranchResponseModel, GitCommitResponseModel, GithubClient, @@ -68,7 +69,7 @@ class GithubRepositoryReader(BaseReader): def __init__( self, - github_client: GithubClient, + github_client: BaseGithubClient, owner: str, repo: str, use_parser: bool = False, @@ -548,7 +549,7 @@ if __name__ == "__main__": reader1 = GithubRepositoryReader( github_client=github_client, - owner="jerryjliu", + owner="run-llama", repo="llama_index", use_parser=False, verbose=True, @@ -575,7 +576,7 @@ if __name__ == "__main__": def load_data_from_commit() -> None: """Load data from a commit.""" documents = reader1.load_data( - commit_sha="22e198b3b166b5facd2843d6a62ac0db07894a13" + commit_sha="23a0f76cbaf663de67f9951c59400e9d0644970a" ) for document in documents: print(document.extra_info) diff --git a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/github_client.py b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/github_client.py index 6c3bc387621c19a03087ba388109132b3444056d..e87e0fe17c9a0149a9a62a3dab7aa280877534a7 100644 --- a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/github_client.py +++ b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/github_client.py @@ -1,13 +1,14 @@ """ -Github API client for the LlamaIndex library. +Github API client for the GPT-Index library. -This module contains the Github API client for the LlamaIndex library. +This module contains the Github API client for the GPT-Index library. It is used by the Github readers to retrieve the data from Github. """ import os + from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Protocol from dataclasses_json import DataClassJsonMixin @@ -104,6 +105,8 @@ class GitCommitResponseModel(DataClassJsonMixin): tree: Tree commit: Commit + url: str + sha: str @dataclass @@ -137,7 +140,61 @@ class GitBranchResponseModel(DataClassJsonMixin): commit: Commit + @dataclass + class Links(DataClassJsonMixin): + self: str + html: str + commit: Commit + name: str + _links: Links + + +class BaseGithubClient(Protocol): + def get_all_endpoints(self) -> Dict[str, str]: + ... + + async def request( + self, + endpoint: str, + method: str, + headers: Dict[str, Any] = {}, + **kwargs: Any, + ) -> Any: + ... + + async def get_tree( + self, + owner: str, + repo: str, + tree_sha: str, + ) -> GitTreeResponseModel: + ... + + async def get_blob( + self, + owner: str, + repo: str, + file_sha: str, + ) -> GitBlobResponseModel: + ... + + async def get_commit( + self, + owner: str, + repo: str, + commit_sha: str, + ) -> GitCommitResponseModel: + ... + + async def get_branch( + self, + owner: str, + repo: str, + branch: Optional[str], + branch_name: Optional[str], + ) -> GitBranchResponseModel: + ... class GithubClient: @@ -214,6 +271,7 @@ class GithubClient: endpoint: str, method: str, headers: Dict[str, Any] = {}, + timeout: Optional[int] = 5, **kwargs: Any, ) -> Any: """ @@ -226,6 +284,7 @@ class GithubClient: - `endpoint (str)`: Name of the endpoint to make the request to. - `method (str)`: HTTP method to use for the request. - `headers (dict)`: HTTP headers to include in the request. + - `timeout (int or None)`: Timeout for the request in seconds. Default is 5. - `**kwargs`: Keyword arguments to pass to the endpoint URL. Returns: @@ -238,7 +297,7 @@ class GithubClient: Examples: >>> response = client.request("getTree", "GET", owner="owner", repo="repo", - tree_sha="tree_sha") + tree_sha="tree_sha", timeout=5) """ try: import httpx @@ -252,20 +311,26 @@ class GithubClient: _client: httpx.AsyncClient async with httpx.AsyncClient( - headers=_headers, base_url=self._base_url + headers=_headers, + base_url=self._base_url, + timeout=timeout, ) as _client: try: response = await _client.request( method, url=self._endpoints[endpoint].format(**kwargs) ) - response.raise_for_status() except httpx.HTTPError as excp: print(f"HTTP Exception for {excp.request.url} - {excp}") - raise + raise excp # noqa: TRY201 return response async def get_branch( - self, owner: str, repo: str, branch: str + self, + owner: str, + repo: str, + branch: Optional[str] = None, + branch_name: Optional[str] = None, + timeout: Optional[int] = 5, ) -> GitBranchResponseModel: """ Get information about a branch. (Github API endpoint: getBranch). @@ -281,16 +346,30 @@ class GithubClient: Examples: >>> branch_info = client.get_branch("owner", "repo", "branch") """ + if branch is None: + if branch_name is None: + raise ValueError("Either branch or branch_name must be provided.") + branch = branch_name + return GitBranchResponseModel.from_json( ( await self.request( - "getBranch", "GET", owner=owner, repo=repo, branch=branch + "getBranch", + "GET", + owner=owner, + repo=repo, + branch=branch, + timeout=timeout, ) ).text ) async def get_tree( - self, owner: str, repo: str, tree_sha: str + self, + owner: str, + repo: str, + tree_sha: str, + timeout: Optional[int] = 5, ) -> GitTreeResponseModel: """ Get information about a tree. (Github API endpoint: getTree). @@ -299,6 +378,7 @@ class GithubClient: - `owner (str)`: Owner of the repository. - `repo (str)`: Name of the repository. - `tree_sha (str)`: SHA of the tree. + - `timeout (int or None)`: Timeout for the request in seconds. Default is 5. Returns: - `tree_info (GitTreeResponseModel)`: Information about the tree. @@ -309,13 +389,22 @@ class GithubClient: return GitTreeResponseModel.from_json( ( await self.request( - "getTree", "GET", owner=owner, repo=repo, tree_sha=tree_sha + "getTree", + "GET", + owner=owner, + repo=repo, + tree_sha=tree_sha, + timeout=timeout, ) ).text ) async def get_blob( - self, owner: str, repo: str, file_sha: str + self, + owner: str, + repo: str, + file_sha: str, + timeout: Optional[int] = 5, ) -> GitBlobResponseModel: """ Get information about a blob. (Github API endpoint: getBlob). @@ -324,6 +413,7 @@ class GithubClient: - `owner (str)`: Owner of the repository. - `repo (str)`: Name of the repository. - `file_sha (str)`: SHA of the file. + - `timeout (int or None)`: Timeout for the request in seconds. Default is 5. Returns: - `blob_info (GitBlobResponseModel)`: Information about the blob. @@ -334,13 +424,22 @@ class GithubClient: return GitBlobResponseModel.from_json( ( await self.request( - "getBlob", "GET", owner=owner, repo=repo, file_sha=file_sha + "getBlob", + "GET", + owner=owner, + repo=repo, + file_sha=file_sha, + timeout=timeout, ) ).text ) async def get_commit( - self, owner: str, repo: str, commit_sha: str + self, + owner: str, + repo: str, + commit_sha: str, + timeout: Optional[int] = 5, ) -> GitCommitResponseModel: """ Get information about a commit. (Github API endpoint: getCommit). @@ -349,6 +448,7 @@ class GithubClient: - `owner (str)`: Owner of the repository. - `repo (str)`: Name of the repository. - `commit_sha (str)`: SHA of the commit. + - `timeout (int or None)`: Timeout for the request in seconds. Default is 5. Returns: - `commit_info (GitCommitResponseModel)`: Information about the commit. @@ -359,7 +459,12 @@ class GithubClient: return GitCommitResponseModel.from_json( ( await self.request( - "getCommit", "GET", owner=owner, repo=repo, commit_sha=commit_sha + "getCommit", + "GET", + owner=owner, + repo=repo, + commit_sha=commit_sha, + timeout=timeout, ) ).text ) diff --git a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/utils.py b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/utils.py index 6795a6b2e3215936818d3479b9dd88178ecf23f8..c22bc19773e59df7c2113c92c972fa196cd4a121 100644 --- a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/utils.py +++ b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/utils.py @@ -3,6 +3,7 @@ Github readers utils. This module contains utility functions for the Github readers. """ + import asyncio import os import time diff --git a/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml index adf90af5dbe3c06baa8c6f3251de681e342c0ef1..d9f7e7d6c094b01c34c4da3efd19303c7d03ccdf 100644 --- a/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml @@ -26,7 +26,7 @@ license = "MIT" maintainers = ["ahmetkca", "moncho", "rwood-97"] name = "llama-index-readers-github" readme = "README.md" -version = "0.1.3" +version = "0.1.4" [tool.poetry.dependencies] python = ">=3.8.1,<3.12"