From f60869ab2258df737be2ba0fde8291aa0c4197e5 Mon Sep 17 00:00:00 2001 From: Elia Bracci <106666672+elia-bracci-hs@users.noreply.github.com> Date: Tue, 12 Nov 2024 23:54:44 +0100 Subject: [PATCH] [fix] Add base URL extraction method to GithubRepositoryReader and update version to 0.4.0 (#16926) --- .../readers/github/repository/base.py | 15 ++++++- .../llama-index-readers-github/pyproject.toml | 2 +- .../tests/test_gh_base_url.py | 43 +++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 llama-index-integrations/readers/llama-index-readers-github/tests/test_gh_base_url.py diff --git a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py index 1b1d34d01b..7fc3a7120a 100644 --- a/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py +++ b/llama-index-integrations/readers/llama-index-readers-github/llama_index/readers/github/repository/base.py @@ -13,6 +13,7 @@ import enum import logging import os import pathlib +import re import tempfile from typing import Any, Callable, Dict, List, Optional, Tuple @@ -385,6 +386,13 @@ class GithubRepositoryReader(BaseReader): ) return blobs_and_full_paths + def _get_base_url(self, blob_url): + match = re.match(r"(https://[^/]+\.com/)", blob_url) + if match: + return match.group(1) + else: + return "https://github.com/" + async def _generate_documents( self, blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]], @@ -455,7 +463,12 @@ class GithubRepositoryReader(BaseReader): + f"- adding to documents - {full_path}", ) url = os.path.join( - "https://github.com/", self._owner, self._repo, "blob/", id, full_path + self._get_base_url(blob_data.url), + self._owner, + self._repo, + "blob/", + id, + full_path, ) document = Document( text=decoded_text, diff --git a/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml index 265b8c4333..a42630ff65 100644 --- a/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-github/pyproject.toml @@ -31,7 +31,7 @@ license = "MIT" maintainers = ["ahmetkca", "moncho", "rwood-97"] name = "llama-index-readers-github" readme = "README.md" -version = "0.3.0" +version = "0.4.0" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" diff --git a/llama-index-integrations/readers/llama-index-readers-github/tests/test_gh_base_url.py b/llama-index-integrations/readers/llama-index-readers-github/tests/test_gh_base_url.py new file mode 100644 index 0000000000..37ea20da5c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-github/tests/test_gh_base_url.py @@ -0,0 +1,43 @@ +import pytest +from llama_index.readers.github import GithubRepositoryReader + + +class MockGithubClient: + pass + + +@pytest.fixture() +def github_reader(): + return GithubRepositoryReader( + github_client=MockGithubClient(), owner="owner", repo="repo" + ) + + +@pytest.mark.parametrize( + ("blob_url", "expected_base_url"), + [ + ("https://github.com/owner/repo/blob/main/file.py", "https://github.com/"), + ( + "https://github-enterprise.com/owner/repo/blob/main/file.py", + "https://github-enterprise.com/", + ), + ( + "https://custom-domain.com/owner/repo/blob/main/file.py", + "https://custom-domain.com/", + ), + ( + "https://subdomain.github.com/owner/repo/blob/main/file.py", + "https://subdomain.github.com/", + ), + ( + "https://something.org/owner/repo/blob/main/file.py", + "https://github.com/", + ), + ("", "https://github.com/"), + ], +) +def test_get_base_url(github_reader, blob_url, expected_base_url): + base_url = github_reader._get_base_url(blob_url) + assert ( + base_url == expected_base_url + ), f"Expected {expected_base_url}, but got {base_url}" -- GitLab