From 934d0486e035bae82011dcd13482e6c13a9fab3d Mon Sep 17 00:00:00 2001 From: Andrei Fajardo <92402603+nerdai@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:38:36 -0500 Subject: [PATCH] [FIX] download_llama_pack for python packages containing multiple packs (#11272) * use recursive download to get all files * fix constants * remove branch and repo constants * remove f string --- .../llama_index/core/download/pack.py | 25 ++++++----- .../llama_index/core/download/utils.py | 41 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/llama-index-core/llama_index/core/download/pack.py b/llama-index-core/llama_index/core/download/pack.py index bcf7430470..cdd99de658 100644 --- a/llama-index-core/llama_index/core/download/pack.py +++ b/llama-index-core/llama_index/core/download/pack.py @@ -13,17 +13,15 @@ import requests from llama_index.core.download.utils import ( ChangeDirectory, get_file_content, - get_source_files_list, initialize_directory, + get_source_files_recursive, ) -BRANCH = "nerdai/migration-v0_10_0" -REPO = "nerdai" LLAMA_PACKS_CONTENTS_URL = ( - f"https://raw.githubusercontent.com/{REPO}/llama_index/{BRANCH}/llama-index-packs" + "https://raw.githubusercontent.com/run-llama/llama_index/main/llama-index-packs" ) LLAMA_PACKS_SOURCE_FILES_GITHUB_TREE_URL = ( - f"https://github.com/{REPO}/llama_index/tree/{BRANCH}/llama-index-packs" + "https://github.com/run-llama/llama_index/tree/main" ) PY_NAMESPACE = "llama_index/packs" @@ -51,17 +49,24 @@ def download_module_and_reqs( os.makedirs(module_path, exist_ok=True) # download all source files - source_files = get_source_files_list( + source_files = get_source_files_recursive( str(remote_source_dir_path), - f"/{package}/{PY_NAMESPACE}/{sub_module}", + f"/llama-index-packs/{package}/{PY_NAMESPACE}/{sub_module}", ) for source_file in source_files: source_file_raw_content, _ = get_file_content( str(remote_dir_path), - f"/{package}/{PY_NAMESPACE}/{sub_module}/{source_file}", + f"{source_file}", ) - with open(f"{module_path}/{source_file}", "w") as f: + local_source_file_path = ( + f"{local_dir_path}/{'/'.join(source_file.split('/')[2:])}" + ) + # ensure parent dir of file exists + Path(local_source_file_path).parent.absolute().mkdir( + parents=True, exist_ok=True + ) + with open(local_source_file_path, "w") as f: f.write(source_file_raw_content) # pyproject.toml and README @@ -99,7 +104,7 @@ def download_llama_pack_template( refresh_cache: bool = False, custom_dir: Optional[str] = None, custom_path: Optional[str] = None, - base_file_name: str = "base.py", + base_file_name: str = "__init__.py", ) -> Any: # create directory / get path dirpath = initialize_directory(custom_path=custom_path, custom_dir=custom_dir) diff --git a/llama-index-core/llama_index/core/download/utils.py b/llama-index-core/llama_index/core/download/utils.py index 933e2ebd18..e8dd91c284 100644 --- a/llama-index-core/llama_index/core/download/utils.py +++ b/llama-index-core/llama_index/core/download/utils.py @@ -95,6 +95,47 @@ def get_source_files_list(source_tree_url: str, path: str) -> List[str]: return [item["name"] for item in payload["tree"]["items"]] +def recursive_tree_traverse( + tree_urls: List[Tuple[str, str]], acc: List[str], source_tree_url: str +): + """Recursively traversge Github trees to get all file paths in a folder.""" + if not tree_urls: + return acc + else: + url = tree_urls[0] + + try: + res = requests.get(url) + tree_elements = res.json()["payload"]["tree"]["items"] + except Exception: + raise ValueError("Failed to traverse github tree source.") + + new_trees = [ + source_tree_url + "/" + el["path"] + for el in tree_elements + if el["contentType"] == "directory" + ] + + acc += [ + el["path"].replace("llama-index-packs/", "/") + for el in tree_elements + if el["contentType"] == "file" + ] + + return recursive_tree_traverse( + tree_urls=tree_urls[1:] + new_trees, + acc=acc, + source_tree_url=source_tree_url, + ) + + +def get_source_files_recursive(source_tree_url: str, path: str) -> List[str]: + """Get source files of a Github folder recursively.""" + initial_url = source_tree_url + path + "?recursive=1" + initial_tree_urls = [initial_url] + return recursive_tree_traverse(initial_tree_urls, [], source_tree_url) + + class ChangeDirectory: """Context manager for changing the current working directory.""" -- GitLab