From 276e36bc8ec96d3bfd6cb67cadc0dee0c50c0b08 Mon Sep 17 00:00:00 2001 From: betteryz <141388234+BetterAndBetterII@users.noreply.github.com> Date: Sat, 17 Feb 2024 12:19:32 +0800 Subject: [PATCH] Fix llama-hub Data Loader "GPTRepo" encoding issue, supporting more Gemini embedding configurations. (#10802) * Support Gemini "transport" configuration Added Gemini transport method configuration support. * Sync updates in multi_modal_llms\gemini * Updated Dashscope qwen llm defaults Setting qwen default num_outputs and temperature * cr * support gemini embedding configuration support configuring api_base, api_key, transport method * fix gptrepo data connector encoding issue Reading a file in the default encoding (e.g. GBK) can produce garbled characters. Added an encoding configuration option. * sync latest repo * sync latest repo * cr * cr --------- Co-authored-by: Haotian Zhang <socool.king@gmail.com> --- .../llama_index/readers/gpt_repo/base.py | 10 ++++++++-- .../llama_index/legacy/embeddings/gemini.py | 12 +++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py b/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py index 1c2683294..409d2df2a 100644 --- a/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py +++ b/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py @@ -58,6 +58,7 @@ def process_repository( ignore_list, concatenate: bool = False, extensions: Optional[List[str]] = None, + encoding: Optional[str] = "utf-8", ) -> List[str]: """Process repository.""" result_texts = [] @@ -74,7 +75,7 @@ def process_repository( not should_ignore(relative_file_path, ignore_list) and is_correct_extension ): - with open(file_path, errors="ignore") as file: + with open(file_path, errors="ignore", encoding=encoding) as file: contents = 
file.read() result_text += "-" * 4 + "\n" result_text += f"{relative_file_path}\n" @@ -105,6 +106,7 @@ class GPTRepoReader(BaseReader): repo_path: str, preamble_str: Optional[str] = None, extensions: Optional[List[str]] = None, + encoding: Optional[str] = "utf-8", ) -> List[Document]: """Load data from the input directory. @@ -146,7 +148,11 @@ class GPTRepoReader(BaseReader): "aforementioned file as context.\n" ) text_list = process_repository( - repo_path, ignore_list, concatenate=self.concatenate, extensions=extensions + repo_path, + ignore_list, + concatenate=self.concatenate, + extensions=extensions, + encoding=encoding, ) docs = [] for text in text_list: diff --git a/llama-index-legacy/llama_index/legacy/embeddings/gemini.py b/llama-index-legacy/llama_index/legacy/embeddings/gemini.py index 67278a3be..838fa1447 100644 --- a/llama-index-legacy/llama_index/legacy/embeddings/gemini.py +++ b/llama-index-legacy/llama_index/legacy/embeddings/gemini.py @@ -47,7 +47,17 @@ class GeminiEmbedding(BaseEmbedding): "google-generativeai package not found, install with" "'pip install google-generativeai'" ) - gemini.configure(api_key=api_key) + # API keys are optional. The API can be authorised via OAuth (detected + # environmentally) or by the GOOGLE_API_KEY environment variable. + config_params: Dict[str, Any] = { + "api_key": api_key or os.getenv("GOOGLE_API_KEY"), + } + if api_base: + config_params["client_options"] = {"api_endpoint": api_base} + if transport: + config_params["transport"] = transport + # transport: A string, one of: [`rest`, `grpc`, `grpc_asyncio`]. + gemini.configure(**config_params) self._model = gemini super().__init__( -- GitLab