From 276e36bc8ec96d3bfd6cb67cadc0dee0c50c0b08 Mon Sep 17 00:00:00 2001
From: betteryz <141388234+BetterAndBetterII@users.noreply.github.com>
Date: Sat, 17 Feb 2024 12:19:32 +0800
Subject: [PATCH] Fix llama-hub Data Loader "GPTRepo" encoding issue,
 supporting gemini embedding more configurations. (#10802)

* Support Gemini "transport" configuration

Added Gemini transport method configuration support.

* Sync updates in multi_modal_llms\gemini

* Updated Dashscope qwen llm defaults

Setting qwen default num_outputs and temperature

* cr

* support gemini embedding configuration

Support configuring api_base, api_key, and the transport method.

* fix gptrepo data connector encoding issue

Reading a file with the platform default encoding (e.g. GBK) can produce garbled characters. Added an encoding configuration option.

* sync latest repo

* sync latest repo

* cr

* cr

---------

Co-authored-by: Haotian Zhang <socool.king@gmail.com>
---
 .../llama_index/readers/gpt_repo/base.py             | 10 ++++++++--
 .../llama_index/legacy/embeddings/gemini.py          | 12 +++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py b/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py
index 1c2683294..409d2df2a 100644
--- a/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-gpt-repo/llama_index/readers/gpt_repo/base.py
@@ -58,6 +58,7 @@ def process_repository(
     ignore_list,
     concatenate: bool = False,
     extensions: Optional[List[str]] = None,
+    encoding: Optional[str] = "utf-8",
 ) -> List[str]:
     """Process repository."""
     result_texts = []
@@ -74,7 +75,7 @@ def process_repository(
                 not should_ignore(relative_file_path, ignore_list)
                 and is_correct_extension
             ):
-                with open(file_path, errors="ignore") as file:
+                with open(file_path, errors="ignore", encoding=encoding) as file:
                     contents = file.read()
                 result_text += "-" * 4 + "\n"
                 result_text += f"{relative_file_path}\n"
@@ -105,6 +106,7 @@ class GPTRepoReader(BaseReader):
         repo_path: str,
         preamble_str: Optional[str] = None,
         extensions: Optional[List[str]] = None,
+        encoding: Optional[str] = "utf-8",
     ) -> List[Document]:
         """Load data from the input directory.
 
@@ -146,7 +148,11 @@ class GPTRepoReader(BaseReader):
                 "aforementioned file as context.\n"
             )
         text_list = process_repository(
-            repo_path, ignore_list, concatenate=self.concatenate, extensions=extensions
+            repo_path,
+            ignore_list,
+            concatenate=self.concatenate,
+            extensions=extensions,
+            encoding=encoding,
         )
         docs = []
         for text in text_list:
diff --git a/llama-index-legacy/llama_index/legacy/embeddings/gemini.py b/llama-index-legacy/llama_index/legacy/embeddings/gemini.py
index 67278a3be..838fa1447 100644
--- a/llama-index-legacy/llama_index/legacy/embeddings/gemini.py
+++ b/llama-index-legacy/llama_index/legacy/embeddings/gemini.py
@@ -47,7 +47,17 @@ class GeminiEmbedding(BaseEmbedding):
                 "google-generativeai package not found, install with"
                 "'pip install google-generativeai'"
             )
-        gemini.configure(api_key=api_key)
+        # API keys are optional. The API can be authorised via OAuth (detected
+        # environmentally) or by the GOOGLE_API_KEY environment variable.
+        config_params: Dict[str, Any] = {
+            "api_key": api_key or os.getenv("GOOGLE_API_KEY"),
+        }
+        if api_base:
+            config_params["client_options"] = {"api_endpoint": api_base}
+        if transport:
+            config_params["transport"] = transport
+        # transport: A string, one of: [`rest`, `grpc`, `grpc_asyncio`].
+        gemini.configure(**config_params)
         self._model = gemini
 
         super().__init__(
-- 
GitLab