diff --git a/llama-index-integrations/readers/llama-index-readers-google/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-google/CHANGELOG.md index 6b55fba293062146ad70d4d6fc82e1a0fe392050..34e9ee4085e9cfc32fcfe7630c9886f4cfdfc516 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/CHANGELOG.md +++ b/llama-index-integrations/readers/llama-index-readers-google/CHANGELOG.md @@ -1,5 +1,11 @@ # CHANGELOG +## [0.2.1] - 2024-03-26 + +- Allow passing credentials directly as a string +- Make the reader serializable +- Don't write credentials to disk in cloud mode + ## [0.2.0] - 2024-03-26 - Use separate arg for service account key file, don't conflate client secrets with service account key diff --git a/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py b/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py index 0e0bea6015fd6289cad723872c3b96fecbfd1e95..96c9b6750882782cebe535ec2f1355a4e908724b 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py +++ b/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py @@ -2,16 +2,18 @@ import logging import os +import json import tempfile from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, List, Optional, Tuple from google.auth.transport.requests import Request from google.oauth2 import service_account from google.oauth2.credentials import Credentials from llama_index.core.readers import SimpleDirectoryReader -from llama_index.core.readers.base import BaseReader +from llama_index.core.readers.base import BasePydanticReader +from llama_index.core.bridge.pydantic import PrivateAttr from llama_index.core.schema import Document logger = logging.getLogger(__name__) @@ -20,22 +22,55 @@ logger = logging.getLogger(__name__) SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] -class GoogleDriveReader(BaseReader): - """Google drive reader.""" +class GoogleDriveReader(BasePydanticReader): + """Google Drive Reader. + + Reads files from Google Drive. Credentials passed directly to the constructor + will take precedence over those passed as file paths. + + Args: + is_cloud (Optional[bool]): Whether the reader is being used in + a cloud environment. Will not save credentials to disk if so. + Defaults to False. + credentials_path (Optional[str]): Path to client config file. + Defaults to None. + token_path (Optional[str]): Path to authorized user info file. Defaults + to None. + service_account_key_path (Optional[str]): Path to service account key + file. Defaults to None. + client_config (Optional[dict]): Dictionary containing client config. + Defaults to None. + authorized_user_info (Optional[dict]): Dicstionary containing authorized + user info. Defaults to None. + service_account_key (Optional[dict]): Dictionary containing service + account key. Defaults to None. + + + """ + + client_config: Optional[dict] = None + authorized_user_info: Optional[dict] = None + service_account_key: Optional[dict] = None + token_path: Optional[str] = None + + _is_cloud: bool = PrivateAttr(default=False) + _creds: Credentials = PrivateAttr() + _mimetypes: dict = PrivateAttr() def __init__( self, + is_cloud: Optional[bool] = False, credentials_path: str = "credentials.json", token_path: str = "token.json", service_account_key_path: str = "service_account_key.json", + client_config: Optional[dict] = None, + authorized_user_info: Optional[dict] = None, + service_account_key: Optional[dict] = None, + **kwargs: Any, ) -> None: """Initialize with parameters.""" - self.service_account_key_path = service_account_key_path - self.credentials_path = credentials_path - self.token_path = token_path - self._creds = None - + self._is_cloud = (is_cloud,) # Download Google Docs/Slides/Sheets as actual files # See https://developers.google.com/drive/v3/web/mime-types self._mimetypes = { @@ -55,6 +90,30 @@ class GoogleDriveReader(BaseReader): }, } + # Read the file contents so they can be serialized and stored. + if client_config is None and os.path.isfile(credentials_path): + with open(credentials_path, encoding="utf-8") as json_file: + client_config = json.load(json_file) + + if authorized_user_info is None and os.path.isfile(token_path): + with open(token_path, encoding="utf-8") as json_file: + authorized_user_info = json.load(json_file) + + if service_account_key is None and os.path.isfile(service_account_key_path): + with open(service_account_key_path, encoding="utf-8") as json_file: + service_account_key = json.load(json_file) + + if client_config is None and service_account_key is None: + raise ValueError("Must specify `client_config` or `service_account_key`.") + + super().__init__( + client_config=client_config, + authorized_user_info=authorized_user_info, + service_account_key=service_account_key, + token_path=token_path, + **kwargs, + ) + def _get_credentials(self) -> Tuple[Credentials]: """Authenticate with Google and save credentials. Download the service_account_key.json file with these instructions: https://cloud.google.com/iam/docs/keys-create-delete. @@ -67,11 +126,13 @@ class GoogleDriveReader(BaseReader): # First, we need the Google API credentials for the app creds = None - if Path(self.token_path).exists(): - creds = Credentials.from_authorized_user_file(self.token_path, SCOPES) - elif Path(self.service_account_key_path).exists(): - return service_account.Credentials.from_service_account_file( - self.service_account_key_path, scopes=SCOPES + if Path(self.authorized_user_info).exists(): + creds = Credentials.from_authorized_user_info( + self.authorized_user_info, SCOPES + ) + elif Path(self.service_account_key).exists(): + return service_account.Credentials.from_service_account_info( + self.service_account_key, scopes=SCOPES ) # If there are no (valid) credentials available, let the user log in. @@ -79,13 +140,13 @@ class GoogleDriveReader(BaseReader): if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: - flow = InstalledAppFlow.from_credentials_file( - self.credentials_path, SCOPES - ) + flow = InstalledAppFlow.from_client_config(self.client_config, SCOPES) creds = flow.run_local_server(port=0) + # Save the credentials for the next run - with open(self.token_path, "w", encoding="utf-8") as token: - token.write(creds.to_json()) + if not self._is_cloud: + with open(self.token_path, "w", encoding="utf-8") as token: + token.write(creds.to_json()) return creds diff --git a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml index fd24e91a66db56249ef88d6ae8a49ef54b21b818..8c858b15e921e16bd8902404294369105fd862d1 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml @@ -45,7 +45,7 @@ maintainers = [ ] name = "llama-index-readers-google" readme = "README.md" -version = "0.2.0" +version = "0.2.1" [tool.poetry.dependencies] python = ">=3.10,<4.0" diff --git a/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py b/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py index 07825d26623608fd7eeaa3d4a3bbe833fae8f5c1..871f4ca7ad87eff2884f7006d13013061504fde7 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py +++ b/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py @@ -1,7 +1,60 @@ +import json +from tempfile import TemporaryDirectory + from llama_index.core.readers.base import BaseReader +import pytest from llama_index.readers.google import GoogleDriveReader +test_client_config = {"client_config": {"key": "value"}} +test_authorized_user_info = {"authorized_user_info": {"key": "value"}} +test_service_account_key = {"service_account_key": {"key": "value"}} + def test_class(): names_of_base_classes = [b.__name__ for b in GoogleDriveReader.__mro__] assert BaseReader.__name__ in names_of_base_classes + + +def test_serialize(): + reader = GoogleDriveReader( + client_config=test_client_config, + authorized_user_info=test_authorized_user_info, + ) + + schema = reader.schema() + assert schema is not None + assert len(schema) > 0 + assert "client_config" in schema["properties"] + + json_reader = reader.json(exclude_unset=True) + + new_reader = GoogleDriveReader.parse_raw(json_reader) + assert new_reader.client_config == reader.client_config + assert new_reader.authorized_user_info == reader.authorized_user_info + + +def test_serialize_from_file(): + with TemporaryDirectory() as tmp_dir: + file_name = f"{tmp_dir}/test_service_account_key.json" + with open(file_name, "w") as f: + f.write(json.dumps(test_service_account_key)) + + reader = GoogleDriveReader( + service_account_key_path=file_name, + ) + + schema = reader.schema() + assert schema is not None + assert len(schema) > 0 + assert "service_account_key" in schema["properties"] + + json_reader = reader.json(exclude_unset=True) + + new_reader = GoogleDriveReader.parse_raw(json_reader) + assert new_reader.service_account_key == reader.service_account_key + assert new_reader.service_account_key == test_service_account_key + + +def test_error_on_missing_args(): + with pytest.raises(ValueError): + GoogleDriveReader()