def _get_relative_path(
    self, service, file_id: str, root_folder_id: Optional[str] = None
) -> str:
    """Return the "/"-joined path of a file relative to a root folder.

    The file's metadata is fetched first. When ``root_folder_id`` is given,
    the first entry of each item's ``parents`` list is followed upwards and
    every ancestor name is added to the path, until ``root_folder_id`` is
    reached, an item has no parent, or a parent cannot be fetched.

    Args:
        service: Drive API client exposing ``files().get(...).execute()``.
        file_id: ID of the file whose path is wanted.
        root_folder_id: ID of the folder the path is relative to. The name
            of this folder itself is not part of the result. When falsy,
            only the file name is returned.

    Returns:
        The relative path, e.g. ``"parent2/parent1/name"``. If the ancestor
        walk stops early, the path collected so far is returned. If the
        file's own metadata cannot be fetched, ``file_id`` is returned,
        because no name is available to fall back on.
    """

    def fetch(item_id: str):
        # Only the name and the parent ids are needed to build the path.
        return (
            service.files()
            .get(fileId=item_id, supportsAllDrives=True, fields="name, parents")
            .execute()
        )

    try:
        file = fetch(file_id)
        file_name = file["name"]
    except Exception as e:
        # Nothing was fetched, so there is no file name to return here;
        # the id is the only identifier left.
        logger.warning(f"Could not get path for file {file_id}: {e}")
        return file_id

    if not root_folder_id:
        return file_name

    # Collected leaf-first and reversed once at the end.
    path_parts = [file_name]
    try:
        # "or [None]" also covers an explicit empty parents list.
        current_parent = (file.get("parents") or [None])[0]
        while current_parent and current_parent != root_folder_id:
            try:
                parent = fetch(current_parent)
                path_parts.append(parent["name"])
                current_parent = (parent.get("parents") or [None])[0]
            except Exception as e:
                # Best effort: keep whatever part of the path was resolved.
                logger.debug(f"Stopped at parent {current_parent}: {e!s}")
                break
    except Exception as e:
        logger.debug(f"Could not access parents for {file_id}: {e!s}")

    return "/".join(reversed(path_parts))
class GoogleDriveReader( file_id: Optional[str] = None, mime_types: Optional[List[str]] = None, query_string: Optional[str] = None, + current_path: Optional[str] = None, ) -> List[List[str]]: """Get file ids present in folder/ file id Args: @@ -217,7 +268,22 @@ class GoogleDriveReader( try: service = build("drive", "v3", credentials=self._creds) fileids_meta = [] - if folder_id: + + if folder_id and not file_id: + try: + folder = ( + service.files() + .get(fileId=folder_id, supportsAllDrives=True, fields="name") + .execute() + ) + current_path = ( + f"{current_path}/{folder['name']}" + if current_path + else folder["name"] + ) + except Exception as e: + logger.warning(f"Could not get folder name: {e}") + folder_mime_type = "application/vnd.google-apps.folder" query = "('" + folder_id + "' in parents)" @@ -273,6 +339,12 @@ class GoogleDriveReader( break for item in items: + item_path = ( + f"{current_path}/{item['name']}" + if current_path + else item["name"] + ) + if item["mimeType"] == folder_mime_type: if drive_id: fileids_meta.extend( @@ -281,6 +353,7 @@ class GoogleDriveReader( folder_id=item["id"], mime_types=mime_types, query_string=query_string, + current_path=current_path, ) ) else: @@ -289,6 +362,7 @@ class GoogleDriveReader( folder_id=item["id"], mime_types=mime_types, query_string=query_string, + current_path=current_path, ) ) else: @@ -299,12 +373,11 @@ class GoogleDriveReader( if not is_shared_drive else "Shared Drive" ) - fileids_meta.append( ( item["id"], author, - item["name"], + item_path, item["mimeType"], item["createdTime"], item["modifiedTime"], @@ -319,7 +392,6 @@ class GoogleDriveReader( .execute() ) # Get metadata of the file - # Check if file doesn't belong to a Shared Drive. 
"owners" doesn't exist in a Shared Drive is_shared_drive = "driveId" in file author = ( file["owners"][0]["displayName"] @@ -327,11 +399,16 @@ class GoogleDriveReader( else "Shared Drive" ) + # Get the full file path + file_path = self._get_relative_path( + service, file_id, folder_id or self.folder_id + ) + fileids_meta.append( ( file["id"], author, - file["name"], + file_path, file["mimeType"], file["createdTime"], file["modifiedTime"], @@ -423,7 +500,7 @@ class GoogleDriveReader( metadata[final_filepath] = { "file id": fileid_meta[0], "author": fileid_meta[1], - "file name": fileid_meta[2], + "file path": fileid_meta[2], "mime type": fileid_meta[3], "created at": fileid_meta[4], "modified at": fileid_meta[5], diff --git a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml index 2cce97748e1418d87e9d5ebb2b837aa81fc11267..4b9f2b9493d09905e38c37265b8235cfd1d2ad1f 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml @@ -47,7 +47,7 @@ maintainers = [ ] name = "llama-index-readers-google" readme = "README.md" -version = "0.4.2" +version = "0.4.3" [tool.poetry.dependencies] python = ">=3.10,<4.0" diff --git a/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py b/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py index 9be7af1e7667a6544d2c6416aacd9ccd6686716b..27c8d5967883c665fc3424c6ef999032ab203267 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py +++ b/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py @@ -120,3 +120,40 @@ class TestGoogleDriveReader(unittest.TestCase): mock_credentials.to_json.assert_not_called() assert result == mock_credentials assert 
def test_get_relative_path(self):
    """Check ``_get_relative_path`` with and without a root folder."""
    from unittest.mock import patch

    mock_credentials = MagicMock()
    mock_service = MagicMock()
    # Reach the request mocks through ``return_value`` so that configuring
    # them does not itself register calls on ``files().get``.
    mock_get = mock_service.files.return_value.get
    mock_execute = mock_get.return_value.execute

    # patch.object restores the real ``_get_credentials`` when the block
    # exits, so the stub cannot leak into other tests.
    with patch.object(
        GoogleDriveReader, "_get_credentials", return_value=mock_credentials
    ):
        reader = GoogleDriveReader(
            client_config={
                "client_id": "example_client_id",
                "client_secret": "example_client_secret",
            },
        )

        # Test case 1: Simple file without root_folder_id
        file_id = "example_file_id"
        mock_execute.return_value = {"name": "test_file", "parents": ["parent_id"]}

        result = reader._get_relative_path(mock_service, file_id)
        assert result == "test_file"
        mock_get.assert_called_once_with(
            fileId=file_id, supportsAllDrives=True, fields="name, parents"
        )

        # Test case 2: File with path traversal to root_folder_id
        root_folder_id = "root_folder_id"
        mock_get.reset_mock()  # clears recorded calls, keeps configuration
        mock_execute.side_effect = [
            {"name": "test_file", "parents": ["parent1_id"]},  # File
            {"name": "parent1", "parents": ["parent2_id"]},  # Parent 1
            {"name": "parent2", "parents": ["root_folder_id"]},  # Parent 2
        ]

        result = reader._get_relative_path(mock_service, file_id, root_folder_id)
        assert result == "parent2/parent1/test_file"

        # Verify API calls: the file, then each ancestor below the root;
        # the root folder itself is never fetched.
        requested_ids = [c.kwargs["fileId"] for c in mock_get.call_args_list]
        assert requested_ids == [file_id, "parent1_id", "parent2_id"]