From 54edec04a01ad43a2d092c6b8daae7e5069517d0 Mon Sep 17 00:00:00 2001
From: Emanuel Ferreira <contatoferreirads@gmail.com>
Date: Mon, 11 Nov 2024 19:22:52 -0300
Subject: [PATCH] feat: add relative file path google drive (#16907)

---
 .../llama_index/readers/google/drive/base.py  | 89 +++++++++++++++++--
 .../llama-index-readers-google/pyproject.toml |  2 +-
 .../tests/test_readers_google_drive.py        | 37 ++++++++
 3 files changed, 121 insertions(+), 7 deletions(-)

diff --git a/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py b/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py
index 58f7632f42..db0e647d96 100644
--- a/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/drive/base.py
@@ -193,6 +193,56 @@ class GoogleDriveReader(
     def _get_drive_link(self, file_id: str) -> str:
         return f"https://drive.google.com/file/d/{file_id}/view"
 
+    def _get_relative_path(
+        self, service, file_id: str, root_folder_id: Optional[str] = None
+    ) -> str:
+        """Get the relative path from root_folder_id to file_id.
+
+        Best effort: returns only the file name when root_folder_id is not
+        given, a partial path when a parent folder cannot be read, and
+        file_id itself when the file's own metadata cannot be fetched.
+
+        Args:
+            service: Google Drive v3 service object.
+            file_id (str): ID of the file whose path is wanted.
+            root_folder_id (Optional[str]): Folder ID at which to stop.
+
+        Returns:
+            str: "/"-joined folder names ending with the file name.
+        """
+        # Same request options for the file and every ancestor lookup.
+        get_kwargs = {"supportsAllDrives": True, "fields": "name, parents"}
+        try:
+            file = service.files().get(fileId=file_id, **get_kwargs).execute()
+            file_name = file["name"]
+        except Exception as e:
+            # No name is known at this point, so the only identifier left to
+            # return is the ID; `file` may not even be bound here.
+            logger.warning(f"Could not get path for file {file_id}: {e}")
+            return file_id
+
+        if not root_folder_id:
+            return file_name
+
+        path_parts = [file_name]
+        # "parents" can be absent or empty, so guard before indexing.
+        current_parent = (file.get("parents") or [None])[0]
+        # Walk up until root_folder_id is reached or a parent is unreadable.
+        while current_parent and current_parent != root_folder_id:
+            try:
+                parent = (
+                    service.files()
+                    .get(fileId=current_parent, **get_kwargs)
+                    .execute()
+                )
+                path_parts.insert(0, parent["name"])
+                current_parent = (parent.get("parents") or [None])[0]
+            except Exception as e:
+                logger.debug(f"Stopped at parent {current_parent}: {e!s}")
+                break
+
+        return "/".join(path_parts)
+
     def _get_fileids_meta(
         self,
         drive_id: Optional[str] = None,
@@ -200,6 +250,7 @@ class GoogleDriveReader(
         file_id: Optional[str] = None,
         mime_types: Optional[List[str]] = None,
         query_string: Optional[str] = None,
+        current_path: Optional[str] = None,
     ) -> List[List[str]]:
         """Get file ids present in folder/ file id
         Args:
@@ -217,7 +268,22 @@ class GoogleDriveReader(
         try:
             service = build("drive", "v3", credentials=self._creds)
             fileids_meta = []
-            if folder_id:
+
+            if folder_id and not file_id:
+                try:
+                    folder = (
+                        service.files()
+                        .get(fileId=folder_id, supportsAllDrives=True, fields="name")
+                        .execute()
+                    )
+                    current_path = (
+                        f"{current_path}/{folder['name']}"
+                        if current_path
+                        else folder["name"]
+                    )
+                except Exception as e:
+                    logger.warning(f"Could not get folder name: {e}")
+
                 folder_mime_type = "application/vnd.google-apps.folder"
                 query = "('" + folder_id + "' in parents)"
 
@@ -273,6 +339,12 @@ class GoogleDriveReader(
                         break
 
                 for item in items:
+                    item_path = (
+                        f"{current_path}/{item['name']}"
+                        if current_path
+                        else item["name"]
+                    )
+
                     if item["mimeType"] == folder_mime_type:
                         if drive_id:
                             fileids_meta.extend(
@@ -281,6 +353,7 @@ class GoogleDriveReader(
                                     folder_id=item["id"],
                                     mime_types=mime_types,
                                     query_string=query_string,
+                                    current_path=current_path,
                                 )
                             )
                         else:
@@ -289,6 +362,7 @@ class GoogleDriveReader(
                                     folder_id=item["id"],
                                     mime_types=mime_types,
                                     query_string=query_string,
+                                    current_path=current_path,
                                 )
                             )
                     else:
@@ -299,12 +373,11 @@ class GoogleDriveReader(
                             if not is_shared_drive
                             else "Shared Drive"
                         )
-
                         fileids_meta.append(
                             (
                                 item["id"],
                                 author,
-                                item["name"],
+                                item_path,
                                 item["mimeType"],
                                 item["createdTime"],
                                 item["modifiedTime"],
@@ -319,7 +392,6 @@ class GoogleDriveReader(
                     .execute()
                 )
                 # Get metadata of the file
-                # Check if file doesn't belong to a Shared Drive. "owners" doesn't exist in a Shared Drive
                 is_shared_drive = "driveId" in file
                 author = (
                     file["owners"][0]["displayName"]
@@ -327,11 +399,16 @@ class GoogleDriveReader(
                     else "Shared Drive"
                 )
 
+                # Get the full file path
+                file_path = self._get_relative_path(
+                    service, file_id, folder_id or self.folder_id
+                )
+
                 fileids_meta.append(
                     (
                         file["id"],
                         author,
-                        file["name"],
+                        file_path,
                         file["mimeType"],
                         file["createdTime"],
                         file["modifiedTime"],
@@ -423,7 +500,7 @@ class GoogleDriveReader(
                     metadata[final_filepath] = {
                         "file id": fileid_meta[0],
                         "author": fileid_meta[1],
-                        "file name": fileid_meta[2],
+                        "file path": fileid_meta[2],
                         "mime type": fileid_meta[3],
                         "created at": fileid_meta[4],
                         "modified at": fileid_meta[5],
diff --git a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml
index 2cce97748e..4b9f2b9493 100644
--- a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml
@@ -47,7 +47,7 @@ maintainers = [
 ]
 name = "llama-index-readers-google"
 readme = "README.md"
-version = "0.4.2"
+version = "0.4.3"
 
 [tool.poetry.dependencies]
 python = ">=3.10,<4.0"
diff --git a/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py b/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py
index 9be7af1e76..27c8d59678 100644
--- a/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py
+++ b/llama-index-integrations/readers/llama-index-readers-google/tests/test_readers_google_drive.py
@@ -120,3 +120,40 @@ class TestGoogleDriveReader(unittest.TestCase):
         mock_credentials.to_json.assert_not_called()
         assert result == mock_credentials
         assert os.path.exists(reader.token_path) is False
+
+    def test_get_relative_path(self):
+        # Mock the necessary objects and methods
+        mock_credentials = MagicMock()
+        mock_service = MagicMock()
+        # Undo the class-level patch after the test so it cannot leak.
+        original = GoogleDriveReader._get_credentials
+        self.addCleanup(setattr, GoogleDriveReader, "_get_credentials", original)
+        GoogleDriveReader._get_credentials = MagicMock(return_value=mock_credentials)
+
+        reader = GoogleDriveReader(
+            client_config={
+                "client_id": "example_client_id",
+                "client_secret": "example_client_secret",
+            },
+        )
+
+        # Test case 1: Simple file without root_folder_id
+        file_id = "example_file_id"
+        mock_file_response = {"name": "test_file", "parents": ["parent_id"]}
+        mock_service.files().get().execute.return_value = mock_file_response
+
+        result = reader._get_relative_path(mock_service, file_id)
+        assert result == "test_file"
+
+        # Test case 2: File with path traversal to root_folder_id
+        root_folder_id = "root_folder_id"
+        mock_service.files().get().execute.side_effect = [
+            {"name": "test_file", "parents": ["parent1_id"]},  # File
+            {"name": "parent1", "parents": ["parent2_id"]},  # Parent 1
+            {"name": "parent2", "parents": ["root_folder_id"]},  # Parent 2
+        ]
+        result = reader._get_relative_path(mock_service, file_id, root_folder_id)
+        assert result == "parent2/parent1/test_file"
+
+        # One lookup for case 1 plus file and two parents for case 2
+        assert mock_service.files().get().execute.call_count == 4
-- 
GitLab