From 95e107423664812eeece1af0f162c9dcd4bfe670 Mon Sep 17 00:00:00 2001
From: David Chiu <david20571015@gmail.com>
Date: Mon, 30 Oct 2023 01:45:03 -0500
Subject: [PATCH] Lazy data loading for `readers` (#8509)

* feat: lazy data loading

* feat: `lazy_load_data` of mongodb

---------

Co-authored-by: Simon Suo <simonsdsuo@gmail.com>
---
 CONTRIBUTING.md              | 11 ++++++++---
 llama_index/readers/base.py  | 14 ++++++++++----
 llama_index/readers/mongo.py | 18 +++++++-----------
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5bf966409c..0395050177 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -14,7 +14,7 @@ The best part of LlamaIndex is our community of users and contributors.
 4. 🧪 Add experimental features
 5. 📄 Improve code quality & documentation
 
-Also, join our Discord for ideas and discussions: https://discord.gg/dGcwcsnxhU.
+Also, join our Discord for ideas and discussions: <https://discord.gg/dGcwcsnxhU>.
 
 ### 1. 🆕 Extend Core Modules
 
@@ -38,7 +38,12 @@ Below, we will describe what each module does, give a high-level idea of the int
 
 A data loader ingests data of any format from anywhere into `Document` objects, which can then be parsed and indexed.
 
-**Interface**: `load_data` takes arbitrary arguments as input (e.g. path to data), and outputs a sequence of `Document` objects.
+**Interface**:
+
+- `load_data` takes arbitrary arguments as input (e.g. path to data), and outputs a sequence of `Document` objects.
+- `lazy_load_data` takes arbitrary arguments as input (e.g. path to data), and outputs an iterable of `Document` objects. It is the lazy counterpart of `load_data`: documents can be produced on demand instead of all at once, which is useful for large datasets.
+
+> **Note**: If a reader implements only `lazy_load_data`, the default `load_data` delegates to it and returns the resulting documents as a list.
 
 **Examples**:
 
@@ -328,7 +333,7 @@ make test
 For changes that involve entirely new features, it may be worth adding an example Jupyter notebook to showcase
 this feature.
 
-Example notebooks can be found in this folder: https://github.com/jerryjliu/llama_index/tree/main/examples.
+Example notebooks can be found in this folder: <https://github.com/jerryjliu/llama_index/tree/main/examples>.
 
 ### Creating a pull request
 
diff --git a/llama_index/readers/base.py b/llama_index/readers/base.py
index 3d0944eb1d..7fe1fd795e 100644
--- a/llama_index/readers/base.py
+++ b/llama_index/readers/base.py
@@ -1,18 +1,24 @@
 """Base reader class."""
-from abc import abstractmethod
-from typing import Any, Dict, List
+from abc import ABC
+from typing import Any, Dict, Iterable, List
 
 from llama_index.bridge.langchain import Document as LCDocument
 from llama_index.bridge.pydantic import Field
 from llama_index.schema import BaseComponent, Document
 
 
-class BaseReader:
+class BaseReader(ABC):
     """Utilities for loading data from a directory."""
 
-    @abstractmethod
+    def lazy_load_data(self, *args: Any, **load_kwargs: Any) -> Iterable[Document]:
+        """Load data from the input directory lazily."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not provide lazy_load_data method currently"
+        )
+
     def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
         """Load data from the input directory."""
+        return list(self.lazy_load_data(*args, **load_kwargs))
 
     def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
         """Load data in LangChain document format."""
diff --git a/llama_index/readers/mongo.py b/llama_index/readers/mongo.py
index f86bc13fc5..557597b8c3 100644
--- a/llama_index/readers/mongo.py
+++ b/llama_index/readers/mongo.py
@@ -1,6 +1,6 @@
 """Mongo client."""
 
-from typing import Dict, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Union
 
 from llama_index.readers.base import BaseReader
 from llama_index.schema import Document
@@ -27,12 +27,11 @@ class SimpleMongoReader(BaseReader):
     ) -> None:
         """Initialize with parameters."""
         try:
-            import pymongo
             from pymongo import MongoClient
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "`pymongo` package not found, please run `pip install pymongo`"
-            )
+            ) from err
 
         client: MongoClient
         if uri:
@@ -51,7 +50,7 @@ class SimpleMongoReader(BaseReader):
             result += text if isinstance(text, list) else [text]
         return result
 
-    def load_data(
+    def lazy_load_data(
         self,
         db_name: str,
         collection_name: str,
@@ -59,7 +58,7 @@ class SimpleMongoReader(BaseReader):
         separator: str = "",
         query_dict: Optional[Dict] = None,
         metadata_names: Optional[List[str]] = None,
-    ) -> List[Document]:
+    ) -> Iterable[Document]:
         """Load data from the input directory.
 
         Args:
@@ -78,7 +77,6 @@ class SimpleMongoReader(BaseReader):
             List[Document]: A list of documents.
 
         """
-        documents = []
         db = self.client[db_name]
         cursor = db[collection_name].find(filter=query_dict or {}, limit=self.max_docs)
 
@@ -94,7 +92,7 @@ class SimpleMongoReader(BaseReader):
             text = separator.join(texts)
 
             if metadata_names is None:
-                documents.append(Document(text=text))
+                yield Document(text=text)
             else:
                 try:
                     metadata = {name: item[name] for name in metadata_names}
@@ -102,6 +100,4 @@ class SimpleMongoReader(BaseReader):
                     raise ValueError(
                         f"{err.args[0]} field not found in Mongo document."
                     ) from err
-                documents.append(Document(text=text, metadata=metadata))
-
-        return documents
+                yield Document(text=text, metadata=metadata)
-- 
GitLab