From 95e107423664812eeece1af0f162c9dcd4bfe670 Mon Sep 17 00:00:00 2001 From: David Chiu <david20571015@gmail.com> Date: Mon, 30 Oct 2023 01:45:03 -0500 Subject: [PATCH] Lazy data loading for `readers` (#8509) * feat: lazy data loading * feat: `lazy_load_data` of mongodb --------- Co-authored-by: Simon Suo <simonsdsuo@gmail.com> --- CONTRIBUTING.md | 11 ++++++++--- llama_index/readers/base.py | 14 ++++++++++---- llama_index/readers/mongo.py | 18 +++++++----------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5bf966409c..0395050177 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,7 +14,7 @@ The best part of LlamaIndex is our community of users and contributors. 4. 🧪 Add experimental features 5. 📄 Improve code quality & documentation -Also, join our Discord for ideas and discussions: https://discord.gg/dGcwcsnxhU. +Also, join our Discord for ideas and discussions: <https://discord.gg/dGcwcsnxhU>. ### 1. 🆕 Extend Core Modules @@ -38,7 +38,12 @@ Below, we will describe what each module does, give a high-level idea of the int A data loader ingests data of any format from anywhere into `Document` objects, which can then be parsed and indexed. -**Interface**: `load_data` takes arbitrary arguments as input (e.g. path to data), and outputs a sequence of `Document` objects. +**Interface**: + +- `load_data` takes arbitrary arguments as input (e.g. path to data), and outputs a sequence of `Document` objects. +- `lazy_load_data` takes arbitrary arguments as input (e.g. path to data), and outputs an iterable object of `Document` objects. This is a lazy version of `load_data`, which is useful for large datasets. + +> **Note**: If only `lazy_load_data` is implemented, `load_data` will be delegated to it. **Examples**: @@ -328,7 +333,7 @@ make test For changes that involve entirely new features, it may be worth adding an example Jupyter notebook to showcase this feature. -Example notebooks can be found in this folder: https://github.com/jerryjliu/llama_index/tree/main/examples. +Example notebooks can be found in this folder: <https://github.com/jerryjliu/llama_index/tree/main/examples>. ### Creating a pull request diff --git a/llama_index/readers/base.py b/llama_index/readers/base.py index 3d0944eb1d..7fe1fd795e 100644 --- a/llama_index/readers/base.py +++ b/llama_index/readers/base.py @@ -1,18 +1,24 @@ """Base reader class.""" -from abc import abstractmethod -from typing import Any, Dict, List +from abc import ABC +from typing import Any, Dict, Iterable, List from llama_index.bridge.langchain import Document as LCDocument from llama_index.bridge.pydantic import Field from llama_index.schema import BaseComponent, Document -class BaseReader: +class BaseReader(ABC): """Utilities for loading data from a directory.""" - @abstractmethod + def lazy_load_data(self, *args: Any, **load_kwargs: Any) -> Iterable[Document]: + """Load data from the input directory lazily.""" + raise NotImplementedError( + f"{self.__class__.__name__} does not provide lazy_load_data method currently" + ) + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: """Load data from the input directory.""" + return list(self.lazy_load_data(*args, **load_kwargs)) def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: """Load data in LangChain document format.""" diff --git a/llama_index/readers/mongo.py b/llama_index/readers/mongo.py index f86bc13fc5..557597b8c3 100644 --- a/llama_index/readers/mongo.py +++ b/llama_index/readers/mongo.py @@ -1,6 +1,6 @@ """Mongo client.""" -from typing import Dict, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Union from llama_index.readers.base import BaseReader from llama_index.schema import Document @@ -27,12 +27,11 @@ class SimpleMongoReader(BaseReader): ) -> None: """Initialize with parameters.""" try: - import pymongo from pymongo import MongoClient - except ImportError: + except ImportError as err: raise ImportError( "`pymongo` package not found, please run `pip install pymongo`" - ) + ) from err client: MongoClient if uri: @@ -51,7 +50,7 @@ class SimpleMongoReader(BaseReader): result += text if isinstance(text, list) else [text] return result - def load_data( + def lazy_load_data( self, db_name: str, collection_name: str, @@ -59,7 +58,7 @@ class SimpleMongoReader(BaseReader): separator: str = "", query_dict: Optional[Dict] = None, metadata_names: Optional[List[str]] = None, - ) -> List[Document]: + ) -> Iterable[Document]: """Load data from the input directory. Args: @@ -78,7 +77,6 @@ class SimpleMongoReader(BaseReader): List[Document]: A list of documents. """ - documents = [] db = self.client[db_name] cursor = db[collection_name].find(filter=query_dict or {}, limit=self.max_docs) @@ -94,7 +92,7 @@ class SimpleMongoReader(BaseReader): text = separator.join(texts) if metadata_names is None: - documents.append(Document(text=text)) + yield Document(text=text) else: try: metadata = {name: item[name] for name in metadata_names} @@ -102,6 +100,4 @@ class SimpleMongoReader(BaseReader): raise ValueError( f"{err.args[0]} field not found in Mongo document." ) from err - documents.append(Document(text=text, metadata=metadata)) - - return documents + yield Document(text=text, metadata=metadata) -- GitLab