diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py index 3378f3beeb5386c6742338651a91e9136544c3d4..a53d0a7d2b75948ee3742877e1ceb10164f33336 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py @@ -16,6 +16,7 @@ from llama_index.readers.file.tabular import PandasCSVReader, CSVReader from llama_index.readers.file.unstructured import UnstructuredReader from llama_index.readers.file.video_audio import VideoAudioReader from llama_index.readers.file.xml import XMLReader +from llama_index.readers.file.rtf import RTFReader __all__ = [ "DocxReader", @@ -39,4 +40,5 @@ __all__ = [ "XMLReader", "PagedCSVReader", "CSVReader", + "RTFReader", ] diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f45c22c55b09604fca288576247a4b9015f370cb --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md @@ -0,0 +1,19 @@ +# RTF (Rich Text Format) Loader + +This loader strips all RTF formatting from a file and creates a Document. 
class RTFReader(BaseReader):
    """RTF (Rich Text Format) reader.

    Strips the RTF markup from a file and wraps the remaining plain text in a
    single Document.
    """

    def load_data(
        self,
        input_file: Union[Path, str],
        extra_info: Union[Dict[str, Any], None] = None,
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load data from an RTF file.

        Args:
            input_file (Path | str): Path to the RTF file.
            extra_info (Dict[str, Any] | None): Metadata to attach to the
                returned document. Defaults to None, which yields empty
                metadata.
            **load_kwargs (Any): Accepted for call compatibility; not used.

        Returns:
            List[Document]: A one-element list holding the stripped text.

        Raises:
            ImportError: If the ``striprtf`` package is not installed.
        """
        # Imported inside the method so a missing striprtf surfaces as a
        # clear ImportError at call time; the original cause is chained.
        try:
            from striprtf.striprtf import rtf_to_text
        except ImportError as err:
            raise ImportError("striprtf is required to read RTF files.") from err

        with open(str(input_file)) as f:
            text = rtf_to_text(f.read())

        # Forward caller-supplied metadata instead of silently dropping it.
        return [Document(text=text.strip(), metadata=extra_info or {})]
import pytest
from striprtf.striprtf import rtf_to_text

from llama_index.readers.file.rtf import RTFReader

# Minimal RTF document used as the reader's input in the test below.
SAMPLE_RTF = """{\\rtf
 Hello!\\par
 This is a rtf file {\\b bolded}.\\par
}"""


@pytest.fixture()
def rtf_file(tmp_path):
    """Write SAMPLE_RTF to a temporary ``test.rtf`` file and return its path."""
    target = tmp_path / "test.rtf"
    target.write_text(SAMPLE_RTF)
    return target


def test_load_data_rtf(rtf_file):
    """The reader yields one Document whose text equals striprtf's output."""
    expected = rtf_to_text(SAMPLE_RTF).strip()
    docs = RTFReader().load_data(rtf_file)
    assert len(docs) == 1
    assert expected == docs[0].text