From 4df8748993ec7ecfe471d599d95de7370b186db7 Mon Sep 17 00:00:00 2001 From: Lorenzo Dessimoni <lorenzo.dessimoni@gmail.com> Date: Sat, 9 Mar 2024 13:21:04 +0100 Subject: [PATCH] Create new RTF integration (#11466) --- .../llama_index/readers/file/__init__.py | 2 ++ .../llama_index/readers/file/rtf/BUILD | 1 + .../llama_index/readers/file/rtf/README.md | 19 +++++++++++ .../llama_index/readers/file/rtf/__init__.py | 3 ++ .../llama_index/readers/file/rtf/base.py | 34 +++++++++++++++++++ .../llama-index-readers-file/pyproject.toml | 4 ++- .../tests/test_rtf.py | 27 +++++++++++++++ 7 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py create mode 100644 llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py index 3378f3bee..a53d0a7d2 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py @@ -16,6 +16,7 @@ from llama_index.readers.file.tabular import PandasCSVReader, CSVReader from llama_index.readers.file.unstructured import UnstructuredReader from llama_index.readers.file.video_audio import VideoAudioReader from llama_index.readers.file.xml import XMLReader +from llama_index.readers.file.rtf import RTFReader __all__ = [ "DocxReader", @@ -39,4 +40,5 
@@ __all__ = [
     "XMLReader",
     "PagedCSVReader",
     "CSVReader",
+    "RTFReader",
 ]
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD
new file mode 100644
index 000000000..db46e8d6c
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md
new file mode 100644
index 000000000..f45c22c55
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md
@@ -0,0 +1,19 @@
+# RTF (Rich Text Format) Loader
+
+This loader strips all RTF formatting from a file and creates a Document.
+
+## Usage
+
+To use this loader, you need to pass a `Path` object or a `str` to a local file.
+
+```python
+from pathlib import Path
+from llama_index import download_loader
+
+RTFReader = download_loader("RTFReader")
+
+loader = RTFReader()
+documents = loader.load_data(input_file=Path("./example.rtf"))
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main/llama_hub) for examples.
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py
new file mode 100644
index 000000000..54c8ed193
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.readers.file.rtf.base import RTFReader
+
+__all__ = ["RTFReader"]
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py
new file mode 100644
index 000000000..772c4734a
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py
@@ -0,0 +1,34 @@
+"""RTF (Rich Text Format) reader."""
+from pathlib import Path
+from typing import List, Union, Any, Dict, Optional
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class RTFReader(BaseReader):
+    """RTF (Rich Text Format) reader. Converts an RTF file into a Document."""
+
+    def load_data(
+        self,
+        input_file: Union[Path, str],
+        extra_info: Optional[Dict[str, Any]] = None,
+        **load_kwargs: Any  # accepted but currently unused
+    ) -> List[Document]:
+        """Load data from an RTF file.
+
+        Args:
+            input_file (Path | str): Path to the RTF file.
+            extra_info (Optional[Dict[str, Any]]): Metadata for the Document.
+
+        Returns:
+            List[Document]: One Document holding the stripped plain text.
+        """
+        try:
+            from striprtf.striprtf import rtf_to_text
+        except ImportError as err:
+            raise ImportError("striprtf is required to read RTF files.") from err
+
+        with open(str(input_file)) as f:
+            text = rtf_to_text(f.read())
+        return [Document(text=text.strip(), metadata=extra_info or {})]
diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
index 983c964b9..827be3b0c 100644
--- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
@@ -30,6 +30,7 @@ PagedCSVReader = "thejessezhang"
 PandasCSVReader = "ephe-meral"
 PptxReader = "thejessezhang"
 PyMuPDFReader = "iamarunbrahma"
+RTFReader = "FunkyOz"
 UnstructuredReader = "thejessezhang"
 VideoAudioReader = "llama-index"
 XMLReader = "mmaatouk"
@@ -49,7 +50,7 @@ license = "MIT"
 maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
 name = "llama-index-readers-file"
 readme = "README.md"
-version = "0.1.8"
+version = "0.1.9"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
@@ -58,6 +59,7 @@ pymupdf = "^1.23.21"
 bs4 = "^0.0.2"
 beautifulsoup4 = "^4.12.3"
 pypdf = "^4.0.1"
+striprtf = "^0.0.26"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"
diff --git a/llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py b/llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py
new file mode 100644
index 000000000..7f8d15a20
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py
@@ -0,0 +1,27 @@
+import pytest
+from striprtf.striprtf import rtf_to_text
+
+from llama_index.readers.file.rtf import RTFReader
+
+# Sample RTF data for testing
+SAMPLE_RTF = """{\\rtf
+    Hello!\\par
+    This is a rtf file {\\b bolded}.\\par
+}"""
+
+
+# Fixture to create a temporary RTF file
+@pytest.fixture()
+def rtf_file(tmp_path):
+    """Write SAMPLE_RTF to a temporary .rtf file and return its path."""
+    target = tmp_path / "test.rtf"
+    target.write_text(SAMPLE_RTF)
+    return target
+
+
+def test_load_data_rtf(rtf_file):
+    """The reader returns one Document whose text matches striprtf's output."""
+    expected = rtf_to_text(SAMPLE_RTF).strip()
+    documents = RTFReader().load_data(rtf_file)
+    assert len(documents) == 1
+    assert expected == documents[0].text
-- 
GitLab