From 4df8748993ec7ecfe471d599d95de7370b186db7 Mon Sep 17 00:00:00 2001
From: Lorenzo Dessimoni <lorenzo.dessimoni@gmail.com>
Date: Sat, 9 Mar 2024 13:21:04 +0100
Subject: [PATCH] Create new RTF integration (#11466)

---
 .../llama_index/readers/file/__init__.py      |  2 ++
 .../llama_index/readers/file/rtf/BUILD        |  1 +
 .../llama_index/readers/file/rtf/README.md    | 19 +++++++++++
 .../llama_index/readers/file/rtf/__init__.py  |  3 ++
 .../llama_index/readers/file/rtf/base.py      | 34 +++++++++++++++++++
 .../llama-index-readers-file/pyproject.toml   |  4 ++-
 .../tests/test_rtf.py                         | 27 +++++++++++++++
 7 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD
 create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md
 create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py
 create mode 100644 llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py
 create mode 100644 llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py

diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py
index 3378f3bee..a53d0a7d2 100644
--- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/__init__.py
@@ -16,6 +16,7 @@ from llama_index.readers.file.tabular import PandasCSVReader, CSVReader
 from llama_index.readers.file.unstructured import UnstructuredReader
 from llama_index.readers.file.video_audio import VideoAudioReader
 from llama_index.readers.file.xml import XMLReader
+from llama_index.readers.file.rtf import RTFReader
 
 __all__ = [
     "DocxReader",
@@ -39,4 +40,5 @@ __all__ = [
     "XMLReader",
     "PagedCSVReader",
     "CSVReader",
+    "RTFReader",
 ]
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD
new file mode 100644
index 000000000..db46e8d6c
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md
new file mode 100644
index 000000000..f45c22c55
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/README.md
@@ -0,0 +1,19 @@
+# RTF (Rich Text Format) Loader
+
+This loader strips all RTF formatting from a file and creates a Document.
+
+## Usage
+
+To use this loader, you need to pass a `Path` object or a `str` to a local file.
+
+```python
+from pathlib import Path
+
+# RTFReader is exported by the llama-index-readers-file package.
+from llama_index.readers.file import RTFReader
+
+loader = RTFReader()
+documents = loader.load_data(input_file=Path("./example.rtf"))
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main/llama_hub) for examples.
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py
new file mode 100644
index 000000000..54c8ed193
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.readers.file.rtf.base import RTFReader
+
+__all__ = ["RTFReader"]
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py
new file mode 100644
index 000000000..772c4734a
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/rtf/base.py
@@ -0,0 +1,34 @@
+"""RTF (Rich Text Format) reader."""
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class RTFReader(BaseReader):
+    """RTF (Rich Text Format) Reader. Reads rtf file and convert to Document."""
+
+    def load_data(
+        self,
+        input_file: Union[Path, str],
+        extra_info=Dict[str, Any],
+        **load_kwargs: Any
+    ) -> List[Document]:
+        """Load data from RTF file.
+
+        Args:
+            input_file (Path | str): Path for the RTF file.
+            extra_info (Dict[str, Any]): Path for the RTF file.
+
+        Returns:
+            List[Document]: List of documents.
+        """
+        try:
+            from striprtf.striprtf import rtf_to_text
+        except ImportError:
+            raise ImportError("striprtf is required to read RTF files.")
+
+        with open(str(input_file)) as f:
+            text = rtf_to_text(f.read())
+            return [Document(text=text.strip())]
diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
index 983c964b9..827be3b0c 100644
--- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
@@ -30,6 +30,7 @@ PagedCSVReader = "thejessezhang"
 PandasCSVReader = "ephe-meral"
 PptxReader = "thejessezhang"
 PyMuPDFReader = "iamarunbrahma"
+RTFReader = "FunkyOz"
 UnstructuredReader = "thejessezhang"
 VideoAudioReader = "llama-index"
 XMLReader = "mmaatouk"
@@ -49,7 +50,7 @@ license = "MIT"
 maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
 name = "llama-index-readers-file"
 readme = "README.md"
-version = "0.1.8"
+version = "0.1.9"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
@@ -58,6 +59,7 @@ pymupdf = "^1.23.21"
 bs4 = "^0.0.2"
 beautifulsoup4 = "^4.12.3"
 pypdf = "^4.0.1"
+striprtf = "^0.0.26"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"
diff --git a/llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py b/llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py
new file mode 100644
index 000000000..7f8d15a20
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-file/tests/test_rtf.py
@@ -0,0 +1,27 @@
+import pytest
+from striprtf.striprtf import rtf_to_text
+
+from llama_index.readers.file.rtf import RTFReader
+
+# Sample RTF data for testing: two paragraphs, the second with a bold group
+SAMPLE_RTF = """{\\rtf
+    Hello!\\par
+    This is a rtf file {\\b bolded}.\\par
+}"""
+
+
+# Fixture to create a temporary RTF file
+@pytest.fixture()
+def rtf_file(tmp_path):
+    file = tmp_path / "test.rtf"
+    with open(file, "w") as f:
+        f.write(SAMPLE_RTF)
+    return file
+
+
+def test_load_data_rtf(rtf_file):
+    reader = RTFReader()
+    text = rtf_to_text(SAMPLE_RTF).strip()
+    documents = reader.load_data(rtf_file)
+    assert len(documents) == 1
+    assert text == documents[0].text
-- 
GitLab