Trafilatura kwargs and progress bar for trafilatura web reader (#13454)

7e9d1c68 · Wayne Lau · GitHub · 4adf1f33 · 7e9d1c68 · 7e9d1c68
Unverified Commit 7e9d1c68 authored 11 months ago by Wayne Lau Committed by GitHub 11 months ago
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/trafilatura_web/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/trafilatura_web/README.md
@@ -17,6 +17,16 @@ loader = TrafilaturaWebReader()
 documents = loader.load_data(urls=["https://google.com"])
 ```

+### Additional Parameters
+
+You can also pass in additional parameters to the `load_data` function.
+
+Most of these parameters follow the original `trafilatura.extract` API, and extra keyword arguments are passed through to it. You can find more information [here](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
+
+```python
+documents = loader.load_data(urls=["https://google.com"], favor_recall=True)
+```
+
 ## Examples

 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/).

--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/trafilatura_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/trafilatura_web/base.py
@@ -28,6 +28,8 @@ class TrafilaturaWebReader(BasePydanticReader):
        include_images=False,
        include_formatting=False,
        include_links=False,
+        show_progress=False,
+        **kwargs,
    ) -> List[Document]:
        """Load data from the urls.

@@ -39,6 +41,8 @@ class TrafilaturaWebReader(BasePydanticReader):
            include_images (bool, optional): Include images in the output. Defaults to False.
            include_formatting (bool, optional): Include formatting in the output. Defaults to False.
            include_links (bool, optional): Include links in the output. Defaults to False.
+            show_progress (bool, optional): Show a progress bar. Defaults to False.
+            kwargs: Additional keyword arguments for the `trafilatura.extract` function.

        Returns:
            List[Document]: List of documents.
@@ -49,7 +53,14 @@ class TrafilaturaWebReader(BasePydanticReader):
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
-        for url in urls:
+
+        if show_progress:
+            from tqdm import tqdm
+
+            iterator = tqdm(urls, desc="Downloading pages")
+        else:
+            iterator = urls
+        for url in iterator:
            downloaded = trafilatura.fetch_url(url)
            response = trafilatura.extract(
                downloaded,
@@ -59,6 +70,7 @@ class TrafilaturaWebReader(BasePydanticReader):
                include_images=include_images,
                include_formatting=include_formatting,
                include_links=include_links,
+                **kwargs,
            )
            documents.append(Document(text=response, id_=url))


--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -42,7 +42,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.1.13"
+version = "0.1.14"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"