Skip to content
Snippets Groups Projects
Unverified Commit 7e9d1c68 authored by Wayne Lau's avatar Wayne Lau Committed by GitHub
Browse files

Trafilatura kwargs and progress bar for trafilatura web reader (#13454)

parent 4adf1f33
No related branches found
No related tags found
No related merge requests found
......@@ -17,6 +17,16 @@ loader = TrafilaturaWebReader()
documents = loader.load_data(urls=["https://google.com"])
```
### Additional Parameters
You can also pass additional keyword arguments to the `load_data` function.
These are forwarded to `trafilatura.extract`, so most of them follow the original `trafilatura.extract` API. You can find more information [here](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
```python
documents = loader.load_data(urls=["https://google.com"], favor_recall=True)
```
## Examples
This loader is designed to load data into [LlamaIndex](https://github.com/run-llama/llama_index/).
......
......@@ -28,6 +28,8 @@ class TrafilaturaWebReader(BasePydanticReader):
include_images=False,
include_formatting=False,
include_links=False,
show_progress=False,
**kwargs,
) -> List[Document]:
"""Load data from the urls.
......@@ -39,6 +41,8 @@ class TrafilaturaWebReader(BasePydanticReader):
include_images (bool, optional): Include images in the output. Defaults to False.
include_formatting (bool, optional): Include formatting in the output. Defaults to False.
include_links (bool, optional): Include links in the output. Defaults to False.
show_progress (bool, optional): Show progress bar. Defaults to False.
kwargs: Additional keyword arguments for the `trafilatura.extract` function.
Returns:
List[Document]: List of documents.
......@@ -49,7 +53,14 @@ class TrafilaturaWebReader(BasePydanticReader):
if not isinstance(urls, list):
raise ValueError("urls must be a list of strings.")
documents = []
for url in urls:
if show_progress:
from tqdm import tqdm
iterator = tqdm(urls, desc="Downloading pages")
else:
iterator = urls
for url in iterator:
downloaded = trafilatura.fetch_url(url)
response = trafilatura.extract(
downloaded,
......@@ -59,6 +70,7 @@ class TrafilaturaWebReader(BasePydanticReader):
include_images=include_images,
include_formatting=include_formatting,
include_links=include_links,
**kwargs,
)
documents.append(Document(text=response, id_=url))
......
......@@ -42,7 +42,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.1.13"
version = "0.1.14"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment