From 35fc3b992cba2545d81b38281a76ccb3e216284a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?crypticG=C3=B8=C3=B8se?= <christogoosen@gmail.com> Date: Fri, 15 Mar 2024 18:06:03 +0200 Subject: [PATCH] Wholesitereader pass driver arg (#11861) * Add configurable chrome driver that allows for more options. * Update base.py Update to optional * from typing import Optional missing dependency * Update pyproject.toml Update version of package * Update CHANGELOG.md Update changelog * Add docs to configurable chrome driver * Fix ruff formatting issue * Update README.md Remove double import --- .../llama-index-readers-web/CHANGELOG.md | 4 +++ .../readers/web/whole_site/README.md | 32 +++++++++++++++++++ .../readers/web/whole_site/base.py | 11 +++++-- .../llama-index-readers-web/pyproject.toml | 2 +- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md index 36bff877a..7c1c8dead 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md +++ b/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md @@ -3,3 +3,7 @@ ## [0.1.2] - 2024-02-13 - Add maintainers and keywords from library.json (llamahub) + +## [0.1.7] - 2024-03-12 + +- Add option to WholeSiteReader to pass chrome driver with own options to class `__init__` diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md index 7a758d467..2ae94d7d5 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md @@ -12,10 +12,12 @@ The WholeSiteReader is a sophisticated web scraping tool that employs a breadth- - **Depth Control:** 
Limits scraping to a specified depth within a site's structure. - **URL Prefix Focus:** Targets scraping efforts to specific subsections of a site based on URL prefixes. - **Selenium-Based:** Leverages Selenium for dynamic interaction with web pages, supporting JavaScript-rendered content. +- **Configurable Chrome Driver:** Optionally pass your own Selenium Chrome driver, configured with custom options, instead of the default one. ```python from llama_index.readers.web import WholeSiteReader + # Initialize the scraper with a prefix URL and maximum depth scraper = WholeSiteReader( prefix="https://www.paulgraham.com/", max_depth=10 # Example prefix @@ -27,6 +29,36 @@ documents = scraper.load_data( ) # Example base URL ``` +Configure with chromedriver options: + +```python +try: + import chromedriver_autoinstaller +except ImportError: + raise ImportError("Please install chromedriver_autoinstaller") +from llama_index.readers.web import WholeSiteReader +from selenium import webdriver + +options = webdriver.ChromeOptions() +options.binary_location = "/usr/bin/google-chrome" +options.add_argument("--start-maximized") +options.add_argument("--headless") +chromedriver_autoinstaller.install() +driver = webdriver.Chrome(options=options) + +# Initialize the scraper with a prefix URL and maximum depth +scraper = WholeSiteReader( + prefix="https://www.paulgraham.com/", # Example prefix + max_depth=10, # Maximum crawl depth + driver=driver, # Your custom driver with correct options +) + +# Start scraping from a base URL +documents = scraper.load_data( + base_url="https://www.paulgraham.com/articles.html" +) # Example base URL +``` + ## Examples This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py index e8a58cdcc..284bcb56a 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py @@ -1,5 +1,5 @@ import time -from typing import List +from typing import List, Optional from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document @@ -26,13 +26,18 @@ class WholeSiteReader(BaseReader): max_depth (int, optional): Maximum depth for BFS. Defaults to 10. """ - def __init__(self, prefix: str, max_depth: int = 10) -> None: + def __init__( + self, + prefix: str, + max_depth: int = 10, + driver: Optional[webdriver.Chrome] = None, + ) -> None: """ Initialize the WholeSiteReader with the provided prefix and maximum depth. """ self.prefix = prefix self.max_depth = max_depth - self.driver = self.setup_driver() + self.driver = driver if driver else self.setup_driver() def setup_driver(self): """ diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index 840d2e191..a1285616d 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -41,7 +41,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.1.6" +version = "0.1.7" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" -- GitLab