diff --git a/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md index 36bff877abcbe4f7bc2164d23d5bf7c44c2072e3..7c1c8deadec71ce64feba46ac360cf20ffd8bc05 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md +++ b/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md @@ -3,3 +3,7 @@ ## [0.1.2] - 2024-02-13 - Add maintainers and keywords from library.json (llamahub) + +## [0.1.7] - 2024-03-12 + +- Add option to WholeSiteReader to pass a Chrome driver with custom options to the class `__init__` diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md index 7a758d467393ce39ed2ddc40abef0cf1cc66170f..2ae94d7d56fca17a03372ab6b1899d14731ec5f2 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md @@ -12,10 +12,12 @@ The WholeSiteReader is a sophisticated web scraping tool that employs a breadth- - **Depth Control:** Limits scraping to a specified depth within a site's structure. - **URL Prefix Focus:** Targets scraping efforts to specific subsections of a site based on URL prefixes. - **Selenium-Based:** Leverages Selenium for dynamic interaction with web pages, supporting JavaScript-rendered content. 
+- **Custom Chrome Driver:** Optionally supply your own Selenium Chrome driver configured with custom options. ```python from llama_index.readers.web import WholeSiteReader + # Initialize the scraper with a prefix URL and maximum depth scraper = WholeSiteReader( prefix="https://www.paulgraham.com/", max_depth=10 # Example prefix @@ -27,6 +29,36 @@ documents = scraper.load_data( ) # Example base URL ``` +Configure with chromedriver options: + +```python +try: + import chromedriver_autoinstaller +except ImportError: + raise ImportError("Please install chromedriver_autoinstaller") +from llama_index.readers.web import WholeSiteReader +from selenium import webdriver + +options = webdriver.ChromeOptions() +options.binary_location = "/usr/bin/google-chrome" +options.add_argument("--start-maximized") +options.add_argument("--headless") +chromedriver_autoinstaller.install() +driver = webdriver.Chrome(options=options) + +# Initialize the scraper with a prefix URL and maximum depth +scraper = WholeSiteReader( + prefix="https://www.paulgraham.com/", + max_depth=10, # Example prefix + driver=driver, # Your custom driver with correct options +) + +# Start scraping from a base URL +documents = scraper.load_data( + base_url="https://www.paulgraham.com/articles.html" +) # Example base URL +``` + ## Examples This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. 
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py index e8a58cdcc3c4baccf78e45271d70f83f40603e6a..284bcb56a798fa22152befa934b85351acf4bffb 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py @@ -1,5 +1,5 @@ import time -from typing import List +from typing import List, Optional from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document @@ -26,13 +26,18 @@ class WholeSiteReader(BaseReader): max_depth (int, optional): Maximum depth for BFS. Defaults to 10. """ - def __init__(self, prefix: str, max_depth: int = 10) -> None: + def __init__( + self, + prefix: str, + max_depth: int = 10, + driver: Optional[webdriver.Chrome] = None, + ) -> None: """ Initialize the WholeSiteReader with the provided prefix and maximum depth. 
""" self.prefix = prefix self.max_depth = max_depth - self.driver = self.setup_driver() + self.driver = driver if driver else self.setup_driver() def setup_driver(self): """ diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index 840d2e191528152ec4973e50df243891b637c48c..a1285616d6e6c6c3659f3a5f60746729caad9579 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -41,7 +41,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.1.6" +version = "0.1.7" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"