From 35fc3b992cba2545d81b38281a76ccb3e216284a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?crypticG=C3=B8=C3=B8se?= <christogoosen@gmail.com>
Date: Fri, 15 Mar 2024 18:06:03 +0200
Subject: [PATCH] Wholesitereader pass driver arg (#11861)

* Add configurable chrome driver that allows for more options.

* Update base.py

Update to optional

* from typing import Optional
missing dependency

* Update pyproject.toml

Update version of package

* Update CHANGELOG.md

Update changelog

* Add docs to configurable chrome driver

* Fix ruff formatting issue

* Update README.md

Remove double import
---
 .../llama-index-readers-web/CHANGELOG.md      |  4 +++
 .../readers/web/whole_site/README.md          | 32 +++++++++++++++++++
 .../readers/web/whole_site/base.py            | 11 +++++--
 .../llama-index-readers-web/pyproject.toml    |  2 +-
 4 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md
index 36bff877a..7c1c8dead 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/CHANGELOG.md
@@ -3,3 +3,7 @@
 ## [0.1.2] - 2024-02-13
 
 - Add maintainers and keywords from library.json (llamahub)
+
+## [0.1.7] - 2024-03-12
+
+- Add option to pass a custom Chrome driver (configured with your own options) to `WholeSiteReader.__init__`
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md
index 7a758d467..2ae94d7d5 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/README.md
@@ -12,10 +12,12 @@ The WholeSiteReader is a sophisticated web scraping tool that employs a breadth-
 - **Depth Control:** Limits scraping to a specified depth within a site's structure.
 - **URL Prefix Focus:** Targets scraping efforts to specific subsections of a site based on URL prefixes.
 - **Selenium-Based:** Leverages Selenium for dynamic interaction with web pages, supporting JavaScript-rendered content.
+- **Configurable Chrome Driver:** Optionally pass your own Selenium Chrome driver, configured with custom options.
 
 ```python
 from llama_index.readers.web import WholeSiteReader
 
+
 # Initialize the scraper with a prefix URL and maximum depth
 scraper = WholeSiteReader(
     prefix="https://www.paulgraham.com/", max_depth=10  # Example prefix
@@ -27,6 +29,36 @@ documents = scraper.load_data(
 )  # Example base URL
 ```
 
+Configure with chromedriver options:
+
+```python
+try:
+    import chromedriver_autoinstaller
+except ImportError:
+    raise ImportError("Please install chromedriver_autoinstaller")
+from llama_index.readers.web import WholeSiteReader
+from selenium import webdriver
+
+options = webdriver.ChromeOptions()
+options.binary_location = "/usr/bin/google-chrome"
+options.add_argument("--start-maximized")
+options.add_argument("--headless")
+chromedriver_autoinstaller.install()
+driver = webdriver.Chrome(options=options)
+
+# Initialize the scraper with a prefix URL and maximum depth
+scraper = WholeSiteReader(
+    prefix="https://www.paulgraham.com/",  # Example prefix
+    max_depth=10,
+    driver=driver,  # Your custom driver with correct options
+)
+
+# Start scraping from a base URL
+documents = scraper.load_data(
+    base_url="https://www.paulgraham.com/articles.html"
+)  # Example base URL
+```
+
 ## Examples
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py
index e8a58cdcc..284bcb56a 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py
@@ -1,5 +1,5 @@
 import time
-from typing import List
+from typing import List, Optional
 
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.schema import Document
@@ -26,13 +26,18 @@ class WholeSiteReader(BaseReader):
         max_depth (int, optional): Maximum depth for BFS. Defaults to 10.
     """
 
-    def __init__(self, prefix: str, max_depth: int = 10) -> None:
+    def __init__(
+        self,
+        prefix: str,
+        max_depth: int = 10,
+        driver: Optional[webdriver.Chrome] = None,
+    ) -> None:
         """
         Initialize the WholeSiteReader with the provided prefix and maximum depth.
         """
         self.prefix = prefix
         self.max_depth = max_depth
-        self.driver = self.setup_driver()
+        self.driver = driver if driver else self.setup_driver()
 
     def setup_driver(self):
         """
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
index 840d2e191..a1285616d 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -41,7 +41,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.1.6"
+version = "0.1.7"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-- 
GitLab