From 8ed61b739ce9f114830ef7c543d35379c159c92c Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Fri, 17 May 2024 06:18:42 +0300
Subject: [PATCH] Updated Browserbase web reader (#13535)

---
 docs/docs/api_reference/readers/web.md        |  1 +
 .../data_connectors/WebPageDemo.ipynb         |  2 +-
 .../llama_index/cli/upgrade/mappings.json     |  1 +
 .../readers/web/browserbase_web/README.md     | 26 +++++++++++++------
 .../readers/web/browserbase_web/base.py       | 11 +++++---
 .../llama-index-readers-web/pyproject.toml    |  2 +-
 6 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/docs/docs/api_reference/readers/web.md b/docs/docs/api_reference/readers/web.md
index 37500251ed..4c7ef15dd9 100644
--- a/docs/docs/api_reference/readers/web.md
+++ b/docs/docs/api_reference/readers/web.md
@@ -3,6 +3,7 @@
       members:
         - AsyncWebPageReader
         - BeautifulSoupWebReader
+        - BrowserbaseWebReader
         - KnowledgeBaseWebReader
         - MainContentExtractorReader
         - NewsArticleReader
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
index dce71d477e..178651c8ff 100644
--- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb
+++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -226,7 +226,7 @@
     "\n",
     "## Installation and Setup\n",
     "\n",
-    "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).\n",
+    "- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set them in environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`).\n",
     "- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):"
    ]
   },
diff --git a/llama-index-cli/llama_index/cli/upgrade/mappings.json b/llama-index-cli/llama_index/cli/upgrade/mappings.json
index c4bcfce530..8f95c04f14 100644
--- a/llama-index-cli/llama_index/cli/upgrade/mappings.json
+++ b/llama-index-cli/llama_index/cli/upgrade/mappings.json
@@ -739,6 +739,7 @@
   "PineconeReader": "llama_index.readers.pinecone",
   "PandasAIReader": "llama_index.readers.pandas_ai",
   "AirtableReader": "llama_index.readers.airtable",
+  "BrowserbaseWebReader": "llama_index.readers.web",
   "ZulipReader": "llama_index.readers.zulip",
   "ReadmeReader": "llama_index.readers.readme",
   "ZendeskReader": "llama_index.readers.zendesk",
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
index 3985ccb9ba..dd762f48f4 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
@@ -1,19 +1,24 @@
 # Browserbase Web Reader
 
-[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.
+[Browserbase](https://browserbase.com) is a developer platform to reliably run, manage, and monitor headless browsers.
 
-## Installation and Setup
+Power your AI data retrievals with:
 
-- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).
+- [Serverless Infrastructure](https://docs.browserbase.com/under-the-hood) providing reliable browsers to extract data from complex UIs
+- [Stealth Mode](https://docs.browserbase.com/features/stealth-mode) with included fingerprinting tactics and automatic captcha solving
+- [Session Debugger](https://docs.browserbase.com/features/sessions) to inspect your Browser Session with network timeline and logs
+- [Live Debug](https://docs.browserbase.com/guides/session-debug-connection/browser-remote-control) to quickly debug your automation
+
+## Installation and setup
+
+- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set them in environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`).
 - Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):
 
-```
+```bash
 pip install browserbase
 ```
 
-## Usage
-
-### Loading documents
+## Loading documents
 
 You can load webpages into LlamaIndex using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation.
 
@@ -31,7 +36,14 @@ docs = reader.load_data(
 )
 ```
 
-### Loading images
+### Parameters
+
+- `urls` Required. A list of URLs to fetch.
+- `text_content` Retrieve only text content. Default is `False`.
+- `session_id` Optional. Provide an existing Session ID.
+- `proxy` Optional. Enable/Disable Proxies.
+
+## Loading images
 
 You can also load screenshots of webpages (as bytes) for multi-modal models.
 
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
index 63ad080473..a71d78a73a 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -18,6 +18,7 @@ class BrowserbaseWebReader(BaseReader):
     def __init__(
         self,
         api_key: Optional[str] = None,
+        project_id: Optional[str] = None,
     ) -> None:
         try:
             from browserbase import Browserbase
@@ -26,13 +27,17 @@ class BrowserbaseWebReader(BaseReader):
                 "`browserbase` package not found, please run `pip install browserbase`"
             )
 
-        self.browserbase = Browserbase(api_key=api_key)
+        self.browserbase = Browserbase(api_key, project_id)
 
     def lazy_load_data(
-        self, urls: Sequence[str], text_content: bool = False
+        self,
+        urls: Sequence[str],
+        text_content: bool = False,
+        session_id: Optional[str] = None,
+        proxy: Optional[bool] = None,
     ) -> Iterator[Document]:
         """Load pages from URLs."""
-        pages = self.browserbase.load_urls(urls, text_content)
+        pages = self.browserbase.load_urls(urls, text_content, session_id, proxy)
 
         for i, page in enumerate(pages):
             yield Document(
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
index ed26813f48..b1baf13a23 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -42,7 +42,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.1.14"
+version = "0.1.15"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-- 
GitLab