From 8ed61b739ce9f114830ef7c543d35379c159c92c Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Fri, 17 May 2024 06:18:42 +0300 Subject: [PATCH] Updated Browserbase web reader (#13535) --- docs/docs/api_reference/readers/web.md | 1 + .../data_connectors/WebPageDemo.ipynb | 2 +- .../llama_index/cli/upgrade/mappings.json | 1 + .../readers/web/browserbase_web/README.md | 26 +++++++++++++------ .../readers/web/browserbase_web/base.py | 11 +++++--- .../llama-index-readers-web/pyproject.toml | 2 +- 6 files changed, 30 insertions(+), 13 deletions(-) diff --git a/docs/docs/api_reference/readers/web.md b/docs/docs/api_reference/readers/web.md index 37500251ed..4c7ef15dd9 100644 --- a/docs/docs/api_reference/readers/web.md +++ b/docs/docs/api_reference/readers/web.md @@ -3,6 +3,7 @@ members: - AsyncWebPageReader - BeautifulSoupWebReader + - BrowserbaseWebReader - KnowledgeBaseWebReader - MainContentExtractorReader - NewsArticleReader diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb index dce71d477e..178651c8ff 100644 --- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb +++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb @@ -226,7 +226,7 @@ "\n", "## Installation and Setup\n", "\n", - "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).\n", + "- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`).\n", "- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):" ] }, diff --git a/llama-index-cli/llama_index/cli/upgrade/mappings.json b/llama-index-cli/llama_index/cli/upgrade/mappings.json index c4bcfce530..8f95c04f14 100644 --- a/llama-index-cli/llama_index/cli/upgrade/mappings.json +++ b/llama-index-cli/llama_index/cli/upgrade/mappings.json @@ 
-739,6 +739,7 @@ "PineconeReader": "llama_index.readers.pinecone", "PandasAIReader": "llama_index.readers.pandas_ai", "AirtableReader": "llama_index.readers.airtable", + "BrowserbaseWebReader": "llama_index.readers.web", "ZulipReader": "llama_index.readers.zulip", "ReadmeReader": "llama_index.readers.readme", "ZendeskReader": "llama_index.readers.zendesk", diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md index 3985ccb9ba..dd762f48f4 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md @@ -1,19 +1,24 @@ # Browserbase Web Reader -[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving. +[Browserbase](https://browserbase.com) is a developer platform to reliably run, manage, and monitor headless browsers. -## Installation and Setup +Power your AI data retrievals with: -- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`). 
+- [Serverless Infrastructure](https://docs.browserbase.com/under-the-hood) providing reliable browsers to extract data from complex UIs +- [Stealth Mode](https://docs.browserbase.com/features/stealth-mode) with included fingerprinting tactics and automatic captcha solving +- [Session Debugger](https://docs.browserbase.com/features/sessions) to inspect your Browser Session with networks timeline and logs +- [Live Debug](https://docs.browserbase.com/guides/session-debug-connection/browser-remote-control) to quickly debug your automation + +## Installation and setup + +- Get an API key and Project ID from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`). - Install the [Browserbase SDK](http://github.com/browserbase/python-sdk): -``` +```bash pip install browserbase ``` -## Usage - -### Loading documents +## Loading documents You can load webpages into LlamaIndex using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation. @@ -31,7 +36,14 @@ docs = reader.load_data( ) ``` -### Loading images +### Parameters + +- `urls` Required. A list of URLs to fetch. +- `text_content` Retrieve only text content. Default is `False`. +- `session_id` Optional. Provide an existing Session ID. +- `proxy` Optional. Enable/Disable Proxies. + +## Loading images You can also load screenshots of webpages (as bytes) for multi-modal models. 
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py index 63ad080473..a71d78a73a 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -18,6 +18,7 @@ class BrowserbaseWebReader(BaseReader): def __init__( self, api_key: Optional[str] = None, + project_id: Optional[str] = None, ) -> None: try: from browserbase import Browserbase @@ -26,13 +27,17 @@ class BrowserbaseWebReader(BaseReader): "`browserbase` package not found, please run `pip install browserbase`" ) - self.browserbase = Browserbase(api_key=api_key) + self.browserbase = Browserbase(api_key, project_id) def lazy_load_data( - self, urls: Sequence[str], text_content: bool = False + self, + urls: Sequence[str], + text_content: bool = False, + session_id: Optional[str] = None, + proxy: Optional[bool] = None, ) -> Iterator[Document]: """Load pages from URLs.""" - pages = self.browserbase.load_urls(urls, text_content) + pages = self.browserbase.load_urls(urls, text_content, session_id, proxy) for i, page in enumerate(pages): yield Document( diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index ed26813f48..b1baf13a23 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -42,7 +42,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.1.14" +version = "0.1.15" 
[tool.poetry.dependencies] python = ">=3.8.1,<4.0" -- GitLab