From 3ad9f8c9f5d41545080c3f465332f27bba237fe4 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 17 May 2024 14:30:01 -0300 Subject: [PATCH] Feat/Added firecrawl search mode (#13560) --- .../readers/web/firecrawl_web/README.md | 16 ++++++---- .../readers/web/firecrawl_web/base.py | 29 +++++++++++++++++-- .../llama-index-readers-web/pyproject.toml | 2 +- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md index cc6a0e4f3..9a65172ab 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md @@ -14,22 +14,26 @@ ### Using Firecrawl Web Loader -- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl` or `scrape`), and any optional parameters for the Firecrawl API. +- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl`, `scrape`, or `search`), and any optional parameters for the Firecrawl API. ```python from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader firecrawl_reader = FireCrawlWebReader( api_key="your_api_key_here", - mode="crawl", # or "scrape" + mode="crawl", # or "scrape" or "search" params={"additional": "parameters"}, ) ``` - **Loading Data**: To load data, use the `load_data` method with the URL you wish to process. - ```python - documents = firecrawl_reader.load_data(url="http://example.com") - ``` + +```python +# For crawl or scrape mode +documents = firecrawl_reader.load_data(url="http://example.com") +# For search mode +documents = firecrawl_reader.load_data(query="search term") +``` ### Example Usage @@ -39,7 +43,7 @@ Here is an example demonstrating how to initialize the FireCrawlWebReader, load # Initialize the FireCrawlWebReader with your API key and desired mode firecrawl_reader = FireCrawlWebReader( api_key="your_api_key_here", # Replace with your actual API key - mode="crawl", # Choose between "crawl" and "scrape" + mode="crawl", # Choose between "crawl", "scrape", and "search" params={"additional": "parameters"}, # Optional additional parameters ) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py index 27f274a7e..7533de467 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py @@ -49,16 +49,26 @@ class FireCrawlWebReader(BasePydanticReader): def class_name(cls) -> str: return "Firecrawl_reader" - def load_data(self, url: str) -> List[Document]: + def load_data( + self, url: Optional[str] = None, query: Optional[str] = None + ) -> List[Document]: """Load data from the input directory. Args: - urls (List[str]): List of URLs to scrape. + url (Optional[str]): URL to scrape or crawl. + query (Optional[str]): Query to search for. Returns: List[Document]: List of documents. + Raises: + ValueError: If neither or both url and query are provided. """ + if url is None and query is None: + raise ValueError("Either url or query must be provided.") + if url is not None and query is not None: + raise ValueError("Only one of url or query must be provided.") + documents = [] if self.mode == "scrape": @@ -69,7 +79,7 @@ class FireCrawlWebReader(BasePydanticReader): metadata=firecrawl_docs.get("metadata", {}), ) ) - else: + elif self.mode == "crawl": firecrawl_docs = self.firecrawl.crawl_url(url, params=self.params) for doc in firecrawl_docs: documents.append( @@ -78,5 +88,18 @@ class FireCrawlWebReader(BasePydanticReader): metadata=doc.get("metadata", {}), ) ) + elif self.mode == "search": + firecrawl_docs = self.firecrawl.search(query, params=self.params) + for doc in firecrawl_docs: + documents.append( + Document( + page_content=doc.get("markdown", ""), + metadata=doc.get("metadata", {}), + ) + ) + else: + raise ValueError( + "Invalid mode. Please choose 'scrape', 'crawl' or 'search'." + ) return documents diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index b1baf13a2..d1c6e41bb 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -42,7 +42,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.1.15" +version = "0.1.16" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" -- GitLab