From 3ad9f8c9f5d41545080c3f465332f27bba237fe4 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 17 May 2024 14:30:01 -0300
Subject: [PATCH] Feat/Added firecrawl search mode (#13560)

---
 .../readers/web/firecrawl_web/README.md       | 16 ++++++----
 .../readers/web/firecrawl_web/base.py         | 29 +++++++++++++++++--
 .../llama-index-readers-web/pyproject.toml    |  2 +-
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md
index cc6a0e4f3..9a65172ab 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md
@@ -14,22 +14,26 @@
 
 ### Using Firecrawl Web Loader
 
-- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl` or `scrape`), and any optional parameters for the Firecrawl API.
+- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl`, `scrape`, or `search`), and any optional parameters for the Firecrawl API.
 
   ```python
   from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader
 
   firecrawl_reader = FireCrawlWebReader(
       api_key="your_api_key_here",
-      mode="crawl",  # or "scrape"
+      mode="crawl",  # or "scrape" or "search"
       params={"additional": "parameters"},
   )
   ```
 
 - **Loading Data**: To load data, use the `load_data` method with the URL you wish to process.
-  ```python
-  documents = firecrawl_reader.load_data(url="http://example.com")
-  ```
+
+```python
+# For crawl or scrape mode
+documents = firecrawl_reader.load_data(url="http://example.com")
+# For search mode
+documents = firecrawl_reader.load_data(query="search term")
+```
 
 ### Example Usage
 
@@ -39,7 +43,7 @@ Here is an example demonstrating how to initialize the FireCrawlWebReader, load
 # Initialize the FireCrawlWebReader with your API key and desired mode
 firecrawl_reader = FireCrawlWebReader(
     api_key="your_api_key_here",  # Replace with your actual API key
-    mode="crawl",  # Choose between "crawl" and "scrape"
+    mode="crawl",  # Choose between "crawl", "scrape", and "search"
     params={"additional": "parameters"},  # Optional additional parameters
 )
 
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py
index 27f274a7e..7533de467 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py
@@ -49,16 +49,26 @@ class FireCrawlWebReader(BasePydanticReader):
     def class_name(cls) -> str:
         return "Firecrawl_reader"
 
-    def load_data(self, url: str) -> List[Document]:
+    def load_data(
+        self, url: Optional[str] = None, query: Optional[str] = None
+    ) -> List[Document]:
         """Load data from the input directory.
 
         Args:
-            urls (List[str]): List of URLs to scrape.
+            url (Optional[str]): URL to scrape or crawl.
+            query (Optional[str]): Query to search for.
 
         Returns:
             List[Document]: List of documents.
 
+        Raises:
+            ValueError: If neither url nor query is provided, or if both are provided.
         """
+        if url is None and query is None:
+            raise ValueError("Either url or query must be provided.")
+        if url is not None and query is not None:
+            raise ValueError("Only one of url or query must be provided.")
+
         documents = []
 
         if self.mode == "scrape":
@@ -69,7 +79,7 @@ class FireCrawlWebReader(BasePydanticReader):
                     metadata=firecrawl_docs.get("metadata", {}),
                 )
             )
-        else:
+        elif self.mode == "crawl":
             firecrawl_docs = self.firecrawl.crawl_url(url, params=self.params)
             for doc in firecrawl_docs:
                 documents.append(
@@ -78,5 +88,18 @@ class FireCrawlWebReader(BasePydanticReader):
                         metadata=doc.get("metadata", {}),
                     )
                 )
+        elif self.mode == "search":
+            firecrawl_docs = self.firecrawl.search(query, params=self.params)
+            for doc in firecrawl_docs:
+                documents.append(
+                    Document(
+                        page_content=doc.get("markdown", ""),
+                        metadata=doc.get("metadata", {}),
+                    )
+                )
+        else:
+            raise ValueError(
+                "Invalid mode. Please choose 'scrape', 'crawl' or 'search'."
+            )
 
         return documents
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
index b1baf13a2..d1c6e41bb 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -42,7 +42,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.1.15"
+version = "0.1.16"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-- 
GitLab