diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md index cc6a0e4f32e3d79ed81577ded462871a5239c4d5..9a65172ab834fea4898fa4976e27970cfb196245 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md @@ -14,22 +14,26 @@ ### Using Firecrawl Web Loader -- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl` or `scrape`), and any optional parameters for the Firecrawl API. +- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl`, `scrape`, or `search`), and any optional parameters for the Firecrawl API. ```python from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader firecrawl_reader = FireCrawlWebReader( api_key="your_api_key_here", - mode="crawl", # or "scrape" + mode="crawl", # or "scrape" or "search" params={"additional": "parameters"}, ) ``` - **Loading Data**: To load data, use the `load_data` method with the URL you wish to process. - ```python - documents = firecrawl_reader.load_data(url="http://example.com") - ``` + +```python +# For crawl or scrape mode +documents = firecrawl_reader.load_data(url="http://example.com") +# For search mode +documents = firecrawl_reader.load_data(query="search term") +``` ### Example Usage @@ -39,7 +43,7 @@ Here is an example demonstrating how to initialize the FireCrawlWebReader, load # Initialize the FireCrawlWebReader with your API key and desired mode firecrawl_reader = FireCrawlWebReader( api_key="your_api_key_here", # Replace with your actual API key - mode="crawl", # Choose between "crawl" and "scrape" + mode="crawl", # Choose between "crawl", "scrape", and "search" params={"additional": "parameters"}, # Optional additional parameters ) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py index 27f274a7ef761b7c9ce7e8a847a1e7349179cdbc..7533de467516098de67d3db3977ba2b556a7155b 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py @@ -49,16 +49,26 @@ class FireCrawlWebReader(BasePydanticReader): def class_name(cls) -> str: return "Firecrawl_reader" - def load_data(self, url: str) -> List[Document]: + def load_data( + self, url: Optional[str] = None, query: Optional[str] = None + ) -> List[Document]: """Load data from the input directory. Args: - urls (List[str]): List of URLs to scrape. + url (Optional[str]): URL to scrape or crawl. + query (Optional[str]): Query to search for. Returns: List[Document]: List of documents. + Raises: + ValueError: If neither or both url and query are provided. """ + if url is None and query is None: + raise ValueError("Either url or query must be provided.") + if url is not None and query is not None: + raise ValueError("Only one of url or query must be provided.") + documents = [] if self.mode == "scrape": @@ -69,7 +79,7 @@ class FireCrawlWebReader(BasePydanticReader): metadata=firecrawl_docs.get("metadata", {}), ) ) - else: + elif self.mode == "crawl": firecrawl_docs = self.firecrawl.crawl_url(url, params=self.params) for doc in firecrawl_docs: documents.append( @@ -78,5 +88,18 @@ class FireCrawlWebReader(BasePydanticReader): metadata=doc.get("metadata", {}), ) ) + elif self.mode == "search": + firecrawl_docs = self.firecrawl.search(query, params=self.params) + for doc in firecrawl_docs: + documents.append( + Document( + page_content=doc.get("markdown", ""), + metadata=doc.get("metadata", {}), + ) + ) + else: + raise ValueError( + "Invalid mode. Please choose 'scrape', 'crawl' or 'search'." + ) return documents diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index b1baf13a23b6f5764936a13c60ccf514fd37faa3..d1c6e41bbae6af7ffc744b4e4c6cbab3c06679ba 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -42,7 +42,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.1.15" +version = "0.1.16" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"