Skip to content
Snippets Groups Projects
Unverified Commit 3ad9f8c9 authored by Rafael Miller's avatar Rafael Miller Committed by GitHub
Browse files

Feat/Added firecrawl search mode (#13560)

parent a52fcd60
No related branches found
No related tags found
No related merge requests found
......@@ -14,22 +14,26 @@
### Using Firecrawl Web Loader
- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl` or `scrape`), and any optional parameters for the Firecrawl API.
- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl`, `scrape`, or `search`), and any optional parameters for the Firecrawl API.
```python
from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader
firecrawl_reader = FireCrawlWebReader(
api_key="your_api_key_here",
mode="crawl", # or "scrape"
mode="crawl", # or "scrape" or "search"
params={"additional": "parameters"},
)
```
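- **Loading Data**: To load data, call the `load_data` method with either the URL you wish to process (`crawl` or `scrape` mode) or the search query (`search` mode). Provide exactly one of `url` or `query`; passing both or neither raises a `ValueError`.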
```python
documents = firecrawl_reader.load_data(url="http://example.com")
```
```python
# For crawl or scrape mode
documents = firecrawl_reader.load_data(url="http://example.com")
# For search mode
documents = firecrawl_reader.load_data(query="search term")
```
### Example Usage
......@@ -39,7 +43,7 @@ Here is an example demonstrating how to initialize the FireCrawlWebReader, load
# Initialize the FireCrawlWebReader with your API key and desired mode
firecrawl_reader = FireCrawlWebReader(
api_key="your_api_key_here", # Replace with your actual API key
mode="crawl", # Choose between "crawl" and "scrape"
mode="crawl", # Choose between "crawl", "scrape", and "search"
params={"additional": "parameters"}, # Optional additional parameters
)
......
......@@ -49,16 +49,26 @@ class FireCrawlWebReader(BasePydanticReader):
def class_name(cls) -> str:
return "Firecrawl_reader"
def load_data(self, url: str) -> List[Document]:
def load_data(
self, url: Optional[str] = None, query: Optional[str] = None
) -> List[Document]:
"""Load data from the input directory.
Args:
urls (List[str]): List of URLs to scrape.
url (Optional[str]): URL to scrape or crawl.
query (Optional[str]): Query to search for.
Returns:
List[Document]: List of documents.
Raises:
ValueError: If neither or both url and query are provided.
"""
if url is None and query is None:
raise ValueError("Either url or query must be provided.")
if url is not None and query is not None:
raise ValueError("Only one of url or query must be provided.")
documents = []
if self.mode == "scrape":
......@@ -69,7 +79,7 @@ class FireCrawlWebReader(BasePydanticReader):
metadata=firecrawl_docs.get("metadata", {}),
)
)
else:
elif self.mode == "crawl":
firecrawl_docs = self.firecrawl.crawl_url(url, params=self.params)
for doc in firecrawl_docs:
documents.append(
......@@ -78,5 +88,18 @@ class FireCrawlWebReader(BasePydanticReader):
metadata=doc.get("metadata", {}),
)
)
elif self.mode == "search":
firecrawl_docs = self.firecrawl.search(query, params=self.params)
for doc in firecrawl_docs:
documents.append(
Document(
page_content=doc.get("markdown", ""),
metadata=doc.get("metadata", {}),
)
)
else:
raise ValueError(
"Invalid mode. Please choose 'scrape', 'crawl' or 'search'."
)
return documents
......@@ -42,7 +42,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.1.15"
version = "0.1.16"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment