diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb index 7ab66e3e9b41c609ba36dc4b0fe662c915cbb937..8d92d046a4f91eb4d2a8f33f7574ffd4ebf2a634 100644 --- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb +++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb @@ -421,6 +421,65 @@ "display(Markdown(f\"<b>{response}</b>\"))" ] }, + { + "cell_type": "markdown", + "id": "a57351a5", + "metadata": {}, + "source": [ + "Using FireCrawl's extract mode to extract structured data from URLs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "008a7724", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the FireCrawlWebReader with your API key and extract mode\n", + "from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader\n", + "\n", + "firecrawl_reader = FireCrawlWebReader(\n", + " api_key=\"<your_api_key>\", # Replace with your actual API key from https://www.firecrawl.dev/\n", + " mode=\"extract\", # Use extract mode to extract structured data\n", + " params={\n", + " \"prompt\": \"Extract the title, author, and main points from this essay\", # Required prompt parameter for extract mode\n", + " },\n", + ")\n", + "\n", + "# Load documents by providing a list of URLs to extract data from\n", + "documents = firecrawl_reader.load_data(\n", + " urls=[\n", + " \"https://www.paulgraham.com\",\n", + " \"https://www.paulgraham.com/worked.html\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "693592bb", + "metadata": {}, + "outputs": [], + "source": [ + "index = SummaryIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50a5292e", + "metadata": {}, + "outputs": [], + "source": [ + "# Query the extracted structured data\n", + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"What are the main points from these essays?\")\n", + "\n", + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, { "cell_type": "markdown", "id": "e73ad2c0", diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md index 9a65172ab834fea4898fa4976e27970cfb196245..8b1e65db4b38259fb3e8ce339f637f6071b5f243 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/README.md @@ -14,14 +14,14 @@ ### Using Firecrawl Web Loader -- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl`, `scrape`, or `search`), and any optional parameters for the Firecrawl API. +- **Initialization**: Initialize the FireCrawlWebReader by providing the API key, the desired mode of operation (`crawl`, `scrape`, `search`, or `extract`), and any optional parameters for the Firecrawl API. 
 ```python
 from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader

 firecrawl_reader = FireCrawlWebReader(
     api_key="your_api_key_here",
-    mode="crawl",  # or "scrape" or "search"
+    mode="crawl",  # or "scrape", "search", or "extract"
     params={"additional": "parameters"},
 )
 ```

@@ -43,7 +43,7 @@ Here is an example demonstrating how to initialize the FireCrawlWebReader, load
 # Initialize the FireCrawlWebReader with your API key and desired mode
 firecrawl_reader = FireCrawlWebReader(
     api_key="your_api_key_here",  # Replace with your actual API key
-    mode="crawl",  # Choose between "crawl", "scrape", and "search"
+    mode="crawl",  # Choose from "crawl", "scrape", "search", or "extract"
     params={"additional": "parameters"},  # Optional additional parameters
 )

diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py
index 7367dc57e3e5a354ac1e8e05a240852705f9fbe8..bb48b7a8bec77bd80c6ae4bf67847729e6190b0f 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/firecrawl_web/base.py
@@ -15,11 +15,13 @@ class FireCrawlWebReader(BasePydanticReader):
         api_url: url to be passed to FirecrawlApp for local deployment
         url: The url to be crawled (or)
         mode: The mode to run the loader in. Default is "crawl".
-            Options include "scrape" (single url) and
-            "crawl" (all accessible sub pages).
+            Options include "scrape" (single url),
+            "crawl" (all accessible sub pages),
+            "search" (search for content), and
+            "extract" (extract structured data from URLs using a prompt).
         params: The parameters to pass to the Firecrawl API.
             Examples include crawlerOptions.
-            For more details, visit: https://github.com/mendableai/firecrawl-py
+            For more details, visit: https://docs.firecrawl.dev/sdks/python

     """

@@ -56,28 +58,33 @@ class FireCrawlWebReader(BasePydanticReader):
         return "Firecrawl_reader"

     def load_data(
-        self, url: Optional[str] = None, query: Optional[str] = None
+        self,
+        url: Optional[str] = None,
+        query: Optional[str] = None,
+        urls: Optional[List[str]] = None,
     ) -> List[Document]:
         """Load data from the input directory.

         Args:
             url (Optional[str]): URL to scrape or crawl.
             query (Optional[str]): Query to search for.
+            urls (Optional[List[str]]): List of URLs for extract mode.

         Returns:
             List[Document]: List of documents.

         Raises:
-            ValueError: If neither or both url and query are provided.
+            ValueError: If an invalid combination of parameters is provided.
""" - if url is None and query is None: - raise ValueError("Either url or query must be provided.") - if url is not None and query is not None: - raise ValueError("Only one of url or query must be provided.") + if sum(x is not None for x in [url, query, urls]) != 1: + raise ValueError("Exactly one of url, query, or urls must be provided.") documents = [] if self.mode == "scrape": + # [SCRAPE] params: https://docs.firecrawl.dev/api-reference/endpoint/scrape + if url is None: + raise ValueError("URL must be provided for scrape mode.") firecrawl_docs = self.firecrawl.scrape_url(url, params=self.params) documents.append( Document( @@ -86,6 +93,9 @@ class FireCrawlWebReader(BasePydanticReader): ) ) elif self.mode == "crawl": + # [CRAWL] params: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post + if url is None: + raise ValueError("URL must be provided for crawl mode.") firecrawl_docs = self.firecrawl.crawl_url(url, params=self.params) firecrawl_docs = firecrawl_docs.get("data", []) for doc in firecrawl_docs: @@ -96,17 +106,171 @@ class FireCrawlWebReader(BasePydanticReader): ) ) elif self.mode == "search": - firecrawl_docs = self.firecrawl.search(query, params=self.params) - for doc in firecrawl_docs: + # [SEARCH] params: https://docs.firecrawl.dev/api-reference/endpoint/search + if query is None: + raise ValueError("Query must be provided for search mode.") + + # Remove query from params if it exists to avoid duplicate + search_params = self.params.copy() if self.params else {} + if "query" in search_params: + del search_params["query"] + + # Get search results + search_response = self.firecrawl.search(query, params=search_params) + + # Handle the search response format + if isinstance(search_response, dict): + # Check for success + if search_response.get("success", False): + # Get the data array + search_results = search_response.get("data", []) + + # Process each search result + for result in search_results: + # Extract text content (prefer markdown if available) + text = result.get("markdown", "") + if not text: + # Fall back to description if markdown is not available + text = result.get("description", "") + + # Extract metadata + metadata = { + "title": result.get("title", ""), + "url": result.get("url", ""), + "description": result.get("description", ""), + "source": "search", + "query": query, + } + + # Add additional metadata if available + if "metadata" in result and isinstance( + result["metadata"], dict + ): + metadata.update(result["metadata"]) + + # Create document + documents.append( + Document( + text=text, + metadata=metadata, + ) + ) + else: + # Handle unsuccessful response + warning = search_response.get("warning", "Unknown error") + print(f"Search was unsuccessful: {warning}") + documents.append( + Document( + text=f"Search for '{query}' was unsuccessful: {warning}", + metadata={ + "source": "search", + "query": query, + "error": warning, + }, + ) + ) + else: + # Handle unexpected response format + print(f"Unexpected search response format: {type(search_response)}") documents.append( Document( - text=doc.get("markdown", ""), - metadata=doc.get("metadata", {}), + text=str(search_response), + metadata={"source": "search", "query": query}, + ) + ) + elif self.mode == "extract": + # [EXTRACT] params: https://docs.firecrawl.dev/api-reference/endpoint/extract + if urls is None: + # For backward compatibility, convert single URL to list if provided + if url is not None: + urls = [url] + else: + raise ValueError("URLs must be provided for extract mode.") + + # Ensure we 
have a prompt in params + extract_params = self.params.copy() if self.params else {} + if "prompt" not in extract_params: + raise ValueError("A 'prompt' parameter is required for extract mode.") + + # Prepare the payload according to the new API structure + payload = {"prompt": extract_params.pop("prompt")} + + # Call the extract method with the urls and params + extract_response = self.firecrawl.extract(urls=urls, params=payload) + + # Handle the extract response format + if isinstance(extract_response, dict): + # Check for success + if extract_response.get("success", False): + # Get the data from the response + extract_data = extract_response.get("data", {}) + + # Get the sources if available + sources = extract_response.get("sources", {}) + + # Convert the extracted data to text + if extract_data: + # Convert the data to a formatted string + text_parts = [] + for key, value in extract_data.items(): + text_parts.append(f"{key}: {value}") + + text = "\n".join(text_parts) + + # Create metadata + metadata = { + "urls": urls, + "source": "extract", + "status": extract_response.get("status"), + "expires_at": extract_response.get("expiresAt"), + } + + # Add sources to metadata if available + if sources: + metadata["sources"] = sources + + # Create document + documents.append( + Document( + text=text, + metadata=metadata, + ) + ) + else: + # Handle empty data in successful response + print("Extract response successful but no data returned") + documents.append( + Document( + text="Extraction was successful but no data was returned", + metadata={"urls": urls, "source": "extract"}, + ) + ) + else: + # Handle unsuccessful response + warning = extract_response.get("warning", "Unknown error") + print(f"Extraction was unsuccessful: {warning}") + documents.append( + Document( + text=f"Extraction was unsuccessful: {warning}", + metadata={ + "urls": urls, + "source": "extract", + "error": warning, + }, + ) + ) + else: + # Handle unexpected response format + print(f"Unexpected extract response format: {type(extract_response)}") + documents.append( + Document( + text=str(extract_response), + metadata={"urls": urls, "source": "extract"}, ) ) else: raise ValueError( - "Invalid mode. Please choose 'scrape', 'crawl' or 'search'." + "Invalid mode. Please choose 'scrape', 'crawl', 'search', or 'extract'." ) return documents diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/scrapfly_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/scrapfly_web/README.md index 4ccc8a31c9adca6b06eaf4397c87637d2d43dd8d..e9eeda74431070da36e320a43d0b84fc8e248fa2 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/scrapfly_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/scrapfly_web/README.md @@ -46,7 +46,7 @@ See the [ScrapFly documentation](https://scrapfly.io/docs/scrape-api/getting-sta ### Example Usage -Here is an example demonstrating how to initialize the FireCrawlWebReader, load documents from a URL, and then create a summary index from those documents for querying. +Here is an example demonstrating how to initialize the ScrapflyReader, load documents from a URL, and then create a summary index from those documents for querying. 
```python from llama_index.core import SummaryIndex diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index d8bb4dff767841c9b70fd4753a7ba004f2144405..5e2d11906d9a0e5943bbd9ea3d6da39cf10eaf38 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -46,7 +46,7 @@ license = "GPL-3.0-or-later" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.3.6" +version = "0.3.7" [tool.poetry.dependencies] python = ">=3.9,<4.0"
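
The diff above reworks the `search` branch of `load_data`, but only the extract flow gets a notebook demo. Here is a minimal usage sketch for the new search path, assuming a placeholder API key and query string; the import path, the `load_data(query=...)` signature, and the metadata keys come from the changed `base.py` above.

```python
from llama_index.core import SummaryIndex
from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader

# Search mode takes the query through load_data(query=...); any extra
# Firecrawl search parameters can be passed via `params` and are forwarded
# as-is (the reader strips a duplicate "query" key before calling the API).
searcher = FireCrawlWebReader(
    api_key="<your_api_key>",  # placeholder; use a real key from firecrawl.dev
    mode="search",
)

documents = searcher.load_data(query="Paul Graham essays on startups")

# Each successful result becomes a Document whose text prefers the page
# markdown and whose metadata records title, url, description,
# source="search", and the original query.
for doc in documents:
    print(doc.metadata.get("title"), "-", doc.metadata.get("url"))

index = SummaryIndex.from_documents(documents)
print(index.as_query_engine().query("What do these results have in common?"))
```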
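Likewise, a short sketch of what the extract-mode output looks like after this change, assuming a placeholder API key, URL, and prompt; the flattened "key: value" text and the metadata fields are taken from the new extract branch above.

```python
from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader

extractor = FireCrawlWebReader(
    api_key="<your_api_key>",  # placeholder
    mode="extract",
    params={"prompt": "Extract the author and the main points"},  # prompt is required
)

documents = extractor.load_data(urls=["https://www.paulgraham.com/worked.html"])

# On success the reader flattens each extracted field into a "key: value"
# line, and metadata carries urls, source="extract", status, expires_at,
# and sources when the API returns them.
doc = documents[0]
print(doc.text)
print(doc.metadata["urls"], doc.metadata.get("status"), doc.metadata.get("sources"))
```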
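Finally, a pytest-style sketch (not part of the change set) of the new argument validation in `load_data`: exactly one of `url`, `query`, or `urls` must be supplied, so both calls below should raise `ValueError` before any request is attempted, assuming that constructing the reader itself makes no network call.

```python
import pytest

from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader

reader = FireCrawlWebReader(
    api_key="<your_api_key>",  # placeholder; validation fails before it is needed
    mode="extract",
    params={"prompt": "Extract the title"},
)

# Passing none of url / query / urls is rejected...
with pytest.raises(ValueError):
    reader.load_data()

# ...and so is passing more than one of them.
with pytest.raises(ValueError):
    reader.load_data(url="https://example.com", urls=["https://example.com"])
```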