From 08a4df02d4e7bc81f0a5297a518114f46a1ec797 Mon Sep 17 00:00:00 2001 From: Jason Zhang <166434281+jayfish0@users.noreply.github.com> Date: Fri, 14 Mar 2025 01:37:21 -0700 Subject: [PATCH] feat: Add AgentQL Tool (#18131) --- .../llama-index-tools-agentql/.gitignore | 153 ++++ .../tools/llama-index-tools-agentql/BUILD | 3 + .../tools/llama-index-tools-agentql/Makefile | 17 + .../tools/llama-index-tools-agentql/README.md | 96 ++ .../examples/AgentQL_browser_agent.ipynb | 817 ++++++++++++++++++ .../llama_index/tools/agentql/BUILD | 1 + .../llama_index/tools/agentql/__init__.py | 11 + .../tools/agentql/agentql_browser_tool/BUILD | 1 + .../agentql/agentql_browser_tool/__init__.py | 0 .../agentql/agentql_browser_tool/base.py | 129 +++ .../tools/agentql/agentql_rest_api_tool/BUILD | 1 + .../agentql/agentql_rest_api_tool/__init__.py | 0 .../agentql/agentql_rest_api_tool/base.py | 98 +++ .../llama_index/tools/agentql/const.py | 16 + .../llama_index/tools/agentql/messages.py | 8 + .../llama_index/tools/agentql/utils.py | 93 ++ .../llama-index-tools-agentql/pyproject.toml | 62 ++ .../llama-index-tools-agentql/tests/BUILD | 9 + .../tests/__init__.py | 0 .../tests/conftest.py | 13 + .../tests/test_browser_spec.py | 67 ++ .../tests/test_rest_api_spec.py | 44 + 22 files changed, 1639 insertions(+) create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/.gitignore create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/Makefile create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/README.md create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/__init__.py create mode 100644 
llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/__init__.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/base.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/__init__.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/base.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/const.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/messages.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/utils.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/pyproject.toml create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/tests/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/tests/__init__.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/tests/conftest.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/tests/test_browser_spec.py create mode 100644 llama-index-integrations/tools/llama-index-tools-agentql/tests/test_rest_api_spec.py diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/.gitignore b/llama-index-integrations/tools/llama-index-tools-agentql/.gitignore new file mode 100644 index 0000000000..990c18de22 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/.gitignore @@ -0,0 +1,153 @@ 
+llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/BUILD b/llama-index-integrations/tools/llama-index-tools-agentql/BUILD new file mode 100644 index 0000000000..0896ca890d --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/Makefile b/llama-index-integrations/tools/llama-index-tools-agentql/Makefile new file mode 100644 index 0000000000..d83aa123a2 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests --asyncio-mode=auto + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/README.md b/llama-index-integrations/tools/llama-index-tools-agentql/README.md new file mode 100644 index 0000000000..1767dce45b --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/README.md @@ -0,0 +1,96 @@ +# llama-index-tools-agentql + +[AgentQL](https://www.agentql.com/) provides web interaction and structured data extraction from any web page using an [AgentQL query](https://docs.agentql.com/agentql-query) or a Natural Language prompt. AgentQL can be used across multiple languages and web pages without breaking over time and change. + +> **Warning** +> Only supports async functions and playwright browser APIs, please refer to the following PR for more details: https://github.com/run-llama/llama_index/pull/17808 + +## Installation + +```bash +pip install llama-index-tools-agentql +``` + +You also need to configure the `AGENTQL_API_KEY` environment variable. You can acquire an API key from our [Dev Portal](https://dev.agentql.com). + +## Overview + +AgentQL provides the following three function tools: + +- **`extract_web_data_with_rest_api`**: Extracts structured data as JSON from a web page given a URL using either an [AgentQL query](https://docs.agentql.com/agentql-query/query-intro) or a Natural Language description of the data. + +- **`extract_web_data_from_browser`**: Extracts structured data as JSON from the active web page in a browser using either an [AgentQL query](https://docs.agentql.com/agentql-query/query-intro) or a Natural Language description. **This tool must be used with a Playwright browser.** + +- **`get_web_element_from_browser`**: Finds a web element on the active web page in a browser using a Natural Language description and returns its CSS selector for further interaction. 
**This tool must be used with a Playwright browser.** + +You can learn more about how to use AgentQL tools in this [Jupyter notebook](https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb). + +### Extract data using REST API + +```python +from llama_index.tools.agentql import AgentQLRestAPIToolSpec + +agentql_rest_api_tool = AgentQLRestAPIToolSpec() +await agentql_rest_api_tool.extract_web_data_with_rest_api( + url="https://www.agentql.com/blog", + query="{ posts[] { title url author date }}", +) +``` + +### Work with data and web elements using browser + +#### Setup + +In order to use the `extract_web_data_from_browser` and `get_web_element_from_browser` tools, you need to have a Playwright browser instance. If you do not have an active instance, you can initiate one using the `create_async_playwright_browser` utility method from LlamaIndex's Playwright ToolSpec. + +> **Note** +> AgentQL browser tools are best used along with LlamaIndex's [Playwright tools](https://docs.llamaindex.ai/en/stable/api_reference/tools/playwright/). 
+ +```python +from llama_index.tools.playwright.base import PlaywrightToolSpec + +async_browser = await PlaywrightToolSpec.create_async_playwright_browser() +``` + +You can also use an existing browser instance via Chrome DevTools Protocol (CDP) connection URL: + +```python +p = await async_playwright().start() +async_browser = await p.chromium.connect_over_cdp("CDP_CONNECTION_URL") +``` + +#### Extract data from the active browser page + +```python +from llama_index.tools.agentql import AgentQLBrowserToolSpec + +playwright_tool = PlaywrightToolSpec(async_browser=async_browser) +await playwright_tool.navigate_to("https://www.agentql.com/blog") + +agentql_browser_tool = AgentQLBrowserToolSpec(async_browser=async_browser) +await agentql_browser_tool.extract_web_data_from_browser( + prompt="the blog posts with title and url", +) +``` + +#### Find a web element on the active browser page + +```python +next_page_button = await agentql_browser_tool.get_web_element_from_browser( + prompt="The next page navigation button", +) + +await playwright_tool.click(next_page_button) +``` + +## Agentic Usage + +This tool has a more extensive example for agentic usage documented in this [Jupyter notebook](https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb). + +## Run tests + +In order to run integration tests, you need to configure LLM credentials by setting the `OPENAI_API_KEY` and `AGENTQL_API_KEY` environment variables first. 
Then run the tests with the following command: + +```bash +make test +``` diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb b/llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb new file mode 100644 index 0000000000..94366540e5 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb @@ -0,0 +1,817 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building a Browser Agent with AgentQL\n", + "\n", + "<a href=\"https://colab.research.google.com/github/run-llama/llama_index/blob/main/llama-index-integrations/tools/llama-index-tools-agentql/examples/AgentQL_browser_agent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n", + "\n", + "[AgentQL](https://www.agentql.com/) tools provide web interaction and structured data extraction from any web page using an [AgentQL query](https://docs.agentql.com/agentql-query) or a Natural Language prompt. AgentQL can be used across multiple languages and web pages without breaking over time and change.\n", + "\n", + "This tutorial shows you how to:\n", + "\n", + "* Create a browser agent with AgentQL tools and LlamaIndex\n", + "* Use AgentQL tools to navigate the Internet\n", + "* Use AgentQL tools to scrape content from the Internet\n", + "\n", + "## Overview\n", + "\n", + "AgentQL provides three function tools. 
The first doesn't require a browser and relies on the REST API:\n", + "\n", + "- **`extract_web_data_with_rest_api`** extracts structured data as JSON from a web page given a URL using either an [AgentQL query](https://docs.agentql.com/agentql-query/query-intro) or a Natural Language description of the data.\n", + "\n", + "The other two tools must be used with a `Playwright` browser or a remote browser instance via Chrome DevTools Protocol (CDP):\n", + "\n", + "- **`extract_web_data_from_browser`** extracts structured data as JSON from the active web page in a browser using either an [AgentQL query](https://docs.agentql.com/agentql-query/query-intro) or a Natural Language description.\n", + "\n", + "- **`get_web_element_from_browser`** finds a web element on the active web page in a browser using a Natural Language description and returns its CSS selector for further interaction.\n", + "\n", + "### Tool features\n", + "\n", + "| Tool | Web Data Extraction | Web Element Extraction | Use With Local Browser |\n", + "| :--- | :---: | :---: | :---: |\n", + "| extract_web_data_with_rest_api | ✅ | ❌ | ❌\n", + "| extract_web_data_from_browser | ✅ | ❌ | ✅\n", + "| get_web_element_from_browser | ❌ | ✅ | ✅" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-tools-agentql llama-index-tools-playwright llama-index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Credentials\n", + "\n", + "To use the AgentQL tools, you will need to get your own API key from the [AgentQL Dev Portal](https://dev.agentql.com/) and set the `AGENTQL_API_KEY` environment variable:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"AGENTQL_API_KEY\"] = \"YOUR_AGENTQL_API_KEY\"" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Set up Playwright browser and AgentQL tools\n", + "To run this notebook, install the Playwright browser and configure Jupyter Notebook's `asyncio` loop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!playwright install\n", + "\n", + "# This import is required only for jupyter notebooks, since they have their own eventloop\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `AgentQLRestAPIToolSpec`\n", + "`AgentQLRestAPIToolSpec` provides the `extract_web_data_with_rest_api` function tool.\n", + "\n", + "You can instantiate `AgentQLRestAPIToolSpec` with the following params:\n", + "- `timeout`: The number of seconds to wait for a request before timing out. Increase if data extraction times out. **Defaults to `900`.**\n", + "- `is_stealth_mode_enabled`: Whether to enable experimental anti-bot evasion strategies. This feature may not work for all websites at all times. Data extraction may take longer to complete with this mode enabled. **Defaults to `False`.**\n", + "- `wait_for`: The number of seconds to wait for the page to load before extracting data. **Defaults to `0`.**\n", + "- `is_scroll_to_bottom_enabled`: Whether to scroll to the bottom of the page before extracting data. **Defaults to `False`.**\n", + "- `mode`: `\"standard\"` uses deep data analysis, while `\"fast\"` trades some depth of analysis for speed and is adequate for most use cases. [Learn more about the modes in this guide.](https://docs.agentql.com/accuracy/standard-mode) **Defaults to `\"fast\"`.**\n", + "- `is_screenshot_enabled`: Whether to take a screenshot before extracting data. Returned in 'metadata' as a Base64 string. 
**Defaults to `False`.**\n", + "\n", + "`AgentQLRestAPIToolSpec` uses the AgentQL REST API. For more details about the parameters, read the [API Reference docs](https://docs.agentql.com/rest-api/api-reference)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.tools.agentql import AgentQLRestAPIToolSpec\n", + "\n", + "agentql_rest_api_tool = AgentQLRestAPIToolSpec()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `AgentQLBrowserToolSpec`\n", + "`AgentQLBrowserToolSpec` provides two function tools: `extract_web_data_from_browser` and `get_web_element_from_browser`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`AgentQLBrowserToolSpec` takes the following params:\n", + "- `async_browser`: An async playwright browser instance.\n", + "- `timeout_for_data`: The number of seconds to wait for an extract data request before timing out. **Defaults to `900`.**\n", + "- `timeout_for_element`: The number of seconds to wait for a get element request before timing out. **Defaults to `900`.**\n", + "- `wait_for_network_idle`: Whether to wait until the network reaches a full idle state before executing. **Defaults to `True`.**\n", + "- `include_hidden_for_data`: Whether to take into account visually hidden elements on the page for extract data. **Defaults to `True`.**\n", + "- `include_hidden_for_element`: Whether to take into account visually hidden elements on the page for get element. **Defaults to `False`.**\n", + "- `mode`: `\"standard\"` uses deep data analysis, while `\"fast\"` trades some depth of analysis for speed and is adequate for most use cases. [Learn more about the modes in this guide.](https://docs.agentql.com/accuracy/standard-mode) **Defaults to `\"fast\"`.**\n", + "\n", + "`AgentQLBrowserToolSpec` uses the AgentQL SDK. 
You can find more details about the parameters and the functions in the [SDK API Reference](https://docs.agentql.com/python-sdk/api-references/agentql-page)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> **Note:** To instantiate `AgentQLBrowserToolSpec` you need to provide a browser instance. You can create one using the `create_async_playwright_browser` utility method from LlamaIndex's Playwright ToolSpec." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.tools.playwright.base import PlaywrightToolSpec\n", + "from llama_index.tools.agentql import AgentQLBrowserToolSpec\n", + "\n", + "async_browser = await PlaywrightToolSpec.create_async_playwright_browser()\n", + "agentql_browser_tool = AgentQLBrowserToolSpec(async_browser=async_browser)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Invoking the AgentQL tools" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `extract_web_data_with_rest_api`\n", + "\n", + "This tool uses AgentQL's REST API under the hood, sending the publicly available web page's URL to AgentQL's endpoint. This will not work with private pages or logged-in sessions. Use `extract_web_data_from_browser` for those use cases.\n", + "\n", + "- `url`: The URL of the web page you want to extract data from.\n", + "- `query`: The AgentQL query to execute. Use this if you want to extract data in a structure you define. Learn more about [how to write an AgentQL query in the docs](https://docs.agentql.com/agentql-query).\n", + "- `prompt`: A Natural Language description of the data to extract from the page. AgentQL will infer the data’s structure from your prompt.\n", + "\n", + "> **Note:** You must define either a `query` or a `prompt` to use AgentQL." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': {'posts': [{'title': 'AgentQL MCP Server: Structured Web Data for Claude, Cursor, Windsurf, and more',\n", + " 'url': 'https://www.agentql.com/blog/2025-mcp-integration',\n", + " 'author': 'Rachel-Lee Nabors',\n", + " 'date': 'Mar 12, 2025'},\n", + " {'title': 'Dify + AgentQL: Build AI Apps with Live Web Data, No Code Needed',\n", + " 'url': 'https://www.agentql.com/blog/2025-dify-integration',\n", + " 'author': 'Rachel-Lee Nabors',\n", + " 'date': 'Mar 11, 2025'},\n", + " {'title': 'Zapier + AgentQL: No-Code Web Data for Smarter Workflows',\n", + " 'url': 'https://www.agentql.com/blog/2025-zapier-integration',\n", + " 'author': 'Rachel-Lee Nabors',\n", + " 'date': 'Mar 10, 2025'},\n", + " {'title': 'Something is coming.',\n", + " 'url': 'https://www.agentql.com/blog/2025-iw-teaser',\n", + " 'author': 'Rachel-Lee Nabors',\n", + " 'date': 'Mar 7, 2025'},\n", + " {'title': 'Automated web application testing with AI and Playwright',\n", + " 'url': 'https://www.agentql.com/blog/2025-automated-testing-web-ai-playwright',\n", + " 'author': 'Vladimir de Turckheim',\n", + " 'date': 'Feb 26, 2025'}]},\n", + " 'metadata': {'request_id': '5a43ab86-f68b-4470-bca9-ab51a791041a',\n", + " 'generated_query': None,\n", + " 'screenshot': None}}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can invoke the tool with either a query or a prompt\n", + "\n", + "# await agentql_rest_api_tool.extract_web_data_with_rest_api(\n", + "# url=\"https://www.agentql.com/blog\",\n", + "# prompt=\"the blog posts with title, url, author and publication date\",\n", + "# )\n", + "\n", + "await agentql_rest_api_tool.extract_web_data_with_rest_api(\n", + " url=\"https://www.agentql.com/blog\",\n", + " query=\"{ posts[] { title url author date }}\",\n", + ")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "#### Stealth Mode\n", + "AgentQL provides experimental anti-bot evasion strategies to avoid detection by anti-bot services.\n", + "\n", + "> **Note**: Stealth mode is experimental and may not work for all websites at all times. The data extraction may take longer to complete comparing to non-stealth mode." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': {'items': [{'name': \"W's Recycled Down Sweaterâ„¢ Parka - Pitch Blue (PIBL) (28460)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Recycled Down Sweaterâ„¢ Parka - Shelter Brown (SHBN) (28460)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Recycled Down Sweaterâ„¢ Parka - Pine Needle Green (PNGR) (28460)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Recycled Down Sweaterâ„¢ Parka - Burnished Red (BURR) (28460)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Nano Puff® Jacket - Burnished Red (BURR) (84217)\",\n", + " 'price': 118.99},\n", + " {'name': \"W's Nano Puff® Jacket - Pine Needle Green (PNGR) (84217)\",\n", + " 'price': 118.99},\n", + " {'name': \"W's Powder Town Jacket - Vivid Apricot (VAPC) (31635)\",\n", + " 'price': 208.99},\n", + " {'name': \"W's Powder Town Jacket - Pine Needle Green (PNGR) (31635)\",\n", + " 'price': 208.99},\n", + " {'name': \"W's Powder Town Jacket - Dulse Mauve (DLMA) (31635)\",\n", + " 'price': 208.99},\n", + " {'name': \"W's Powder Town Jacket - Smolder Blue w/Dulse Mauve (SBMA) (31635)\",\n", + " 'price': 208.99},\n", + " {'name': \"W's Powder Town Pants - Pine Needle Green (PNGR) (31645)\",\n", + " 'price': 148.99},\n", + " {'name': \"W's Powder Town Pants - Thermal Blue (TMBL) (31645)\",\n", + " 'price': 173.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Dulse Mauve (DLMA) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Synched Flight Small: 
Natural (SYNL) (25455)\",\n", + " 'price': 96.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Thermal Blue (TMBL) (25455)\",\n", + " 'price': 82.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Across Oceans: Pitch Blue (ASPH) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Terra Pink (TRPI) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Small Currents: Natural (SCNL) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Nickel w/Vivid Apricot (NLVA) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Echo Purple (ECPU) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Lightweight Synchilla® Snap-T® Pullover - Oatmeal Heather w/Vessel Blue (OHVL) (25455)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Down Sweaterâ„¢ - Seabird Grey (SBDY) (84684)\",\n", + " 'price': 166.99},\n", + " {'name': \"W's Pine Bank 3-in-1 Parka - Shelter Brown (SHBN) (21025)\",\n", + " 'price': 273.99},\n", + " {'name': \"W's Pine Bank 3-in-1 Parka - Pitch Blue (PIBL) (21025)\",\n", + " 'price': 328.99},\n", + " {'name': \"W's Pine Bank 3-in-1 Parka - Burnished Red (BURR) (21025)\",\n", + " 'price': 273.99},\n", + " {'name': \"W's Pine Bank 3-in-1 Parka - Pine Needle Green (PNGR) (21025)\",\n", + " 'price': 273.99},\n", + " {'name': \"W's SnowDrifter Jacket - Vessel Blue (VSLB) (30071)\",\n", + " 'price': 268.99},\n", + " {'name': \"W's SnowDrifter Jacket - Dulse Mauve (DLMA) (30071)\",\n", + " 'price': 268.99},\n", + " {'name': \"W's SnowDrifter Jacket - Vivid Apricot (VAPC) (30071)\",\n", + " 'price': 268.99},\n", + " {'name': \"W's SnowDrifter Jacket - Thermal Blue (TMBL) (30071)\",\n", + " 'price': 268.99},\n", + " {'name': \"W's Re-Tool Half-Snap Pullover - Burnished Red (BURR) (26465)\",\n", + " 'price': 78.99},\n", + " {'name': 
\"W's Re-Tool Half-Snap Pullover - Vessel Blue (VSLB) (26465)\",\n", + " 'price': 94.99},\n", + " {'name': \"W's Re-Tool Half-Snap Pullover - Dulse Mauve (DLMA) (26465)\",\n", + " 'price': 78.99},\n", + " {'name': \"W's Re-Tool Half-Snap Pullover - Shelter Brown (SHBN) (26465)\",\n", + " 'price': 78.99},\n", + " {'name': \"W's Insulated Storm Shift Jacket - Dulse Mauve (DLMA) (31835)\",\n", + " 'price': 383.99},\n", + " {'name': \"W's Insulated Storm Shift Jacket - Pine Needle Green (PNGR) (31835)\",\n", + " 'price': 328.99},\n", + " {'name': \"W's SnowDrifter Bibs - Black (BLK) (30081)\", 'price': 238.99},\n", + " {'name': \"W's SnowDrifter Bibs - Smolder Blue (SMDB) (30081)\",\n", + " 'price': 278.99},\n", + " {'name': \"W's SnowDrifter Bibs - Dulse Mauve (DLMA) (30081)\",\n", + " 'price': 238.99},\n", + " {'name': \"W's SnowDrifter Bibs - Pine Needle Green (PNGR) (30081)\",\n", + " 'price': 238.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Chevron Cable: Natural (CHNL) (51025)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Only Earth: Beeswax Tan (OETN) (51025)\",\n", + " 'price': 103.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Snowdrift: Thermal Blue (SDTL) (51025)\",\n", + " 'price': 88.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Ridge: Pine Needle Green (RPNG) (51025)\",\n", + " 'price': 88.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Chevron Cable: Madder Red (CHMR) (51025)\",\n", + " 'price': 88.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Smolder Blue (SMDB) (51025)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Recycled Wool-Blend Crewneck Sweater - Fireside: Shelter Brown (FISN) (51025)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Micro D® Joggers - Synched Flight Small: Natural (SYNL) (22020)\",\n", + " 'price': 48.99},\n", + " {'name': \"W's Micro D® Joggers - Endless Blue (ENLB) 
(22020)\",\n", + " 'price': 58.99},\n", + " {'name': \"W's Micro D® Joggers - Small Currents: Natural (SCNL) (22020)\",\n", + " 'price': 48.99},\n", + " {'name': \"W's Better Sweater® 1/4-Zip - Stormy Mauve (STMA) (25618)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Better Sweater® 1/4-Zip - Dulse Mauve (DLMA) (25618)\",\n", + " 'price': 82.99},\n", + " {'name': \"W's Better Sweater® 1/4-Zip - Torrey Pine Green (TPGN) (25618)\",\n", + " 'price': 82.99},\n", + " {'name': \"W's Better Sweater® 1/4-Zip - Nouveau Green (NUVG) (25618)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Better Sweater® 1/4-Zip - Raptor Brown (RPBN) (25618)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Insulated Powder Town Pants - Black (BLK) (31185)\",\n", + " 'price': 160.99},\n", + " {'name': \"W's Insulated Powder Town Pants - Smolder Blue (SMDB) (31185)\",\n", + " 'price': 160.99},\n", + " {'name': \"W's Insulated Powder Town Pants - Dulse Mauve (DLMA) (31185)\",\n", + " 'price': 160.99},\n", + " {'name': \"W's Insulated Powder Town Pants - Vivid Apricot (VAPC) (31185)\",\n", + " 'price': 160.99},\n", + " {'name': \"W's Insulated Powder Town Pants - Across Oceans: Smolder Blue (ASBE) (31185)\",\n", + " 'price': 160.99},\n", + " {'name': 'Atom Sling 8L - Vessel Blue (VSLB) (48262)', 'price': 44.99},\n", + " {'name': 'Atom Sling 8L - Buckhorn Green (BUGR) (48262)', 'price': 44.99},\n", + " {'name': 'Atom Sling 8L - Dulse Mauve (DLMA) (48262)', 'price': 44.99},\n", + " {'name': \"W's Classic Retro-X® Jacket - Natural w/Smolder Blue (NTSB) (23074)\",\n", + " 'price': 136.99},\n", + " {'name': \"W's Classic Retro-X® Jacket - Nest Brown w/Dulse Mauve (NBDU) (23074)\",\n", + " 'price': 113.99},\n", + " {'name': \"W's Classic Retro-X® Jacket - Small Currents: Natural (SCNL) (23074)\",\n", + " 'price': 113.99},\n", + " {'name': \"W's Los Gatos 1/4-Zip - Salt Grey (SGRY) (25236)\",\n", + " 'price': 53.99},\n", + " {'name': \"W's Los Gatos 1/4-Zip - Dulse Mauve (DLMA) (25236)\",\n", 
+ " 'price': 64.99},\n", + " {'name': \"W's Stand Up® Cropped Corduroy Overalls - Nest Brown (NESB) (75100)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Stand Up® Cropped Corduroy Overalls - Pitch Blue (PIBL) (75100)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Stand Up® Cropped Corduroy Overalls - Beeswax Tan (BWX) (75100)\",\n", + " 'price': 68.99},\n", + " {'name': \"W's Synchilla® Jacket - Oatmeal Heather w/Natural (OTNL) (22955)\",\n", + " 'price': 88.99},\n", + " {'name': \"W's Synchilla® Jacket - Black (BLK) (22955)\", 'price': 73.99},\n", + " {'name': \"W's Synchilla® Jacket - Pitch Blue (PIBL) (22955)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Synchilla® Jacket - Beeswax Tan (BWX) (22955)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Insulated Powder Town Jacket - Vivid Apricot (VAPC) (31200)\",\n", + " 'price': 238.99},\n", + " {'name': \"W's Insulated Powder Town Jacket - Black (BLK) (31200)\",\n", + " 'price': 278.99},\n", + " {'name': \"W's Insulated Powder Town Jacket - Across Oceans: Smolder Blue (ASBE) (31200)\",\n", + " 'price': 238.99},\n", + " {'name': \"W's Powder Town Bibs - Smolder Blue (SMDB) (31650)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Powder Town Bibs - Dulse Mauve (DLMA) (31650)\",\n", + " 'price': 208.99},\n", + " {'name': \"W's Powder Town Bibs - Pine Needle Green (PNGR) (31650)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Powder Town Bibs - Seabird Grey (SBDY) (31650)\",\n", + " 'price': 178.99},\n", + " {'name': \"W's Retro Pile Marsupial - Thermal Blue (TMBL) (22835)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Retro Pile Marsupial - Shroom Taupe (STPE) (22835)\",\n", + " 'price': 88.99},\n", + " {'name': \"W's Retro Pile Marsupial - Shelter Brown (SHBN) (22835)\",\n", + " 'price': 73.99},\n", + " {'name': \"W's Cord Fjord Coat - Dulse Mauve (DLMA) (26881)\",\n", + " 'price': 163.99},\n", + " {'name': \"W's Cord Fjord Coat - Shelter Brown (SHBN) (26881)\",\n", + " 'price': 
163.99},\n", + " {'name': \"W's Regenerative Organic Certified® Cotton Essential Top - Thermal Blue (TMBL) (42171)\",\n", + " 'price': 41.99},\n", + " {'name': \"W's Regenerative Organic Certified® Cotton Essential Top - Pine Needle Green (PNGR) (42171)\",\n", + " 'price': 41.99},\n", + " {'name': \"W's Lonesome Mesa Long Coat - Pitch Blue (PIBL) (26655)\",\n", + " 'price': 148.99},\n", + " {'name': \"W's Lonesome Mesa Long Coat - Pine Needle Green (PNGR) (26655)\",\n", + " 'price': 148.99}]},\n", + " 'metadata': {'request_id': '0016c761-92c1-47b5-9b8f-f71f9727d58d',\n", + " 'generated_query': None,\n", + " 'screenshot': None}}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# agentql_rest_api_tool = AgentQLRestAPIToolSpec(is_stealth_mode_enabled=True)\n", + "\n", + "await agentql_rest_api_tool.extract_web_data_with_rest_api(\n", + " url=\"https://www.patagonia.com/shop/web-specials/womens\",\n", + " query=\"{ items[] { name price}}\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `extract_web_data_from_browser`\n", + "\n", + "- `query`: The AgentQL query to execute. Use this if you want to extract data in a structure you define. Learn more about [how to write an AgentQL query in the docs](https://docs.agentql.com/agentql-query).\n", + "- `prompt`: A Natural Language description of the data to extract from the page. AgentQL will infer the data’s structure from your prompt.\n", + "\n", + "> **Note:** You must define either a `query` or a `prompt` to use AgentQL.\n", + "\n", + "To extract data, first you must navigate to a web page using LlamaIndex's [Playwright](https://docs.llamaindex.ai/en/stable/api_reference/tools/playwright/) `navigate_to` tool."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jisonz/Library/Caches/pypoetry/virtualenvs/llama-index-AJEGkUS0-py3.13/lib/python3.13/site-packages/agentql/_core/_utils.py:167: UserWarning: \u001b[31m🚨 The function get_data_by_prompt_experimental is experimental and may not work as expected 🚨\u001b[0m\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "{'blog_post': [{'title': 'AgentQL MCP Server: Structured Web Data for Claude, Cursor, Windsurf, and more',\n", + " 'url': 'https://www.agentql.com/blog/2025-mcp-integration'},\n", + " {'title': 'Dify + AgentQL: Build AI Apps with Live Web Data, No Code Needed',\n", + " 'url': 'https://www.agentql.com/blog/2025-dify-integration'},\n", + " {'title': 'Zapier + AgentQL: No-Code Web Data for Smarter Workflows',\n", + " 'url': 'https://www.agentql.com/blog/2025-zapier-integration'},\n", + " {'title': 'Something is coming.',\n", + " 'url': 'https://www.agentql.com/blog/2025-iw-teaser'},\n", + " {'title': 'Automated web application testing with AI and Playwright',\n", + " 'url': 'https://www.agentql.com/blog/2025-automated-testing-web-ai-playwright'}]}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "playwright_tool = PlaywrightToolSpec(async_browser=async_browser)\n", + "await playwright_tool.navigate_to(\"https://www.agentql.com/blog\")\n", + "\n", + "# You can invoke the tool with either a query or a prompt\n", + "\n", + "# await agentql_browser_tool.extract_web_data_from_browser(\n", + "# query=\"{ posts[] { title url }}\",\n", + "# )\n", + "\n", + "await agentql_browser_tool.extract_web_data_from_browser(\n", + " prompt=\"the blog posts with title and url\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `get_web_element_from_browser`\n", + "\n", + "- `prompt`: A Natural 
Language description of the web element to find on the page." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://www.agentql.com/blog\n" + ] + }, + { + "data": { + "text/plain": [ + "\"[tf623_id='1111']\"" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "await playwright_tool.navigate_to(\"https://www.agentql.com/blog\")\n", + "print(await playwright_tool.get_current_page())\n", + "\n", + "next_page_button = await agentql_browser_tool.get_web_element_from_browser(\n", + " prompt=\"The next page navigation button\",\n", + ")\n", + "next_page_button" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Click on the element and check the url again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Clicked element '[tf623_id='1111']'\"" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "await playwright_tool.click(next_page_button)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://www.agentql.com/blog/page/2\n" + ] + } + ], + "source": [ + "print(await playwright_tool.get_current_page())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the AgentQL tools with agent\n", + "To get started, you will need an [OpenAI api key](https://platform.openai.com/account/api-keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set your openai key, if using openai\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.agent import FunctionCallingAgent\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "# We add playwright's click, get_current_page, and navigate_to tools to the agent along with agentql tools\n", + "playwright_tool = PlaywrightToolSpec(async_browser=async_browser)\n", + "playwright_tool_list = playwright_tool.to_tool_list()\n", + "playwright_agent_tool_list = [\n", + " tool\n", + " for tool in playwright_tool_list\n", + " if tool.metadata.name in [\"click\", \"get_current_page\", \"navigate_to\"]\n", + "]\n", + "\n", + "agent = FunctionCallingAgent.from_tools(\n", + " playwright_agent_tool_list + agentql_browser_tool.to_tool_list(),\n", + " llm=OpenAI(model=\"gpt-4o\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I have extracted the blog post titled \"What I wish someone had told me\" along with the number of views. Here are the details:\n", + "\n", + "**Blog Text:**\n", + "> Optimism, obsession, self-belief, raw horsepower and personal connections are how things get started. Cohesive teams, the right combination of calmness and urgency, and unreasonable commitment are how things get finished. Long-term orientation is in short supply; try not to worry about what people think in the short term, which will get easier over time. It is easier for a team to do a hard thing that really matters than to do an easy thing that doesn’t really matter; audacious ideas motivate people. Incentives are superpowers; set them carefully. Concentrate your resources on a small number of high-conviction bets; this is easy to say but evidently hard to do. You can delete more stuff than you think. Communicate clearly and concisely. Fight bullshit and bureaucracy every time you see it and get other people to fight it too. 
Do not let the org chart get in the way of people working productively together. Outcomes are what count; don’t let good process excuse bad results. Spend more time recruiting. Take risks on high-potential people with a fast rate of improvement. Look for evidence of getting stuff done in addition to intelligence. Superstars are even more valuable than they seem, but you have to evaluate people on their net impact on the performance of the organization. Fast iteration can make up for a lot; it’s usually ok to be wrong if you iterate quickly. Plans should be measured in decades, execution should be measured in weeks. Don’t fight the business equivalent of the laws of physics. Inspiration is perishable and life goes by fast. Inaction is a particularly insidious type of risk. Scale often has surprising emergent properties. Compounding exponentials are magic. In particular, you really want to build a business that gets a compounding advantage with scale. Get back up and keep going. Working with great people is one of the best parts of life.\n", + "\n", + "**Number of Views:** 531,222\n" + ] + } + ], + "source": [ + "print(\n", + " agent.chat(\n", + " \"\"\"\n", + " Navigate to https://blog.samaltman.com/archive,\n", + " Find blog posts titled \"What I wish someone had told me\", click on the link,\n", + " Extract the blog text and number of views.\n", + " \"\"\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the playwright tool with agent workflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.llms.openai import OpenAI\n", + "from llama_index.core.agent.workflow import AgentWorkflow\n", + "\n", + "from llama_index.core.agent.workflow import (\n", + " AgentInput,\n", + " AgentOutput,\n", + " ToolCall,\n", + " ToolCallResult,\n", + " AgentStream,\n", + ")\n", + "\n", + "playwright_tool_list = playwright_tool.to_tool_list()\n", + 
"playwright_agent_tool_list = [\n", + " tool\n", + " for tool in playwright_tool_list\n", + " if tool.metadata.name in [\"click\", \"get_current_page\", \"navigate_to\"]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "navigate_to\n", + "{'url': 'https://blog.samaltman.com/archive'}\n", + "Navigating to https://blog.samaltman.com/archive returned status code 200\n", + "get_web_element_from_browser\n", + "{'prompt': \"blog post titled 'What I wish someone had told me'\"}\n", + "[tf623_id='1849']\n", + "click\n", + "{'selector': \"[tf623_id='1849']\"}\n", + "Clicked element '[tf623_id='1849']'\n", + "get_current_page\n", + "{}\n", + "https://blog.samaltman.com/what-i-wish-someone-had-told-me\n", + "extract_web_data_from_browser\n", + "{'prompt': 'Extract the blog text and number of views from the page.'}\n", + "{'blog_post_text': 'Optimism, obsession, self-belief, raw horsepower and personal connections are how things get started.\\nCohesive teams, the right combination of calmness and urgency, and unreasonable commitment are how things get finished. Long-term orientation is in short supply; try not to worry about what people think in the short term, which will get easier over time.\\nIt is easier for a team to do a hard thing that really matters than to do an easy thing that doesn’t really matter; audacious ideas motivate people.\\nIncentives are superpowers; set them carefully.\\nConcentrate your resources on a small number of high-conviction bets; this is easy to say but evidently hard to do. You can delete more stuff than you think.\\nCommunicate clearly and concisely.\\nFight bullshit and bureaucracy every time you see it and get other people to fight it too. Do not let the org chart get in the way of people working productively together.\\nOutcomes are what count; don’t let good process excuse bad results.\\nSpend more time recruiting. 
Take risks on high-potential people with a fast rate of improvement. Look for evidence of getting stuff done in addition to intelligence.\\nSuperstars are even more valuable than they seem, but you have to evaluate people on their net impact on the performance of the organization.\\nFast iteration can make up for a lot; it’s usually ok to be wrong if you iterate quickly. Plans should be measured in decades, execution should be measured in weeks.\\nDon’t fight the business equivalent of the laws of physics.\\nInspiration is perishable and life goes by fast. Inaction is a particularly insidious type of risk.\\nScale often has surprising emergent properties.\\nCompounding exponentials are magic. In particular, you really want to build a business that gets a compounding advantage with scale.\\nGet back up and keep going.\\nWorking with great people is one of the best parts of life.', 'views_count': 531223}\n", + "I have navigated to the blog post titled \"What I Wish Someone Had Told Me\" and extracted the following information:\n", + "\n", + "**Blog Text:**\n", + "Optimism, obsession, self-belief, raw horsepower and personal connections are how things get started.\n", + "Cohesive teams, the right combination of calmness and urgency, and unreasonable commitment are how things get finished. Long-term orientation is in short supply; try not to worry about what people think in the short term, which will get easier over time.\n", + "It is easier for a team to do a hard thing that really matters than to do an easy thing that doesn’t really matter; audacious ideas motivate people.\n", + "Incentives are superpowers; set them carefully.\n", + "Concentrate your resources on a small number of high-conviction bets; this is easy to say but evidently hard to do. You can delete more stuff than you think.\n", + "Communicate clearly and concisely.\n", + "Fight bullshit and bureaucracy every time you see it and get other people to fight it too. 
Do not let the org chart get in the way of people working productively together.\n", + "Outcomes are what count; don’t let good process excuse bad results.\n", + "Spend more time recruiting. Take risks on high-potential people with a fast rate of improvement. Look for evidence of getting stuff done in addition to intelligence.\n", + "Superstars are even more valuable than they seem, but you have to evaluate people on their net impact on the performance of the organization.\n", + "Fast iteration can make up for a lot; it’s usually ok to be wrong if you iterate quickly. Plans should be measured in decades, execution should be measured in weeks.\n", + "Don’t fight the business equivalent of the laws of physics.\n", + "Inspiration is perishable and life goes by fast. Inaction is a particularly insidious type of risk.\n", + "Scale often has surprising emergent properties.\n", + "Compounding exponentials are magic. In particular, you really want to build a business that gets a compounding advantage with scale.\n", + "Get back up and keep going.\n", + "Working with great people is one of the best parts of life.\n", + "\n", + "**Number of Views:** 531,223" + ] + } + ], + "source": [ + "llm = OpenAI(model=\"gpt-4o\")\n", + "\n", + "workflow = AgentWorkflow.from_tools_or_functions(\n", + " playwright_agent_tool_list + agentql_browser_tool.to_tool_list(),\n", + " llm=llm,\n", + " system_prompt=\"You are a helpful assistant that can do browser automation, data extraction and text summarization\",\n", + ")\n", + "\n", + "handler = workflow.run(\n", + " user_msg=\"\"\"\n", + " Navigate to https://blog.samaltman.com/archive,\n", + " Find blog posts titled \"What I wish someone had told me\", click on the link,\n", + " Detect if the webpage has navigated to the blog post, \n", + " then extract the blog text and number of views.\n", + " \"\"\"\n", + ")\n", + "\n", + "async for event in handler.stream_events():\n", + " if isinstance(event, AgentStream):\n", + " print(event.delta, 
end=\"\", flush=True)\n", + " elif isinstance(event, ToolCallResult):\n", + " print(event.tool_name) # the tool name\n", + " print(event.tool_kwargs) # the tool kwargs\n", + " print(event.tool_output) # the tool output" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama-index-zpEnpL0o-py3.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/BUILD b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/BUILD new file mode 100644 index 0000000000..ddbe0c9161 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/__init__.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/__init__.py new file mode 100644 index 0000000000..4115f9f993 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/__init__.py @@ -0,0 +1,11 @@ +from llama_index.tools.agentql.agentql_browser_tool.base import ( + AgentQLBrowserToolSpec, +) +from llama_index.tools.agentql.agentql_rest_api_tool.base import ( + AgentQLRestAPIToolSpec, +) + +__all__ = [ + "AgentQLBrowserToolSpec", + "AgentQLRestAPIToolSpec", +] diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/BUILD b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/BUILD new file mode 100644 index 0000000000..ddbe0c9161 --- /dev/null +++ 
b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/__init__.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/base.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/base.py new file mode 100644 index 0000000000..e7bb2ee1ff --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_browser_tool/base.py @@ -0,0 +1,129 @@ +from typing import Optional + +from playwright.async_api import Browser as AsyncBrowser + +from llama_index.core.tools.tool_spec.base import BaseToolSpec + +from llama_index.tools.agentql.const import ( + DEFAULT_EXTRACT_DATA_TIMEOUT_SECONDS, + DEFAULT_EXTRACT_ELEMENTS_TIMEOUT_SECONDS, + DEFAULT_WAIT_FOR_NETWORK_IDLE, + DEFAULT_INCLUDE_HIDDEN_DATA, + DEFAULT_INCLUDE_HIDDEN_ELEMENTS, + DEFAULT_RESPONSE_MODE, + REQUEST_ORIGIN, +) +from llama_index.tools.agentql.messages import ( + QUERY_PROMPT_REQUIRED_ERROR_MESSAGE, + QUERY_PROMPT_EXCLUSIVE_ERROR_MESSAGE, +) +from llama_index.tools.agentql.utils import _aget_current_agentql_page + + +class AgentQLBrowserToolSpec(BaseToolSpec): + """ + AgentQL Browser Tool Spec. 
+ """ + + spec_functions = [ + "extract_web_data_from_browser", + "get_web_element_from_browser", + ] + + def __init__( + self, + async_browser: AsyncBrowser, + timeout_for_data: int = DEFAULT_EXTRACT_DATA_TIMEOUT_SECONDS, + timeout_for_element: int = DEFAULT_EXTRACT_ELEMENTS_TIMEOUT_SECONDS, + wait_for_network_idle: bool = DEFAULT_WAIT_FOR_NETWORK_IDLE, + include_hidden_for_data: bool = DEFAULT_INCLUDE_HIDDEN_DATA, + include_hidden_for_element: bool = DEFAULT_INCLUDE_HIDDEN_ELEMENTS, + mode: str = DEFAULT_RESPONSE_MODE, + ): + """ + Initialize AgentQL Browser Tool Spec. + + Args: + async_browser: An async playwright browser instance. + timeout_for_data: The number of seconds to wait for an extract data request before timing out. Defaults to 900. + timeout_for_element: The number of seconds to wait for a get element request before timing out. Defaults to 300. + wait_for_network_idle: Whether to wait for network idle state. Defaults to `True`. + include_hidden_for_data: Whether to take into account visually hidden elements on the page for extract data. Defaults to `True`. + include_hidden_for_element: Whether to take into account visually hidden elements on the page for get element. Defaults to `False`. + + mode: `standard` uses deep data analysis, while `fast` trades some depth of analysis for speed and is adequate for most use cases. + Learn more about the modes in this guide: https://docs.agentql.com/accuracy/standard-mode. Defaults to `fast`.
+ """ + self.async_browser = async_browser + self.timeout_for_data = timeout_for_data + self.timeout_for_element = timeout_for_element + self.wait_for_network_idle = wait_for_network_idle + self.include_hidden_for_data = include_hidden_for_data + self.include_hidden_for_element = include_hidden_for_element + self.mode = mode + + async def extract_web_data_from_browser( + self, + query: Optional[str] = None, + prompt: Optional[str] = None, + ) -> dict: + """ + Extracts structured data as JSON from the active web page in a running browser instance using either an AgentQL query or a Natural Language description of the data. + + Args: + query: AgentQL query used to extract the data. The query must be enclosed with curly braces `{}`. Either this field or `prompt` field must be provided. + prompt: Natural Language description of the data to extract from the page. If AgentQL query is not specified, always use the `prompt` field. Either this field or `query` field must be provided. + + Returns: + dict: The extracted data + """ + # Check that query and prompt cannot be both empty or both provided + if not query and not prompt: + raise ValueError(QUERY_PROMPT_REQUIRED_ERROR_MESSAGE) + if query and prompt: + raise ValueError(QUERY_PROMPT_EXCLUSIVE_ERROR_MESSAGE) + + page = await _aget_current_agentql_page(self.async_browser) + if query: + return await page.query_data( + query, + self.timeout_for_data, + self.wait_for_network_idle, + self.include_hidden_for_data, + self.mode, + request_origin=REQUEST_ORIGIN, + ) + else: + return await page.get_data_by_prompt_experimental( + prompt, + self.timeout_for_data, + self.wait_for_network_idle, + self.include_hidden_for_data, + self.mode, + request_origin=REQUEST_ORIGIN, + ) + + async def get_web_element_from_browser( + self, + prompt: str, + ) -> str: + """ + Finds a web element on the active web page in a running browser instance using element’s Natural Language description and returns its CSS selector for further interaction, like clicking, filling a form
field, etc. + + Args: + prompt: Natural Language description of the web element to find on the page. + + Returns: + str: The CSS selector of the target element. + """ + page = await _aget_current_agentql_page(self.async_browser) + element = await page.get_by_prompt( + prompt, + self.timeout_for_element, + self.wait_for_network_idle, + self.include_hidden_for_element, + self.mode, + request_origin=REQUEST_ORIGIN, + ) + tf_id = await element.get_attribute("tf623_id") + return f"[tf623_id='{tf_id}']" diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/BUILD b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/BUILD new file mode 100644 index 0000000000..ddbe0c9161 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/__init__.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/base.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/base.py new file mode 100644 index 0000000000..b3042bf682 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/agentql_rest_api_tool/base.py @@ -0,0 +1,98 @@ +from typing import Optional +import os + +from llama_index.core.tools.tool_spec.base import BaseToolSpec + +from llama_index.tools.agentql.const import ( + DEFAULT_API_TIMEOUT_SECONDS, + DEFAULT_IS_STEALTH_MODE_ENABLED, + DEFAULT_WAIT_FOR_PAGE_LOAD_SECONDS, + DEFAULT_IS_SCROLL_TO_BOTTOM_ENABLED, + 
DEFAULT_RESPONSE_MODE, + DEFAULT_IS_SCREENSHOT_ENABLED, +) +from llama_index.tools.agentql.messages import UNSET_API_KEY_ERROR_MESSAGE +from llama_index.tools.agentql.utils import _aload_data + + +class AgentQLRestAPIToolSpec(BaseToolSpec): + """ + AgentQL Rest API Tool Spec. + """ + + spec_functions = [ + "extract_web_data_with_rest_api", + ] + + def __init__( + self, + timeout: int = DEFAULT_API_TIMEOUT_SECONDS, + is_stealth_mode_enabled: bool = DEFAULT_IS_STEALTH_MODE_ENABLED, + wait_for: int = DEFAULT_WAIT_FOR_PAGE_LOAD_SECONDS, + is_scroll_to_bottom_enabled: bool = DEFAULT_IS_SCROLL_TO_BOTTOM_ENABLED, + mode: str = DEFAULT_RESPONSE_MODE, + is_screenshot_enabled: bool = DEFAULT_IS_SCREENSHOT_ENABLED, + ): + """ + Initialize AgentQL Rest API Tool Spec. + + Args: + timeout: The number of seconds to wait for a request before timing out. Defaults to 900. + + is_stealth_mode_enabled: Whether to enable experimental anti-bot evasion strategies. This feature may not work for all websites at all times. + Data extraction may take longer to complete with this mode enabled. Defaults to `False`. + + wait_for: The number of seconds to wait for the page to load before extracting data. Defaults to 0. + is_scroll_to_bottom_enabled: Whether to scroll to bottom of the page before extracting data. Defaults to `False`. + + mode: 'standard' uses deep data analysis, while 'fast' trades some depth of analysis for speed and is adequate for most use cases. + Learn more about the modes in this guide: https://docs.agentql.com/accuracy/standard-mode. Defaults to 'fast'. + + is_screenshot_enabled: Whether to take a screenshot before extracting data. Returned in 'metadata' as a Base64 string. Defaults to `False`.
+ """ + self._api_key = os.getenv("AGENTQL_API_KEY") + if not self._api_key: + raise ValueError(UNSET_API_KEY_ERROR_MESSAGE) + self.timeout = timeout + self.is_stealth_mode_enabled = is_stealth_mode_enabled + self.wait_for = wait_for + self.is_scroll_to_bottom_enabled = is_scroll_to_bottom_enabled + self.mode = mode + self.is_screenshot_enabled = is_screenshot_enabled + + async def extract_web_data_with_rest_api( + self, + url: str, + query: Optional[str] = None, + prompt: Optional[str] = None, + ) -> dict: + """ + Extracts structured data as JSON from a web page given a URL using either an AgentQL query or a Natural Language description of the data. + + Args: + url: URL of the public webpage to extract data from. + query: AgentQL query used to extract the data. The query must be enclosed with curly braces `{}`. Either this field or `prompt` field must be provided. + prompt: Natural Language description of the data to extract from the page. If AgentQL query is not specified, always use the `prompt` field. Either this field or `query` field must be provided. + + Returns: + dict: Extracted data.
+ """ + _params = { + "wait_for": self.wait_for, + "is_scroll_to_bottom_enabled": self.is_scroll_to_bottom_enabled, + "mode": self.mode, + "is_screenshot_enabled": self.is_screenshot_enabled, + } + _metadata = { + "experimental_stealth_mode_enabled": self.is_stealth_mode_enabled, + } + + return await _aload_data( + url=url, + query=query, + prompt=prompt, + params=_params, + metadata=_metadata, + api_key=self._api_key, + timeout=self.timeout, + ) diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/const.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/const.py new file mode 100644 index 0000000000..0a027d0972 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/const.py @@ -0,0 +1,16 @@ +DEFAULT_EXTRACT_ELEMENTS_TIMEOUT_SECONDS = 300 +DEFAULT_EXTRACT_DATA_TIMEOUT_SECONDS = 900 +DEFAULT_WAIT_FOR_NETWORK_IDLE = True +DEFAULT_INCLUDE_HIDDEN_DATA = True +DEFAULT_INCLUDE_HIDDEN_ELEMENTS = False +DEFAULT_RESPONSE_MODE = "fast" + +DEFAULT_WAIT_FOR_PAGE_LOAD_SECONDS = 0 +DEFAULT_IS_SCROLL_TO_BOTTOM_ENABLED = False +DEFAULT_IS_SCREENSHOT_ENABLED = False +DEFAULT_IS_STEALTH_MODE_ENABLED = False + +EXTRACT_DATA_ENDPOINT = "https://api.agentql.com/v1/query-data" +DEFAULT_API_TIMEOUT_SECONDS = 900 + +REQUEST_ORIGIN = "llamaindex" diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/messages.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/messages.py new file mode 100644 index 0000000000..ca118d5151 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/messages.py @@ -0,0 +1,8 @@ +QUERY_PROMPT_REQUIRED_ERROR_MESSAGE = ( + "Invalid arguments provided. Either 'query' or 'prompt' must be provided." +) +QUERY_PROMPT_EXCLUSIVE_ERROR_MESSAGE = ( + "Invalid arguments provided. 
Only one of 'query' or 'prompt' should be provided." +) +UNSET_API_KEY_ERROR_MESSAGE = "No AgentQL API key provided. Please set your API key via the `AGENTQL_API_KEY` environment variable. You can create an API key at https://dev.agentql.com." +UNAUTHORIZED_ERROR_MESSAGE = "Invalid AgentQL API key provided. Please provide a valid API Key. You can create one at https://dev.agentql.com." diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/utils.py b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/utils.py new file mode 100644 index 0000000000..f01489bd0c --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/llama_index/tools/agentql/utils.py @@ -0,0 +1,93 @@ +from typing import Optional +import agentql +import httpx + +from llama_index.tools.agentql.const import EXTRACT_DATA_ENDPOINT, REQUEST_ORIGIN +from llama_index.tools.agentql.messages import ( + QUERY_PROMPT_REQUIRED_ERROR_MESSAGE, + QUERY_PROMPT_EXCLUSIVE_ERROR_MESSAGE, + UNAUTHORIZED_ERROR_MESSAGE, +) + +try: + from playwright.async_api import Browser as AsyncBrowser + from playwright.async_api import Page as AsyncPage +except ImportError as e: + raise ImportError( + "Unable to import playwright. Please make sure playwright module is properly installed." + ) from e + + +async def _aget_current_agentql_page(browser: AsyncBrowser) -> AsyncPage: + """ + Get the current page of the async browser. + + Args: + browser: The browser to get the current page from. + + Returns: + Page: The current page.
+ """ + context = browser.contexts[0] if browser.contexts else await browser.new_context() + page = context.pages[-1] if context.pages else await context.new_page() + return await agentql.wrap_async(page) + + +def _handle_http_error(e: httpx.HTTPStatusError) -> None: + response = e.response + if response.status_code == httpx.codes.UNAUTHORIZED: + raise ValueError(UNAUTHORIZED_ERROR_MESSAGE) from e + + msg = response.text + try: + error_json = response.json() + msg = ( + error_json["error_info"] if "error_info" in error_json else str(error_json) + ) + except (ValueError, TypeError): + msg = f"HTTP {e}." + raise ValueError(msg) from e + + +async def _aload_data( + url: str, + api_key: str, + metadata: dict, + params: dict, + timeout: int, + query: Optional[str] = None, + prompt: Optional[str] = None, +) -> dict: + if not query and not prompt: + raise ValueError(QUERY_PROMPT_REQUIRED_ERROR_MESSAGE) + if query and prompt: + raise ValueError(QUERY_PROMPT_EXCLUSIVE_ERROR_MESSAGE) + + payload = { + "url": url, + "query": query, + "prompt": prompt, + "params": params, + "metadata": metadata, + } + + headers = { + "X-API-Key": f"{api_key}", + "Content-Type": "application/json", + "X-TF-Request-Origin": REQUEST_ORIGIN, + } + + async with httpx.AsyncClient() as client: + try: + response = await client.post( + EXTRACT_DATA_ENDPOINT, + headers=headers, + json=payload, + timeout=timeout, + ) + response.raise_for_status() + + except httpx.HTTPStatusError as e: + _handle_http_error(e) + else: + return response.json() diff --git a/llama-index-integrations/tools/llama-index-tools-agentql/pyproject.toml b/llama-index-integrations/tools/llama-index-tools-agentql/pyproject.toml new file mode 100644 index 0000000000..222eeb131b --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-agentql/pyproject.toml @@ -0,0 +1,62 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true 
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"

[tool.llamahub]
contains_example = false
import_path = "llama_index.tools.agentql"

[tool.llamahub.class_authors]
AgentQLBrowserToolSpec = "jayfish0"
AgentQLRestAPIToolSpec = "jayfish0"

[tool.mypy]
disallow_untyped_defs = true
exclude = ["_static", "build", "examples", "notebooks", "venv"]
ignore_missing_imports = true
# Matches the lowest interpreter allowed by [tool.poetry.dependencies].python.
python_version = "3.9"

[tool.poetry]
# Poetry's [tool.poetry] table takes people as "Name <email>" strings.
authors = ["Jason Zhang <jason@tinyfish.io>"]
description = "llama-index tools agentql integration"
license = "MIT"
maintainers = ["Jason Zhang <jason@tinyfish.io>"]
name = "llama-index-tools-agentql"
packages = [{include = "llama_index/"}]
readme = "README.md"
version = "1.0.0"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
llama-index-core = "^0.10.0"
agentql = "^1.8.1"
playwright = "^1.50.0"
httpx = "^0.28.1"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
codespell = {extras = ["toml"], version = ">=v2.2.6"}
ipython = "8.10.0"
jupyter = "^1.0.0"
mypy = "0.991"
pre-commit = "3.2.0"
pylint = "2.15.10"
pytest = "7.2.1"
pytest-mock = "3.11.1"
ruff = "0.0.292"
tree-sitter-languages = "^1.8.0"
types-Deprecated = ">=0.1.0"
types-PyYAML = "^6.0.12.12"
types-protobuf = "^4.24.0.4"
types-redis = "4.5.5.0"
types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
types-setuptools = "67.1.0.0"
def get_testing_data() -> dict:
    """
    Build the values shared by the AgentQL tool tests.

    Returns:
        dict: A fresh mapping with ``TEST_DATA`` (the extraction result the
        tests compare against), ``TEST_URL`` (the page the tests point the
        tools at) and ``TEST_QUERY`` (the AgentQL query the tests submit).

    """
    product = {
        "name": "iKey FT-88-TP-USB Police Emergency Car Mount Backlit Red USB Keyboard B22",
        "price": 27.95,
        "condition": "Used",
    }
    seller = {"name": "Ativo"}
    expected_extraction = {"product": product, "seller": seller}

    page_url = (
        "https://storage.googleapis.com/tf-benchmark/ebay_product_page/page.html"
    )
    agentql_query = "{ product { name price condition } seller { name }}"

    return {
        "TEST_DATA": expected_extraction,
        "TEST_URL": page_url,
        "TEST_QUERY": agentql_query,
    }
# Methods of TestExtractDataBrowserTool; ``agentql_browser_tool`` is that
# class's autouse fixture.
@pytest.fixture()
def agent(self, agentql_browser_tool):
    """Return a function-calling agent equipped with the browser tool spec's tools."""
    tools = agentql_browser_tool.to_tool_list()
    llm = OpenAI(model="gpt-4o")
    return FunctionCallingAgent.from_tools(tools, llm=llm)


@pytest.mark.skipif(
    not ("OPENAI_API_KEY" in os.environ and "AGENTQL_API_KEY" in os.environ),
    reason="OPENAI_API_KEY or AGENTQL_API_KEY is not set",
)
def test_extract_web_data_browser_tool_call(self, agent):
    """Check the agent's first tool call: its name, its kwargs and its raw output."""
    expected = get_testing_data()
    reply = agent.chat(
        f"""
        extract data with the following agentql query: {expected["TEST_QUERY"]}
        """
    )
    first_call = reply.sources[0]
    assert first_call.tool_name == "extract_web_data_from_browser"
    assert first_call.raw_input["kwargs"] == {
        "query": expected["TEST_QUERY"],
    }
    assert first_call.raw_output == expected["TEST_DATA"]


@pytest.mark.skipif(
    "AGENTQL_API_KEY" not in os.environ,
    reason="AGENTQL_API_KEY is not set",
)
async def test_get_web_element_browser_tool_call(self, agentql_browser_tool):
    """Check the value returned for the 'buy it now' prompt."""
    located = await agentql_browser_tool.get_web_element_from_browser(
        prompt="button for buying it now",
    )
    assert located == "[tf623_id='965']"
def test_class():
    """AgentQLRestAPIToolSpec's MRO must contain a class named like BaseToolSpec."""
    mro_names = {klass.__name__ for klass in AgentQLRestAPIToolSpec.__mro__}
    assert BaseToolSpec.__name__ in mro_names


class TestExtractDataRestApiTool:
    """Agent-driven test of the REST API tool spec."""

    @pytest.fixture()
    def agent(self):
        """Return a function-calling agent equipped with the REST API tool spec's tools."""
        tools = AgentQLRestAPIToolSpec().to_tool_list()
        llm = OpenAI(model="gpt-4o")
        return FunctionCallingAgent.from_tools(tools, llm=llm)

    @pytest.mark.skipif(
        not ("OPENAI_API_KEY" in os.environ and "AGENTQL_API_KEY" in os.environ),
        reason="OPENAI_API_KEY or AGENTQL_API_KEY is not set",
    )
    def test_extract_web_data_llm_tool_call(self, agent):
        """Check the agent's first tool call: its name, its kwargs and the ``data`` it returned."""
        expected = get_testing_data()
        reply = agent.chat(
            f"""
            extract the data from {expected["TEST_URL"]} with the following agentql query: {expected["TEST_QUERY"]}
            """
        )
        first_call = reply.sources[0]
        assert first_call.tool_name == "extract_web_data_with_rest_api"
        assert first_call.raw_input["kwargs"] == {
            "url": expected["TEST_URL"],
            "query": expected["TEST_QUERY"],
        }
        assert first_call.raw_output["data"] == expected["TEST_DATA"]