diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/.gitignore b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..990c18de229088f55c6c514fd0f2d49981d1b0e7 --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/BUILD b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..c8a68223980975f6f3e17dcbe1df672045711eff --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry" +) diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/Makefile b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b9eab05aa370629a4a3de75df3ff64cd53887b68 --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/README.md b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b5be21d44c076f5f7ead926712b341e3fe8667c9 --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/README.md @@ -0,0 +1,49 @@ +## TL;DW Video Retriever + +### Overview + +**TL;DW** is a powerful video understanding API that retrieves precise moments from videos using natural language queries. By integrating **TL;DW** with **LlamaIndex**, we can efficiently index and search video content, enabling seamless knowledge retrieval from videos. + +### Setup + +- Obtain API keys from [tl;dw Playground](https://app.trytldw.ai/account?tab=api). New users are granted free indexing minutes automatically. + +- Install the required packages: + +```sh +pip install llama-index-retrievers-tldw +``` + +### Usage + +- Initialize the TldwRetriever with your API key and collection ID: + +```python +from llama_index.retrievers.tldw import TldwRetriever +from llama_index.core.query_engine import RetrieverQueryEngine + +# Initialize the retriever +retriever = TldwRetriever( + api_key="YOUR_TLDW_API_KEY", + collection_id="YOUR_COLLECTION_ID", # Replace with your actual collection ID +) + +# Create a query engine +query_engine = RetrieverQueryEngine( + retriever=retriever, +) + +# Query and summarize response +response = query_engine.query("What are the brands of smart watches reviewed?") +print( + response +) # "The brands of smartwatches reviewed in the videos are Apple and Garmin." +``` + +## Support + +If you have any questions or feedback, please feel free to reach out to us. 
+ +- [tl;dw AI](https://www.trytldw.ai/) +- [Code Examples](https://github.com/tldw-ai/example-playbooks) +- [Email](mailto:contact@trytldw.ai) diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/BUILD b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/__init__.py b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b32c6e5d1a794e3d4f962f5c187fa7fdbf77d02f --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/__init__.py @@ -0,0 +1,3 @@ +from llama_index.retrievers.tldw.base import TldwRetriever + +__all__ = ["TldwRetriever"] diff --git a/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/base.py b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/base.py new file mode 100644 index 0000000000000000000000000000000000000000..61b014331ecead4cdb7a8ba55190ec3a0ce9d36d --- /dev/null +++ b/llama-index-integrations/retrievers/llama-index-retrievers-tldw/llama_index/retrievers/tldw/base.py @@ -0,0 +1,101 @@ +import logging +from typing import Any, Dict, List, Optional + +import requests +from llama_index.core.base.base_retriever import BaseRetriever +from llama_index.core.callbacks.base import CallbackManager +from llama_index.core.schema import ( + NodeWithScore, + QueryBundle, + TextNode, +) +from pydantic import BaseModel + +logger = 
logger = logging.getLogger(__name__)

API_ENDPOINT = "https://api.trytldw.ai/v1"

# Default upper bound, in seconds, for one search request. ``requests`` waits
# forever when no timeout is passed, which would hang the whole query pipeline.
DEFAULT_TIMEOUT_SECONDS = 60.0


class Fragment(BaseModel):
    """Represents a fragment of a video scene with metadata."""

    uuid: str
    # NOTE(review): the ``_ms`` suffix suggests millisecond offsets into the
    # media -- confirm against the TL;DW API reference.
    start_ms: float
    end_ms: float
    # Used verbatim as the ``NodeWithScore.score`` of the produced node.
    similarity: float
    # Used verbatim as the text of the produced ``TextNode``.
    description: str


class Scene(BaseModel):
    """Represents a video scene containing multiple fragments."""

    media_id: str
    external_id: str
    start_ms: float
    end_ms: float
    max_similarity: float
    fragments: List[Fragment]


class SearchResult(BaseModel):
    """Encapsulates the search results from the TL;DW API."""

    scenes: List[Scene]
    metadata: Dict[str, Any]


class TldwRetriever(BaseRetriever):
    """
    A retriever that searches for relevant video moments from the TL;DW collection.

    Every fragment of every scene returned by the search endpoint becomes one
    ``NodeWithScore``: the fragment description is the node text and the
    fragment similarity is the score.

    Args:
        api_key (str): The API key for authentication.
        collection_id (str): The ID of the video collection to search within.
        callback_manager (Optional[CallbackManager]): Optional callback manager for logging and event handling.
        timeout (float): Seconds to wait for the TL;DW API before giving up.
            Defaults to ``DEFAULT_TIMEOUT_SECONDS``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within ``timeout`` seconds.
        pydantic.ValidationError: If the response body does not match
            ``SearchResult``.

    """

    def __init__(
        self,
        api_key: str,
        collection_id: str,
        callback_manager: Optional[CallbackManager] = None,
        timeout: float = DEFAULT_TIMEOUT_SECONDS,
    ) -> None:
        self._api_key = api_key
        self._collection_id = collection_id
        self._timeout = timeout
        super().__init__(
            callback_manager=callback_manager,
        )

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run the query against the TL;DW search endpoint and wrap the hits as nodes."""
        response = requests.post(
            f"{API_ENDPOINT}/search",
            headers={
                "Authorization": f"Bearer {self._api_key}",
            },
            json={
                "collection_id": self._collection_id,
                "search_term": query_bundle.query_str,
            },
            timeout=self._timeout,
        )
        # Surface authentication / quota / server errors as HTTP errors instead
        # of letting an error payload fail later inside model validation.
        response.raise_for_status()
        search_results = SearchResult.model_validate(response.json())
        logger.debug("TL;DW search returned %d scene(s)", len(search_results.scenes))

        # Return individual fragments as nodes
        return [
            NodeWithScore(
                node=TextNode(
                    text=fragment.description,
                    metadata={
                        "scene_index": idx,
                        "media_id": scene.media_id,
                        "start_ms": fragment.start_ms,
                        "end_ms": fragment.end_ms,
                        "scene_start_ms": scene.start_ms,
                        "scene_end_ms": scene.end_ms,
                    },
                ),
                score=fragment.similarity,
            )
            for idx, scene in enumerate(search_results.scenes)
            for fragment in scene.fragments
        ]
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.retrievers.tldw.base import TldwRetriever


def test_class():
    """Check that BaseRetriever's class name appears among TldwRetriever's MRO names."""
    mro_names = {klass.__name__ for klass in TldwRetriever.__mro__}
    assert BaseRetriever.__name__ in mro_names