Unverified commit f51dbc3b authored by Adithya Krishnan, committed by GitHub

Add: duckdb vector store (#10805)


* Add: duckdb vector store

* Update: lint & format

* Add: tests and metadata

* Add: duckdb vector store

* Update: lint & format

* Add: tests and metadata

* pants tailor

* fix version constraint

* Update: set home directory to user home

---------

Co-authored-by: Andrei Fajardo <andrei@nerdai.io>
parent db84afea
Showing with 5718 additions and 0 deletions
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
notebooks/
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Jetbrains
.idea
modules/
*.swp
# VsCode
.vscode
# pipenv
Pipfile
Pipfile.lock
# pyright
pyrightconfig.json
python_sources()
poetry_requirements(
name="poetry",
)
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
help:	## Show all Makefile targets.
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format:	## Run code autoformatters (black).
	pre-commit install
	git ls-files | xargs pre-commit run black --files

lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy.
	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test:	## Run tests via pytest.
	pytest tests

watch-docs:	## Build and watch documentation.
	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/

publish:
	poetry publish --build --username __token__ --password $$PYPI_KEY --skip-existing
# LlamaIndex Vector_Stores Integration: DuckDB
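
This package provides `DuckDBVectorStore`, a LlamaIndex vector store backed by [DuckDB](https://duckdb.org/). A minimal usage sketch, assuming the package is installed as `llama-index-vector-stores-duckdb` and that nodes carry precomputed embeddings (`embed_dim=3` below is illustrative and must match your embedding length):

```python
# Hedged sketch: store nodes with precomputed embeddings and run a
# top-k cosine-similarity query against the in-memory database.
from llama_index.core.schema import TextNode
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.vector_stores.duckdb import DuckDBVectorStore

store = DuckDBVectorStore(embed_dim=3)  # in-memory database by default

store.add([TextNode(text="lorem ipsum", embedding=[1.0, 0.0, 0.0])])

result = store.query(
    VectorStoreQuery(query_embedding=[1.0, 0.0, 0.0], similarity_top_k=1)
)
print(result.nodes[0].get_content())  # -> "lorem ipsum"
```

For on-disk use, pass `database_name` and `persist_dir`, and reload later with `DuckDBVectorStore.from_local`, which validates the table schema before returning; file and table names below are illustrative:

```python
# Hedged sketch of on-disk persistence and reloading.
from llama_index.vector_stores.duckdb import DuckDBVectorStore

# Writes to ./storage/vectors.duckdb (persist_dir is created if missing).
store = DuckDBVectorStore(database_name="vectors.duckdb", persist_dir="./storage")

# Reopen later; from_local() checks the table has the expected columns
# (node_id, text, embedding, metadata_) before returning the store.
store = DuckDBVectorStore.from_local("./storage/vectors.duckdb", table_name="documents")
```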
from llama_index.vector_stores.duckdb.base import DuckDBVectorStore
__all__ = ["DuckDBVectorStore"]
"""DuckDB vector store."""
import logging
import json
from typing import Any, List, Optional
import os
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.schema import BaseNode, MetadataMode, TextNode
from llama_index.core.vector_stores.types import (
BasePydanticVectorStore,
MetadataFilters,
VectorStoreQuery,
VectorStoreQueryResult,
)
from llama_index.core.vector_stores.utils import (
node_to_metadata_dict,
)
logger = logging.getLogger(__name__)
import_err_msg = "`duckdb` package not found, please run `pip install duckdb`"
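
# DuckDBLocalContext is a context manager for on-disk databases: it opens a
# connection with the `json` and `fts` extensions installed and loaded, and
# closes the connection on exit so the database file is not held open
# between operations.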
class DuckDBLocalContext:
    def __init__(self, database_path: str) -> None:
        self.database_path = database_path
        self._conn = None
        self._home_dir = os.path.expanduser("~")

    def __enter__(self) -> "duckdb.DuckDBPyConnection":
        try:
            import duckdb
        except ImportError:
            raise ImportError(import_err_msg)

        if not os.path.exists(os.path.dirname(self.database_path)):
            raise ValueError(
                f"Directory {os.path.dirname(self.database_path)} does not exist."
            )

        self._conn = duckdb.connect(self.database_path)
        self._conn.execute(f"SET home_directory='{self._home_dir}';")
        self._conn.install_extension("json")
        self._conn.load_extension("json")
        self._conn.install_extension("fts")
        self._conn.load_extension("fts")

        return self._conn

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Close the connection once, and only if it was actually opened.
        if self._conn:
            self._conn.close()
class DuckDBVectorStore(BasePydanticVectorStore):
    """DuckDB vector store.

    In this vector store, embeddings are stored within a DuckDB database.
    During query time, the index uses DuckDB to query for the top
    k most similar nodes.
    """

    stores_text: bool = True
    flat_metadata: bool = True

    database_name: Optional[str]
    table_name: Optional[str]
    # schema_name: Optional[str]  # TODO: support schema name
    embed_dim: Optional[int]
    # hybrid_search: Optional[bool]  # TODO: support hybrid search
    text_search_config: Optional[dict]
    persist_dir: Optional[str]

    _conn: Any = PrivateAttr()
    _is_initialized: bool = PrivateAttr(default=False)
    _database_path: Optional[str] = PrivateAttr()

    def __init__(
        self,
        database_name: Optional[str] = ":memory:",
        table_name: Optional[str] = "documents",
        # schema_name: Optional[str] = "main",
        embed_dim: Optional[int] = 1536,
        # hybrid_search: Optional[bool] = False,
        # https://duckdb.org/docs/extensions/full_text_search
        text_search_config: Optional[dict] = {
            "stemmer": "english",
            "stopwords": "english",
            "ignore": "(\\.|[^a-z])+",
            "strip_accents": True,
            "lower": True,
            "overwrite": False,
        },
        persist_dir: Optional[str] = "./storage",
        **kwargs: Any,
    ) -> None:
        """Init params."""
        try:
            import duckdb
        except ImportError:
            raise ImportError(import_err_msg)

        self._is_initialized = False

        if database_name == ":memory:":
            _home_dir = os.path.expanduser("~")
            self._conn = duckdb.connect(database_name)
            self._conn.execute(f"SET home_directory='{_home_dir}';")
            self._conn.install_extension("json")
            self._conn.load_extension("json")
            self._conn.install_extension("fts")
            self._conn.load_extension("fts")
        else:
            # check if persist dir exists
            if not os.path.exists(persist_dir):
                os.makedirs(persist_dir)

            self._database_path = os.path.join(persist_dir, database_name)

            with DuckDBLocalContext(self._database_path) as _conn:
                pass

            self._conn = None

        super().__init__(
            database_name=database_name,
            table_name=table_name,
            # schema_name=schema_name,
            embed_dim=embed_dim,
            # hybrid_search=hybrid_search,
            text_search_config=text_search_config,
            persist_dir=persist_dir,
        )
    @classmethod
    def from_local(
        cls, database_path: str, table_name: str = "documents"
    ) -> "DuckDBVectorStore":
        """Load a DuckDB vector store from a local file."""
        with DuckDBLocalContext(database_path) as _conn:
            try:
                _table_info = _conn.execute(f"SHOW {table_name};").fetchall()
            except Exception as e:
                raise ValueError(
                    f"Index table {table_name} not found in the database."
                ) from e

            _std = {
                "text": "VARCHAR",
                "node_id": "VARCHAR",
                "embedding": "FLOAT[]",
                "metadata_": "JSON",
            }
            _ti = {_i[0]: _i[1] for _i in _table_info}

            if _std != _ti:
                raise ValueError(
                    f"Index table {table_name} does not have the correct schema."
                )

        _cls = cls(
            database_name=os.path.basename(database_path),
            table_name=table_name,
            persist_dir=os.path.dirname(database_path),
        )
        _cls._is_initialized = True

        return _cls
    @classmethod
    def from_params(
        cls,
        database_name: Optional[str] = ":memory:",
        table_name: Optional[str] = "documents",
        # schema_name: Optional[str] = "main",
        embed_dim: Optional[int] = 1536,
        # hybrid_search: Optional[bool] = False,
        text_search_config: Optional[dict] = {
            "stemmer": "english",
            "stopwords": "english",
            "ignore": "(\\.|[^a-z])+",
            "strip_accents": True,
            "lower": True,
            "overwrite": False,
        },
        persist_dir: Optional[str] = "./storage",
        **kwargs: Any,
    ) -> "DuckDBVectorStore":
        return cls(
            database_name=database_name,
            table_name=table_name,
            # schema_name=schema_name,
            embed_dim=embed_dim,
            # hybrid_search=hybrid_search,
            text_search_config=text_search_config,
            persist_dir=persist_dir,
            **kwargs,
        )

    @classmethod
    def class_name(cls) -> str:
        return "DuckDBVectorStore"

    @property
    def client(self) -> Any:
        """Return client."""
        return self._conn
    def _initialize(self) -> None:
        if not self._is_initialized:
            # TODO: schema.table also.
            # Check if table and type is present
            # if not, create table
            if self.database_name == ":memory:":
                self._conn.execute(
                    f"""
                    CREATE TABLE {self.table_name} (
                        node_id VARCHAR,
                        text TEXT,
                        embedding FLOAT[{self.embed_dim}],
                        metadata_ JSON
                    );
                    """
                )
            else:
                with DuckDBLocalContext(self._database_path) as _conn:
                    _conn.execute(
                        f"""
                        CREATE TABLE {self.table_name} (
                            node_id VARCHAR,
                            text TEXT,
                            embedding FLOAT[{self.embed_dim}],
                            metadata_ JSON
                        );
                        """
                    )
            self._is_initialized = True

    def _node_to_table_row(self, node: BaseNode) -> Any:
        return (
            node.node_id,
            node.get_content(metadata_mode=MetadataMode.NONE),
            node.get_embedding(),
            node_to_metadata_dict(
                node,
                remove_text=True,
                flat_metadata=self.flat_metadata,
            ),
        )
    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        """
        self._initialize()

        ids = []

        if self.database_name == ":memory:":
            _table = self._conn.table(self.table_name)
            for node in nodes:
                ids.append(node.node_id)
                _row = self._node_to_table_row(node)
                _table.insert(_row)
        else:
            with DuckDBLocalContext(self._database_path) as _conn:
                _table = _conn.table(self.table_name)
                for node in nodes:
                    ids.append(node.node_id)
                    _row = self._node_to_table_row(node)
                    _table.insert(_row)

        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        _ddb_query = f"""
            DELETE FROM {self.table_name}
            WHERE json_extract_string(metadata_, '$.ref_doc_id') = '{ref_doc_id}';
            """
        if self.database_name == ":memory:":
            self._conn.execute(_ddb_query)
        else:
            with DuckDBLocalContext(self._database_path) as _conn:
                _conn.execute(_ddb_query)
    @staticmethod
    def _build_metadata_filter_condition(
        standard_filters: MetadataFilters,
    ) -> str:
        """Translate standard metadata filters to a DuckDB SQL condition string."""
        filters_list = []
        # condition = standard_filters.condition or "and" ## and/or as strings.
        condition = "AND"
        _filters_condition_list = []

        for filter in standard_filters.filters:
            if filter.operator:
                if filter.operator in [
                    "<",
                    ">",
                    "<=",
                    ">=",
                    "<>",
                    "!=",
                ]:
                    filters_list.append((filter.key, filter.operator, filter.value))
                elif filter.operator in ["=="]:
                    filters_list.append((filter.key, "=", filter.value))
                else:
                    raise ValueError(
                        f"Filter operator {filter.operator} not supported."
                    )
            else:
                filters_list.append((filter.key, "=", filter.value))

        for _fc in filters_list:
            if isinstance(_fc[2], str):
                _filters_condition_list.append(
                    f"json_extract_string(metadata_, '$.{_fc[0]}') {_fc[1]} '{_fc[2]}'"
                )
            else:
                _filters_condition_list.append(
                    f"json_extract(metadata_, '$.{_fc[0]}') {_fc[1]} {_fc[2]}"
                )

        return f" {condition} ".join(_filters_condition_list)
    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query.query_embedding (List[float]): query embedding
            query.similarity_top_k (int): top k most similar nodes

        """
        nodes = []
        similarities = []
        ids = []

        if query.filters is not None:
            # TODO: results from the metadata filter query
            _filter_string = self._build_metadata_filter_condition(query.filters)
            _ddb_query = f"""
                SELECT node_id, text, embedding, metadata_, score
                FROM (
                    SELECT *, list_cosine_similarity(embedding, {query.query_embedding}) AS score
                    FROM {self.table_name}
                    WHERE {_filter_string}
                ) sq
                WHERE score IS NOT NULL
                ORDER BY score DESC LIMIT {query.similarity_top_k};
                """
        else:
            _ddb_query = f"""
                SELECT node_id, text, embedding, metadata_, score
                FROM (
                    SELECT *, list_cosine_similarity(embedding, {query.query_embedding}) AS score
                    FROM {self.table_name}
                ) sq
                WHERE score IS NOT NULL
                ORDER BY score DESC LIMIT {query.similarity_top_k};
                """

        if self.database_name == ":memory:":
            _final_results = self._conn.execute(_ddb_query).fetchall()
        else:
            with DuckDBLocalContext(self._database_path) as _conn:
                _final_results = _conn.execute(_ddb_query).fetchall()

        for _row in _final_results:
            node = TextNode(
                id_=_row[0],
                text=_row[1],
                embedding=_row[2],
                metadata=json.loads(_row[3]),
            )
            nodes.append(node)
            similarities.append(_row[4])
            ids.append(_row[0])

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core"]
[tool.codespell]
check-filenames = true
check-hidden = true
# Feel free to un-skip examples, and experimental, you will just need to
# work through many typos (--write-changes and --interactive will help)
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
[tool.llamahub]
classes = ["DuckDBVectorStore"]
contains_example = true
import_path = "llama_index.vector_stores.duckdb"
[tool.mypy]
disallow_untyped_defs = true
# Remove venv skip when integrated with pre-commit
exclude = ["_static", "build", "examples", "notebooks", "venv"]
ignore_missing_imports = true
python_version = "3.8"
[tool.poetry]
authors = ["Adithya Krishnan <me@krishadi.com>"]
description = "llama-index vector_stores duckdb integration"
license = "MIT"
maintainers = ["krish-adi"]
name = "llama-index-vector-stores-duckdb"
packages = [{include = "llama_index/"}]
readme = "README.md"
version = "0.1.0"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
llama-index-core = "^0.10.0"
duckdb = "0.9.2"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
codespell = {extras = ["toml"], version = ">=v2.2.6"}
ipython = "8.10.0"
jupyter = "^1.0.0"
mypy = "0.991"
pre-commit = "3.2.0"
pylint = "2.15.10"
pytest = "7.2.1"
pytest-mock = "3.11.1"
ruff = "0.0.292"
tree-sitter-languages = "^1.8.0"
types-Deprecated = ">=0.1.0"
types-PyYAML = "^6.0.12.12"
types-protobuf = "^4.24.0.4"
types-redis = "4.5.5.0"
types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991
types-setuptools = "67.1.0.0"
python_sources()
python_tests(
name="tests0",
)
import pytest
from typing import List
import importlib.util
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo, TextNode
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.vector_stores.duckdb import DuckDBVectorStore
def test_duckdb_installed():
    assert importlib.util.find_spec("duckdb") is not None
@pytest.fixture(scope="module")
def text_node_list() -> List[TextNode]:
return [
TextNode(
text="lorem ipsum",
id_="c330d77f-90bd-4c51-9ed2-57d8d693b3b0",
relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="test-0")},
metadata={
"author": "Stephen King",
"theme": "Friendship",
},
embedding=[1.0, 0.0, 0.0],
),
TextNode(
text="lorem ipsum",
id_="c3d1e1dd-8fb4-4b8f-b7ea-7fa96038d39d",
relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="test-1")},
metadata={
"director": "Francis Ford Coppola",
"theme": "Mafia",
},
embedding=[0.0, 1.0, 0.0],
),
TextNode(
text="lorem ipsum",
id_="c3ew11cd-8fb4-4b8f-b7ea-7fa96038d39d",
relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="test-2")},
metadata={
"director": "Christopher Nolan",
},
embedding=[0.0, 0.0, 1.0],
),
TextNode(
text="I was taught that the way of progress was neither swift nor easy.",
id_="0b31ae71-b797-4e88-8495-031371a7752e",
relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="text-3")},
metadata={
"author": "Marie Curie",
},
embedding=[0.0, 0.0, 0.9],
),
TextNode(
text=(
"The important thing is not to stop questioning."
+ " Curiosity has its own reason for existing."
),
id_="bd2e080b-159a-4030-acc3-d98afd2ba49b",
relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="text-4")},
metadata={
"author": "Albert Einstein",
},
embedding=[0.0, 0.0, 0.5],
),
TextNode(
text=(
"I am no bird; and no net ensnares me;"
+ " I am a free human being with an independent will."
),
id_="f658de3b-8cef-4d1c-8bed-9a263c907251",
relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="text-5")},
metadata={
"author": "Charlotte Bronte",
},
embedding=[0.0, 0.0, 0.3],
),
]
@pytest.fixture(scope="module")
def vector_store() -> DuckDBVectorStore:
return DuckDBVectorStore()
def test_instance_creation_from_memory(
vector_store: DuckDBVectorStore,
) -> None:
assert isinstance(vector_store, DuckDBVectorStore)
assert vector_store.database_name == ":memory:"
def test_duckdb_add_and_query(
vector_store: DuckDBVectorStore, text_node_list: List[TextNode]
) -> None:
vector_store.add(text_node_list)
res = vector_store.query(
VectorStoreQuery(query_embedding=[1.0, 0.0, 0.0], similarity_top_k=1)
)
assert res.nodes
assert res.nodes[0].get_content() == "lorem ipsum"
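

# A possible follow-on test (a sketch, not part of this commit) exercising the
# metadata-filter path of DuckDBVectorStore.query(). It assumes the
# module-scoped `vector_store` fixture was already populated by
# test_duckdb_add_and_query above; ExactMatchFilter and MetadataFilters come
# from llama_index.core.
from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters


def test_duckdb_query_with_metadata_filter(
    vector_store: DuckDBVectorStore, text_node_list: List[TextNode]
) -> None:
    # Restrict the search to nodes whose metadata has theme == "Mafia".
    res = vector_store.query(
        VectorStoreQuery(
            query_embedding=[0.0, 1.0, 0.0],
            similarity_top_k=1,
            filters=MetadataFilters(
                filters=[ExactMatchFilter(key="theme", value="Mafia")]
            ),
        )
    )
    assert res.nodes
    assert res.nodes[0].metadata["theme"] == "Mafia"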
from llama_index.core.vector_stores.types import BasePydanticVectorStore
from llama_index.vector_stores.duckdb.base import DuckDBVectorStore
def test_class():
    names_of_base_classes = [b.__name__ for b in DuckDBVectorStore.__mro__]
    assert BasePydanticVectorStore.__name__ in names_of_base_classes