diff --git a/docs/examples/vector_stores/BaiduVectorDBIndexDemo.ipynb b/docs/examples/vector_stores/BaiduVectorDBIndexDemo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a2879e911c71bd8f54d85b2932d1fa18107ab4ee --- /dev/null +++ b/docs/examples/vector_stores/BaiduVectorDBIndexDemo.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0b692c73", + "metadata": {}, + "source": [ + "# Baidu VectorDB" + ] + }, + { + "cell_type": "markdown", + "id": "1e7787c2", + "metadata": {}, + "source": [ + ">[Baidu VectorDB](https://cloud.baidu.com/product/vdb.html) is a robust, enterprise-level distributed database service, meticulously developed and fully managed by Baidu Intelligent Cloud. It stands out for its exceptional ability to store, retrieve, and analyze multi-dimensional vector data. At its core, VectorDB operates on Baidu's proprietary \\\"Mochow\\\" vector database kernel, which ensures high performance, availability, and security, alongside remarkable scalability and user-friendliness.\n", + "\n", + ">This database service supports a diverse range of index types and similarity calculation methods, catering to various use cases. A standout feature of VectorDB is its capacity to manage an immense vector scale of up to 10 billion, while maintaining impressive query performance, supporting millions of queries per second (QPS) with millisecond-level query latency.\n", + "\n", + "**This notebook shows the basic usage of BaiduVectorDB as a Vector Store in LlamaIndex.**\n", + "\n", + "To run, you should have a [Database instance.](https://cloud.baidu.com/doc/VDB/s/hlrsoazuf)" + ] + }, + { + "cell_type": "markdown", + "id": "daff81fe", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f5e26095", + "metadata": {}, + "source": [ + "If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf987167", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-vector-stores-baiduvectordb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "156e71f4", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9f8dbcb", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install pymochow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47264e32", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import (\n", + " VectorStoreIndex,\n", + " SimpleDirectoryReader,\n", + " StorageContext,\n", + ")\n", + "from llama_index.vector_stores.baiduvectordb import (\n", + " BaiduVectorDB,\n", + " TableParams,\n", + " TableField,\n", + ")\n", + "import pymochow" + ] + }, + { + "cell_type": "markdown", + "id": "f9b97a89", + "metadata": {}, + "source": [ + "### Please provide OpenAI access key\n", + "\n", + "In order use embeddings by OpenAI you need to supply an OpenAI API Key:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c9f4d21-145a-401e-95ff-ccb259e8ef84", + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "\n", + "OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n", + "openai.api_key = OPENAI_API_KEY" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cb63ec81", + "metadata": {}, + "source": [ + "## Download Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b869a554", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p 'data/paul_graham/'\n", + "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'" + ] + }, + { + "cell_type": "markdown", + "id": "59ff935d", + "metadata": {}, + "source": [ + "## 
Creating and populating the Vector Store\n", + "\n", + "You will now load some essays by Paul Graham from a local file and store them into the Baidu VectorDB." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68cbd239-880e-41a3-98d8-dbb3fab55431", + "metadata": {}, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader(\"./data/paul_graham\").load_data()\n", + "print(f\"Total documents: {len(documents)}\")\n", + "print(f\"First document, id: {documents[0].doc_id}\")\n", + "print(f\"First document, hash: {documents[0].hash}\")\n", + "print(\n", + " f\"First document, text ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "dd270925", + "metadata": {}, + "source": [ + "### Initialize the Baidu VectorDB\n", + "\n", + "Creation of the vector store entails creation of the underlying database collection if it does not exist yet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afc5c44f", + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = BaiduVectorDB(\n", + " endpoint=\"http://192.168.X.X\",\n", + " api_key=\"*******\",\n", + " table_params=TableParams(dimension=1536, drop_exists=True),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cbabd1a7", + "metadata": {}, + "source": [ + "Now wrap this store into an `index` LlamaIndex abstraction for later querying:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca205b88", + "metadata": {}, + "outputs": [], + "source": [ + "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", + "\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cb11e2e2", + "metadata": {}, + "source": [ + "Note that the above `from_documents` call does several things at once: it splits the input documents 
into chunks of manageable size (\"nodes\"), computes embedding vectors for each node, and stores them all in the Baidu VectorDB." + ] + }, + { + "cell_type": "markdown", + "id": "04304299-fc3e-40a0-8600-f50c3292767e", + "metadata": {}, + "source": [ + "## Querying the store" + ] + }, + { + "cell_type": "markdown", + "id": "b241797e", + "metadata": {}, + "source": [ + "### Basic querying" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35369eda", + "metadata": {}, + "outputs": [], + "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Why did the author choose to work on AI?\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "id": "48761020", + "metadata": {}, + "source": [ + "### MMR-based queries\n", + "\n", + "The MMR (maximal marginal relevance) method is designed to fetch text chunks from the store that are at the same time relevant to the query but as different as possible from each other, with the goal of providing a broader context to the building of the final answer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb2054c0", + "metadata": {}, + "outputs": [], + "source": [ + "query_engine = index.as_query_engine(vector_store_query_mode=\"mmr\")\n", + "response = query_engine.query(\"Why did the author choose to work on AI?\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "id": "4d7bc976", + "metadata": {}, + "source": [ + "## Connecting to an existing store\n", + "\n", + "Since this store is backed by Baidu VectorDB, it is persistent by definition. 
So, if you want to connect to a store that was created and populated previously, here is how:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0aae26e", + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = BaiduVectorDB(\n", + " endpoint=\"http://192.168.X.X\",\n", + " api_key=\"*******\",\n", + " table_params=TableParams(dimension=1536, drop_exists=False),\n", + ")\n", + "\n", + "# Create index (from preexisting stored vectors)\n", + "new_index_instance = VectorStoreIndex.from_vector_store(\n", + " vector_store=new_vector_store\n", + ")\n", + "\n", + "# now you can do querying, etc:\n", + "query_engine = index.as_query_engine(similarity_top_k=5)\n", + "response = query_engine.query(\n", + " \"What did the author study prior to working on AI?\"\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "id": "9fa59402", + "metadata": {}, + "source": [ + "## Metadata filtering\n", + "\n", + "The Baidu VectorDB vector store support metadata filtering in the form of exact-match `key=value` pairs at query time. The following cells, which work on a brand new collection, demonstrate this feature.\n", + "\n", + "In this demo, for the sake of brevity, a single source document is loaded (the `../data/paul_graham/paul_graham_essay.txt` text file). Nevertheless, you will attach some custom metadata to the document to illustrate how you can can restrict queries with conditions on the metadata attached to the documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23c6ff1a", + "metadata": {}, + "outputs": [], + "source": [ + "filter_fields = [\n", + " TableField(name=\"source_type\"),\n", + "]\n", + "\n", + "md_storage_context = StorageContext.from_defaults(\n", + " vector_store=BaiduVectorDB(\n", + " endpoint=\"http://192.168.X.X\",\n", + " api_key=\"=\"*******\",\",\n", + " table_params=TableParams(\n", + " dimension=1536, drop_exists=True, filter_fields=filter_fields\n", + " ),\n", + " )\n", + ")\n", + "\n", + "\n", + "def my_file_metadata(file_name: str):\n", + " \"\"\"Depending on the input file name, associate a different metadata.\"\"\"\n", + " if \"essay\" in file_name:\n", + " source_type = \"essay\"\n", + " elif \"dinosaur\" in file_name:\n", + " # this (unfortunately) will not happen in this demo\n", + " source_type = \"dinos\"\n", + " else:\n", + " source_type = \"other\"\n", + " return {\"source_type\": source_type}\n", + "\n", + "\n", + "# Load documents and build index\n", + "md_documents = SimpleDirectoryReader(\n", + " \"../data/paul_graham\", file_metadata=my_file_metadata\n", + ").load_data()\n", + "md_index = VectorStoreIndex.from_documents(\n", + " md_documents, storage_context=md_storage_context\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4bfd6f6", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.vector_stores import MetadataFilter, MetadataFilters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "733467f3", + "metadata": {}, + "outputs": [], + "source": [ + "md_query_engine = md_index.as_query_engine(\n", + " filters=MetadataFilters(\n", + " filters=[MetadataFilter(key=\"source_type\", value=\"essay\")]\n", + " )\n", + ")\n", + "md_response = md_query_engine.query(\n", + " \"How long it took the author to write his thesis?\"\n", + ")\n", + "print(md_response.response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/module_guides/storing/vector_stores.md b/docs/module_guides/storing/vector_stores.md index 70e8f8e24b9692e20f9bd397a7d39ddfb9aa51d4..2d721c8361ad79191bf0f01c8269062a908feb2b 100644 --- a/docs/module_guides/storing/vector_stores.md +++ b/docs/module_guides/storing/vector_stores.md @@ -19,6 +19,7 @@ We are actively adding more integrations and improving feature coverage for each | Astra DB | cloud | ✓ | | ✓ | ✓ | | | Azure Cognitive Search | cloud | | ✓ | ✓ | ✓ | | | Azure CosmosDB MongoDB | cloud | | | ✓ | ✓ | | +| BaiduVectorDB | cloud | ✓ | ✓ | | ✓ | | | ChatGPT Retrieval Plugin | aggregator | | | ✓ | ✓ | | | Chroma | self-hosted | ✓ | | ✓ | ✓ | | | DashVector | cloud | ✓ | ✓ | ✓ | ✓ | | @@ -64,6 +65,7 @@ maxdepth: 1 /examples/vector_stores/AsyncIndexCreationDemo.ipynb /examples/vector_stores/AzureAISearchIndexDemo.ipynb /examples/vector_stores/AzureCosmosDBMongoDBvCoreDemo.ipynb +/examples/vector_stores/BaiduVectorDBIndexDemo.ipynb /examples/vector_stores/CassandraIndexDemo.ipynb /examples/vector_stores/ChromaIndexDemo.ipynb /examples/vector_stores/DashvectorIndexDemo.ipynb diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..0896ca890d8bffd60a44fa824f8d57fecd73ee53 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git 
a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6aff20724a253984eba0edf2002f8d1a11071f98 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/Makefile @@ -0,0 +1,20 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ + +publish: + poetry publish --build --username __token__ --password $$PYPI_KEY --build --skip-existing diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b2597b5be15ac6952fb4966ed3744afab7b2da50 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/README.md @@ -0,0 +1 @@ +# LlamaIndex Vector_Stores Integration: Baiduvectordb diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/llama_index/vector_stores/baiduvectordb/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/llama_index/vector_stores/baiduvectordb/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/llama_index/vector_stores/baiduvectordb/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/llama_index/vector_stores/baiduvectordb/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/llama_index/vector_stores/baiduvectordb/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..282ce7d42fedad86f91e851876ce4bff31a06b1f --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/llama_index/vector_stores/baiduvectordb/__init__.py @@ -0,0 +1,7 @@ +from llama_index.vector_stores.baiduvectordb.base import ( + BaiduVectorDB, + TableParams, + TableField, +) + +__all__ = ["BaiduVectorDB", "TableParams", "TableField"] diff --git 
"""A store that is built with Baidu VectorDB."""

import json
import time
from typing import Any, Dict, List, Optional

from llama_index.core.schema import (
    BaseNode,
    NodeRelationship,
    RelatedNodeInfo,
    TextNode,
)
from llama_index.core.vector_stores.types import (
    MetadataFilters,
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index.core.vector_stores.utils import DEFAULT_DOC_ID_KEY, DEFAULT_TEXT_KEY

DEFAULT_ACCOUNT = "root"
DEFAULT_DATABASE_NAME = "llama_default_database"
DEFAULT_TABLE_NAME = "llama_default_table"
DEFAULT_TIMEOUT_IN_MILLS: int = 30 * 1000

DEFAULT_PARTITION = 1
DEFAULT_REPLICA = 3
DEFAULT_INDEX_TYPE = "HNSW"
DEFAULT_METRIC_TYPE = "L2"

DEFAULT_HNSW_M = 16
DEFAULT_HNSW_EF_CONSTRUCTION = 200
DEFAULT_HNSW_EF = 10

FIELD_ID: str = "id"
FIELD_VECTOR: str = "vector"
FIELD_METADATA: str = "metadata"
INDEX_SUFFIX: str = "_index"
INDEX_VECTOR: str = "vector_index"

VALUE_NONE_ERROR = "Parameter `{}` can not be None."
VALUE_RANGE_ERROR = "The value of parameter `{}` must be within {}."
NOT_SUPPORT_INDEX_TYPE_ERROR = (
    "Unsupported index type: `{}`, supported index types are {}"
)
NOT_SUPPORT_METRIC_TYPE_ERROR = (
    "Unsupported metric type: `{}`, supported metric types are {}"
)


def _try_import() -> None:
    """Raise a helpful ImportError when the optional `pymochow` SDK is missing."""
    try:
        import pymochow  # noqa: F401
    except ImportError:
        raise ImportError(
            "`pymochow` package not found, please run `pip install pymochow`"
        )


class TableField:
    """A scalar (non-vector) field of a Baidu VectorDB table.

    Used to declare metadata fields that must be filterable at query time.

    Args:
        name: Field name; must match the metadata key set on the nodes.
        data_type: Baidu VectorDB field type name; defaults to "STRING".
    """

    name: str
    data_type: str = "STRING"

    def __init__(self, name: str, data_type: str = "STRING"):
        self.name = name
        # Treat an explicit None the same as the default.
        self.data_type = "STRING" if data_type is None else data_type


class TableParams:
    """Baidu VectorDB table params.

    See the following documentation for details:
    https://cloud.baidu.com/doc/VDB/s/mlrsob0p6

    Args:
        dimension int: The dimension of vector.
        replication int: The number of replicas in the table.
        partition int: The number of partitions in the table.
        index_type (Optional[str]): HNSW, FLAT... Default value is "HNSW"
        metric_type (Optional[str]): L2, COSINE, IP. Default value is "L2"
        drop_exists (Optional[bool]): Delete the existing Table. Default value is False.
        vector_params (Optional[Dict]):
            if HNSW set parameters: `M` and `efConstruction`, for example
            `{'M': 16, 'efConstruction': 200}`. default is HNSW
        filter_fields: Optional[List[TableField]]: Set the fields for filtering. The
            fields used for filtering must have a value in every row of the table
            and cannot be null.
            for example: ['author', 'age']
            This can be used when calling the query method:
            store.add([
                TextNode(..., metadata={'age'=23, 'name'='name1'})
            ])
            ...
            query = VectorStoreQuery(...)
            store.query(query, filter="age > 20 and age < 40 and name = 'name1'")
    """

    def __init__(
        self,
        dimension: int,
        table_name: str = DEFAULT_TABLE_NAME,
        replication: int = DEFAULT_REPLICA,
        partition: int = DEFAULT_PARTITION,
        index_type: str = DEFAULT_INDEX_TYPE,
        metric_type: str = DEFAULT_METRIC_TYPE,
        drop_exists: Optional[bool] = False,
        vector_params: Optional[Dict] = None,
        filter_fields: Optional[List[TableField]] = None,
    ):
        # Avoid the mutable-default pitfall: build a fresh list per instance.
        self.dimension = dimension
        self.table_name = table_name
        self.replication = replication
        self.partition = partition
        self.index_type = index_type
        self.metric_type = metric_type
        self.drop_exists = drop_exists
        self.vector_params = vector_params
        self.filter_fields = [] if filter_fields is None else filter_fields


class BaiduVectorDB(VectorStore):
    """Baidu VectorDB as a vector store.

    In order to use this you need to have a database instance.
    See the following documentation for details:
    https://cloud.baidu.com/doc/VDB/index.html

    Args:
        endpoint (Optional[str]): endpoint of Baidu VectorDB
        account (Optional[str]): The account for Baidu VectorDB. Default value is "root"
        api_key (Optional[str]): The Api-Key for Baidu VectorDB
        database_name(Optional[str]): The database name for Baidu VectorDB
        table_params (Optional[TableParams]): The table parameters for BaiduVectorDB
    """

    user_defined_fields: List[TableField] = []

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        account: str = DEFAULT_ACCOUNT,
        database_name: str = DEFAULT_DATABASE_NAME,
        table_params: Optional[TableParams] = None,
        batch_size: int = 1000,
        **kwargs: Any,
    ):
        """Init params."""
        # BUG FIX: the default used to be `TableParams(dimension=1536)` in the
        # signature, a single instance shared by every call (mutable default);
        # build a fresh one per constructor call instead. Behavior for callers
        # passing an explicit TableParams is unchanged.
        if table_params is None:
            table_params = TableParams(dimension=1536)
        self._init_client(endpoint, account, api_key)
        self._create_database_if_not_exists(database_name)
        self._create_table(table_params)
        self.batch_size = batch_size
        self.user_defined_fields = table_params.filter_fields

    @classmethod
    def class_name(cls) -> str:
        return "BaiduVectorDB"

    @classmethod
    def from_params(
        cls,
        endpoint: str,
        api_key: str,
        account: str = DEFAULT_ACCOUNT,
        database_name: str = DEFAULT_DATABASE_NAME,
        table_params: Optional[TableParams] = None,
        batch_size: int = 1000,
        **kwargs: Any,
    ) -> "BaiduVectorDB":
        """Alternate constructor that verifies `pymochow` is installed first."""
        _try_import()
        return cls(
            endpoint=endpoint,
            account=account,
            api_key=api_key,
            database_name=database_name,
            table_params=table_params,
            batch_size=batch_size,
            **kwargs,
        )

    def _init_client(self, endpoint: str, account: str, api_key: str) -> None:
        """Create the pymochow client and store it on `self.vdb_client`."""
        import pymochow
        from pymochow.configuration import Configuration
        from pymochow.auth.bce_credentials import BceCredentials

        config = Configuration(
            credentials=BceCredentials(account, api_key),
            endpoint=endpoint,
            connection_timeout_in_mills=DEFAULT_TIMEOUT_IN_MILLS,
        )
        self.vdb_client = pymochow.MochowClient(config)

    def _create_database_if_not_exists(self, database_name: str) -> None:
        """Open `database_name`, creating it on the server when absent."""
        db_list = self.vdb_client.list_databases()

        if database_name in [db.database_name for db in db_list]:
            self.database = self.vdb_client.database(database_name)
        else:
            self.database = self.vdb_client.create_database(database_name)

    def _create_table(self, table_params: TableParams) -> None:
        """Open the table, (re)creating it according to `table_params`."""
        import pymochow

        if table_params is None:
            raise ValueError(VALUE_NONE_ERROR.format("table_params"))

        try:
            self.table = self.database.describe_table(table_params.table_name)
            if table_params.drop_exists:
                self.database.drop_table(table_params.table_name)
                # wait db release resource
                time.sleep(5)
                self._create_table_in_db(table_params)
        except pymochow.exception.ServerError:
            # describe_table raises when the table does not exist yet.
            self._create_table_in_db(table_params)

    def _create_table_in_db(
        self,
        table_params: TableParams,
    ) -> None:
        """Create the backing table: fixed fields + user filter fields + indexes."""
        from pymochow.model.enum import FieldType
        from pymochow.model.schema import Field, Schema, SecondaryIndex, VectorIndex
        from pymochow.model.table import Partition

        index_type = self._get_index_type(table_params.index_type)
        metric_type = self._get_metric_type(table_params.metric_type)
        vector_params = self._get_index_params(index_type, table_params)

        fields = [
            Field(
                FIELD_ID,
                FieldType.STRING,
                primary_key=True,
                partition_key=True,
                auto_increment=False,
                not_null=True,
            ),
            Field(DEFAULT_DOC_ID_KEY, FieldType.STRING),
            Field(FIELD_METADATA, FieldType.STRING),
            Field(DEFAULT_TEXT_KEY, FieldType.STRING),
            Field(
                FIELD_VECTOR, FieldType.FLOAT_VECTOR, dimension=table_params.dimension
            ),
        ]
        for field in table_params.filter_fields:
            fields.append(Field(field.name, FieldType(field.data_type), not_null=True))

        indexes = [
            VectorIndex(
                index_name=INDEX_VECTOR,
                index_type=index_type,
                field=FIELD_VECTOR,
                metric_type=metric_type,
                params=vector_params,
            )
        ]
        for field in table_params.filter_fields:
            index_name = field.name + INDEX_SUFFIX
            indexes.append(SecondaryIndex(index_name=index_name, field=field.name))

        # BUG FIX: the original built `schema` and then discarded it,
        # constructing a second identical Schema inline; build it once.
        schema = Schema(fields=fields, indexes=indexes)
        self.table = self.database.create_table(
            table_name=table_params.table_name,
            replication=table_params.replication,
            partition=Partition(partition_num=table_params.partition),
            schema=schema,
            enable_dynamic_field=True,
        )
        # need wait 10s to wait proxy sync meta
        time.sleep(10)

    @staticmethod
    def _get_index_params(index_type: Any, table_params: TableParams) -> Any:
        """Return index-specific params (HNSWParams for HNSW, else None).

        BUG FIX: return annotation was `-> None` although HNSWParams is returned.
        """
        from pymochow.model.enum import IndexType
        from pymochow.model.schema import HNSWParams

        vector_params = (
            {} if table_params.vector_params is None else table_params.vector_params
        )

        if index_type == IndexType.HNSW:
            return HNSWParams(
                m=vector_params.get("M", DEFAULT_HNSW_M),
                efconstruction=vector_params.get(
                    "efConstruction", DEFAULT_HNSW_EF_CONSTRUCTION
                ),
            )
        return None

    @staticmethod
    def _get_index_type(index_type_value: str) -> Any:
        """Map a string to pymochow IndexType; raise ValueError if unsupported."""
        from pymochow.model.enum import IndexType

        index_type_value = index_type_value or IndexType.HNSW
        try:
            return IndexType(index_type_value)
        except ValueError:
            support_index_types = [d.value for d in IndexType.__members__.values()]
            raise ValueError(
                NOT_SUPPORT_INDEX_TYPE_ERROR.format(
                    index_type_value, support_index_types
                )
            )

    @staticmethod
    def _get_metric_type(metric_type_value: str) -> Any:
        """Map a string to pymochow MetricType; raise ValueError if unsupported."""
        from pymochow.model.enum import MetricType

        metric_type_value = metric_type_value or MetricType.L2
        try:
            return MetricType(metric_type_value.upper())
        except ValueError:
            support_metric_types = [d.value for d in MetricType.__members__.values()]
            raise ValueError(
                NOT_SUPPORT_METRIC_TYPE_ERROR.format(
                    metric_type_value, support_metric_types
                )
            )

    @property
    def client(self) -> Any:
        """Get client.

        BUG FIX: previously returned `self.tencent_client`, an attribute that is
        never assigned anywhere in this class (copy/paste from the Tencent
        VectorDB integration); the client is stored as `self.vdb_client`.
        """
        return self.vdb_client

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            The node_ids of the inserted nodes, in input order.
        """
        from pymochow.model.table import Row
        from pymochow.model.enum import IndexState

        ids = []
        rows = []
        for node in nodes:
            row = Row(id=node.node_id, vector=node.get_embedding())
            if node.ref_doc_id is not None:
                row._data[DEFAULT_DOC_ID_KEY] = node.ref_doc_id
            if node.metadata is not None:
                # Full metadata is serialized to JSON; declared filter fields
                # are additionally stored as first-class columns.
                row._data[FIELD_METADATA] = json.dumps(node.metadata)
                for field in self.user_defined_fields:
                    v = node.metadata.get(field.name)
                    if v is not None:
                        row._data[field.name] = v
            if isinstance(node, TextNode) and node.text is not None:
                row._data[DEFAULT_TEXT_KEY] = node.text

            rows.append(row)
            ids.append(node.node_id)

            if len(rows) >= self.batch_size:
                # BUG FIX: this flush used `self.collection.upsert`, but the
                # handle is stored as `self.table` — it crashed as soon as a
                # batch filled up.
                self.table.upsert(rows=rows)
                rows = []

        if len(rows) > 0:
            self.table.upsert(rows=rows)

        # Rebuild the vector index and poll until it is usable again.
        self.table.rebuild_index(INDEX_VECTOR)
        while True:
            time.sleep(2)
            index = self.table.describe_index(INDEX_VECTOR)
            if index.state == IndexState.NORMAL:
                break

        return ids

    # Baidu VectorDB Not support delete with filter right now, will support it later.
    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        Raises:
            NotImplementedError: always; the backend has no delete-by-filter yet.
        """
        raise NotImplementedError("Not support.")

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): contains
                query_embedding (List[float]): query embedding
                similarity_top_k (int): top k most similar nodes
                filters (Optional[MetadataFilters]): filter result
        """
        from pymochow.model.table import AnnSearch, HNSWSearchParams

        search_filter = None
        if query.filters is not None:
            search_filter = self._build_filter_condition(query.filters, **kwargs)
        anns = AnnSearch(
            vector_field=FIELD_VECTOR,
            vector_floats=query.query_embedding,
            params=HNSWSearchParams(ef=DEFAULT_HNSW_EF, limit=query.similarity_top_k),
            filter=search_filter,
        )
        res = self.table.search(anns=anns, retrieve_vector=True)
        rows = res.rows
        if rows is None or len(rows) == 0:
            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])

        nodes = []
        similarities = []
        ids = []
        for row in rows:
            similarities.append(row.get("distance"))
            row_data = row.get("row", {})
            ids.append(row_data.get(FIELD_ID))

            meta_str = row_data.get(FIELD_METADATA)
            meta = {} if meta_str is None else json.loads(meta_str)
            doc_id = row_data.get(DEFAULT_DOC_ID_KEY)

            node = TextNode(
                id_=row_data.get(FIELD_ID),
                text=row_data.get(DEFAULT_TEXT_KEY),
                embedding=row_data.get(FIELD_VECTOR),
                metadata=meta,
            )
            if doc_id is not None:
                node.relationships = {
                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=doc_id)
                }

            nodes.append(node)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

    @staticmethod
    def _build_filter_condition(
        standard_filters: MetadataFilters, **kwargs: Any
    ) -> str:
        """Translate MetadataFilters into a Baidu VectorDB filter expression.

        BUG FIX: `query()` invokes this as `_build_filter_condition(filters,
        **kwargs)`, but the signature accepted only one argument — any extra
        kwargs raised TypeError. Extra kwargs are now accepted and ignored.
        """
        filters_list = []

        for filter in standard_filters.filters:
            if filter.operator:
                if filter.operator in ["<", ">", "<=", ">=", "!="]:
                    condition = f"{filter.key}{filter.operator}{filter.value}"
                elif filter.operator in ["=="]:
                    # String values need quoting in the filter DSL.
                    if isinstance(filter.value, str):
                        condition = f"{filter.key}='{filter.value}'"
                    else:
                        condition = f"{filter.key}=={filter.value}"
                else:
                    raise ValueError(
                        f"Filter operator {filter.operator} not supported."
                    )
            else:
                condition = f"{filter.key}={filter.value}"

            filters_list.append(condition)

        return standard_filters.condition.join(filters_list)
"7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..dabf212d7e7162849c24a733909ac4f645d75a31 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/test_vector_stores_baiduvectordb.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/test_vector_stores_baiduvectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..8371b553445b98ac7da772a70702c14c38ff9589 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-baiduvectordb/tests/test_vector_stores_baiduvectordb.py @@ -0,0 +1,7 @@ +from llama_index.core.vector_stores.types import VectorStore +from llama_index.vector_stores.baiduvectordb import BaiduVectorDB + + +def test_class(): + names_of_base_classes = [b.__name__ for b in BaiduVectorDB.__mro__] + assert VectorStore.__name__ in names_of_base_classes