From 63116040a64262c21a5bbb8e491b8da3f38513ce Mon Sep 17 00:00:00 2001
From: NickhilN <92180678+NickhilN@users.noreply.github.com>
Date: Thu, 14 Mar 2024 22:46:06 +0000
Subject: [PATCH] Feature/databricks vector search (#10754)

---
 .../DatabricksVectorSearchDemo.ipynb          | 213 +++++++++
 docs/module_guides/storing/vector_stores.md   |   2 +
 .../.gitignore                                | 153 +++++++
 .../BUILD                                     |   4 +
 .../Makefile                                  |  17 +
 .../README.md                                 |   1 +
 .../vector_stores/databricks/BUILD            |   1 +
 .../vector_stores/databricks/__init__.py      |   5 +
 .../vector_stores/databricks/base.py          | 418 ++++++++++++++++++
 .../pyproject.toml                            |  62 +++
 .../tests/BUILD                               |   1 +
 .../tests/__init__.py                         |   0
 ..._vector_stores_databricks_vector_search.py |   7 +
 13 files changed, 884 insertions(+)
 create mode 100644 docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/BUILD
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/__init__.py
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/test_vector_stores_databricks_vector_search.py

diff --git a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
new file mode 100644
index 000000000..e48cfd926
--- /dev/null
+++ b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Databricks Vector Search\n",
+    "\n",
+    "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search python client."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install llama-index llama-index-vector-stores-databricks\n",
+    "%pip install databricks-vectorsearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import databricks dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from databricks.vector_search.client import (\n",
+    "    VectorSearchIndex,\n",
+    "    VectorSearchClient,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import LlamaIndex dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import (\n",
+    "    VectorStoreIndex,\n",
+    "    SimpleDirectoryReader,\n",
+    "    ServiceContext,\n",
+    "    StorageContext,\n",
+    ")\n",
+    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load example data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p 'data/paul_graham/'\n",
+    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load documents\n",
+    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
+    "print(f\"Total documents: {len(documents)}\")\n",
+    "print(f\"First document, id: {documents[0].doc_id}\")\n",
+    "print(f\"First document, hash: {documents[0].hash}\")\n",
+    "print(\n",
+    "    \"First document, text\"\n",
+    "    f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a Databricks Vector Search endpoint which will serve the index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a vector search endpoint\n",
+    "client = VectorSearchClient()\n",
+    "client.create_endpoint(\n",
+    "    name=\"llamaindex_dbx_vector_store_test_endpoint\", endpoint_type=\"STANDARD\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the Databricks Vector Search index, and build it from the documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a vector search index\n",
+    "# it must be placed inside a Unity Catalog-enabled schema\n",
+    "\n",
+    "# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index\n",
+    "databricks_index = client.create_direct_access_index(\n",
+    "    endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n",
+    "    index_name=\"my_catalog.my_schema.my_test_table\",\n",
+    "    primary_key=\"my_primary_key_name\",\n",
+    "    embedding_dimension=1536,  # match the embeddings model dimension you're going to use\n",
+    "    embedding_vector_column=\"my_embedding_vector_column_name\",  # you name this anything you want - it'll be picked up by the LlamaIndex class\n",
+    "    schema={\n",
+    "        \"my_primary_key_name\": \"string\",\n",
+    "        \"my_embedding_vector_column_name\": \"array<double>\",\n",
+    "        \"text\": \"string\",  # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text,\n",
+    "        \"doc_id\": \"string\",  # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
+    "        # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n",
+    "        # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "databricks_vector_store = DatabricksVectorSearch(\n",
+    "    index=databricks_index,\n",
+    "    text_column=\"text\",\n",
+    "    columns=None,  # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
+    ")  # text_column is required for self-managed embeddings\n",
+    "storage_context = StorageContext.from_defaults(\n",
+    "    vector_store=databricks_vector_store\n",
+    ")\n",
+    "index = VectorStoreIndex.from_documents(\n",
+    "    documents, storage_context=storage_context\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Query the index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_engine = index.as_query_engine()\n",
+    "response = query_engine.query(\"Why did the author choose to work on AI?\")\n",
+    "\n",
+    "print(response.response)"
+   ]
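+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Query with a metadata filter (a minimal sketch: it assumes you added an `author` string column to the index schema and the `columns` argument above, and set it in your nodes' metadata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.vector_stores.types import (\n",
+    "    MetadataFilter,\n",
+    "    MetadataFilters,\n",
+    ")\n",
+    "\n",
+    "filters = MetadataFilters(\n",
+    "    filters=[MetadataFilter(key=\"author\", value=\"Paul Graham\")]\n",
+    ")\n",
+    "query_engine = index.as_query_engine(filters=filters)\n",
+    "response = query_engine.query(\"What did the author do growing up?\")\n",
+    "\n",
+    "print(response.response)"
+   ]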
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 4
+   },
+   "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)",
+   "widgets": {}
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/docs/module_guides/storing/vector_stores.md b/docs/module_guides/storing/vector_stores.md
index 2d721c836..8a6a39e1a 100644
--- a/docs/module_guides/storing/vector_stores.md
+++ b/docs/module_guides/storing/vector_stores.md
@@ -23,6 +23,7 @@ We are actively adding more integrations and improving feature coverage for each
 | ChatGPT Retrieval Plugin | aggregator              |                    |               | ✓      | ✓               |       |
 | Chroma                   | self-hosted             | ✓                  |               | ✓      | ✓               |       |
 | DashVector               | cloud                   | ✓                  | ✓             | ✓      | ✓               |       |
+| Databricks               | cloud                   | ✓                  |               | ✓      | ✓               |       |
 | Deeplake                 | self-hosted / cloud     | ✓                  |               | ✓      | ✓               |       |
 | DocArray                 | aggregator              | ✓                  |               | ✓      | ✓               |       |
 | DuckDB                   | in-memory / self-hosted | ✓                  |               | ✓      | ✓               |       |
@@ -70,6 +71,7 @@ maxdepth: 1
 /examples/vector_stores/ChromaIndexDemo.ipynb
 /examples/vector_stores/DashvectorIndexDemo.ipynb
 /examples/vector_stores/DashvectorIndexDemo-Hybrid.ipynb
+/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
 /examples/vector_stores/DeepLakeIndexDemo.ipynb
 /examples/vector_stores/DocArrayHnswIndexDemo.ipynb
 /examples/vector_stores/DocArrayInMemoryIndexDemo.ipynb
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
new file mode 100644
index 000000000..990c18de2
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
new file mode 100644
index 000000000..05444d69d
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
@@ -0,0 +1,4 @@
+poetry_requirements(
+    name="poetry",
+    module_mapping={"databricks-vectorsearch": ["databricks"]}
+)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
new file mode 100644
index 000000000..b9eab05aa
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
new file mode 100644
index 000000000..837b6aaec
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
@@ -0,0 +1 @@
+# LlamaIndex Vector_Stores Integration: Databricks Vector Search
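+
+Install the package (named in `pyproject.toml` below) with:
+
+```
+pip install llama-index-vector-stores-databricks
+```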
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
new file mode 100644
index 000000000..db46e8d6c
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
new file mode 100644
index 000000000..3d63d6acf
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
@@ -0,0 +1,5 @@
+from llama_index.vector_stores.databricks.base import (
+    DatabricksVectorSearch,
+)
+
+__all__ = ["DatabricksVectorSearch"]
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
new file mode 100644
index 000000000..bd7f5ae32
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
@@ -0,0 +1,418 @@
+"""
+Databricks Vector Search index.
+
+Supports Delta Sync indexes and Direct Access indexes in Databricks Vector Search.
+"""
+
+import json
+import logging
+from typing import (
+    Any,
+    List,
+    Dict,
+    Optional,
+    cast,
+)
+from enum import Enum
+
+from databricks.vector_search.client import VectorSearchIndex
+
+from llama_index.core.bridge.pydantic import BaseModel, Field, PrivateAttr
+from llama_index.core.vector_stores.types import (
+    BasePydanticVectorStore,
+    MetadataFilters,
+    FilterCondition,
+    FilterOperator,
+    VectorStoreQuery,
+    VectorStoreQueryResult,
+    VectorStoreQueryMode,
+)
+from llama_index.core.vector_stores.utils import node_to_metadata_dict
+from llama_index.core.schema import TextNode, BaseNode
+
+
+class _DatabricksIndexType(str, Enum):
+    DIRECT_ACCESS = "DIRECT_ACCESS"
+    DELTA_SYNC = "DELTA_SYNC"
+
+
+class _DatabricksIndexDescription(BaseModel):
+    primary_key: str
+    index_type: _DatabricksIndexType
+    delta_sync_index_spec: Dict = Field(default_factory=dict)
+    direct_access_index_spec: Dict = Field(default_factory=dict)
+
+
+_logger = logging.getLogger(__name__)
+
+
+_filter_translation = {
+    FilterOperator.EQ: "",
+    FilterOperator.GT: ">",
+    FilterOperator.LT: "<",
+    FilterOperator.NE: "NOT",
+    FilterOperator.GTE: ">=",
+    FilterOperator.LTE: "<=",
+    FilterOperator.IN: "",
+    FilterOperator.NIN: "NOT",
+}
+
+
+def _transform_databricks_filter_operator(operator: FilterOperator) -> str:
+    try:
+        return _filter_translation[operator]
+    except KeyError as e:
+        raise ValueError(f"filter operator {operator} is not supported") from e
+
+
+def _to_databricks_filter(standard_filters: MetadataFilters) -> dict:
+    """Convert from standard dataclass to databricks filter dict."""
+    filters = {}
+
+    condition = standard_filters.condition or FilterCondition.AND
+
+    for metadata_filter in standard_filters.filters:
+        value = (
+            metadata_filter.value
+            if isinstance(metadata_filter.value, list)
+            else [metadata_filter.value]
+        )
+
+        transformed_operator = _transform_databricks_filter_operator(
+            metadata_filter.operator
+        )
+
+        if transformed_operator == "":
+            key = metadata_filter.key
+        else:
+            key = f"{metadata_filter.key} {transformed_operator}"
+
+        if key in filters:
+            raise ValueError(f"filter condition already exists for {key}")
+
+        filters[key] = value
+
+    if condition == FilterCondition.AND:
+        return filters
+
+    elif condition == FilterCondition.OR:
+        keys, values = zip(*filters.items())
+        return {" OR ".join(keys): values}
+
+    raise ValueError(f"condition {condition} is not supported")
+
+
+class DatabricksVectorSearch(BasePydanticVectorStore):
+    """
+    Vector store for Databricks Vector Search.
+
+    Install the ``databricks-vectorsearch`` package in a Databricks notebook:
+
+        %pip install databricks-vectorsearch
+        dbutils.library.restartPython()
+
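+    Example (an illustrative sketch; the endpoint and index names are
+    placeholders, and the index must already exist):
+
+        from databricks.vector_search.client import VectorSearchClient
+
+        client = VectorSearchClient()
+        index = client.get_index(
+            endpoint_name="my_endpoint",
+            index_name="my_catalog.my_schema.my_table",
+        )
+        vector_store = DatabricksVectorSearch(index=index, text_column="text")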
+    """
+
+    stores_text: bool = True
+    text_column: Optional[str]
+    columns: Optional[List[str]]
+
+    _index: VectorSearchIndex = PrivateAttr()
+    _primary_key: str = PrivateAttr()
+    _index_type: str = PrivateAttr()
+    _delta_sync_index_spec: dict = PrivateAttr()
+    _direct_access_index_spec: dict = PrivateAttr()
+    _doc_id_to_pk: dict = PrivateAttr()
+
+    def __init__(
+        self,
+        index: VectorSearchIndex,
+        text_column: Optional[str] = None,
+        columns: Optional[List[str]] = None,
+    ) -> None:
+        try:
+            from databricks.vector_search.client import VectorSearchIndex
+        except ImportError:
+            raise ImportError(
+                "`databricks-vectorsearch` package not found: "
+                "please run `pip install databricks-vectorsearch`"
+            )
+        if not isinstance(index, VectorSearchIndex):
+            raise TypeError(
+                f"index must be of type `VectorSearchIndex`, not {type(index)}"
+            )
+
+        self._index = index
+
+        # unpack the index spec
+        index_description = _DatabricksIndexDescription.parse_obj(
+            self._index.describe()
+        )
+
+        self._primary_key = index_description.primary_key
+        self._index_type = index_description.index_type
+        self._delta_sync_index_spec = index_description.delta_sync_index_spec
+        self._direct_access_index_spec = index_description.direct_access_index_spec
+        self._doc_id_to_pk = {}
+
+        if columns is None:
+            columns = []
+        if "doc_id" not in columns:
+            columns = columns[:19] + ["doc_id"]
+        super().__init__(
+            text_column=text_column,
+            columns=columns,
+        )
+
+        # initialize the column name for the text column in the delta table
+        if self._is_databricks_managed_embeddings():
+            index_source_column = self._embedding_source_column_name()
+
+            # check if input text column matches the source column of the index
+            if text_column is not None and text_column != index_source_column:
+                raise ValueError(
+                    f"text_column '{text_column}' does not match with the "
+                    f"source column of the index: '{index_source_column}'."
+                )
+
+            self.text_column = index_source_column
+        else:
+            if text_column is None:
+                raise ValueError("text_column is required for self-managed embeddings.")
+            self.text_column = text_column
+
+        # Fold primary key and text column into columns if they're not empty.
+        columns_to_add = set(columns or [])
+        columns_to_add.add(self._primary_key)
+        columns_to_add.add(self.text_column)
+        columns_to_add -= {"", None}
+
+        self.columns = list(columns_to_add)
+
+        # If the index schema is known, validate that all specified columns exist in it.
+        index_schema = self._index_schema()
+
+        if self._is_direct_access_index() and index_schema:
+            missing_columns = columns_to_add - set(index_schema.keys())
+
+            if missing_columns:
+                raise ValueError(
+                    f"columns missing from schema: {', '.join(missing_columns)}"
+                )
+
+    def add(
+        self,
+        nodes: List[BaseNode],
+        **add_kwargs: Any,
+    ) -> List[str]:
+        """Add nodes to index.
+
+        Args:
+            nodes: List[BaseNode]: list of nodes with embeddings
+
+        """
+        if self._is_databricks_managed_embeddings():
+            raise ValueError(
+                "Adding nodes is not supported for Databricks-managed embeddings."
+            )
+
+        # construct the entries to upsert
+        entries = []
+        ids = []
+        for node in nodes:
+            node_id = node.node_id
+            metadata = node_to_metadata_dict(node, remove_text=True, flat_metadata=True)
+
+            # copy so repeated add() calls don't mutate self.columns
+            metadata_columns = list(self.columns or [])
+
+            # explicitly record doc_id as metadata (for delete)
+            if "doc_id" not in metadata_columns:
+                metadata_columns.append("doc_id")
+
+            entry = {
+                self._primary_key: node_id,
+                self.text_column: node.get_content(),
+                self._embedding_vector_column_name(): node.get_embedding(),
+                **{
+                    col: metadata.get(col)
+                    for col in filter(
+                        lambda column: column
+                        not in (self._primary_key, self.text_column),
+                        metadata_columns,
+                    )
+                },
+            }
+            doc_id = metadata.get("doc_id")
+            self._doc_id_to_pk[doc_id] = list(
+                set(self._doc_id_to_pk.get(doc_id, []) + [node_id])  # noqa: RUF005
+            )  # associate this node_id with this doc_id
+
+            entries.append(entry)
+            ids.append(node_id)
+
+        # attempt the upsert
+        upsert_resp = self._index.upsert(
+            entries,
+        )
+
+        # return the successful IDs
+        response_status = upsert_resp.get("status")
+
+        failed_ids = (
+            set(upsert_resp["result"]["failed_primary_keys"] or [])
+            if "result" in upsert_resp
+            and "failed_primary_keys" in upsert_resp["result"]
+            else set()
+        )
+
+        if response_status not in ("PARTIAL_SUCCESS", "FAILURE") or not failed_ids:
+            return ids
+
+        elif response_status == "PARTIAL_SUCCESS":
+            _logger.warning(
+                "failed to add %d out of %d texts to the index",
+                len(failed_ids),
+                len(ids),
+            )
+
+        elif response_status == "FAILURE":
+            _logger.error("failed to add all %d texts to the index", len(ids))
+
+        return list(filter(lambda id_: id_ not in failed_ids, ids))
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        """
+        Delete nodes with ref_doc_id.
+
+        Args:
+            ref_doc_id (str): The doc_id of the document to delete.
+
+        """
+        primary_keys = self._doc_id_to_pk.get(
+            ref_doc_id, None
+        )  # get the node_ids associated with the doc_id
+        if primary_keys is not None:
+            self._index.delete(
+                primary_keys=primary_keys,
+            )
+            self._doc_id_to_pk.pop(
+                ref_doc_id
+            )  # remove this doc_id from the doc_id-to-node_id map
+
+    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
+        """Query index for top k most similar nodes."""
+        if self._is_databricks_managed_embeddings():
+            query_text = query.query_str
+            query_vector = None
+        else:
+            query_text = None
+            query_vector = cast(List[float], query.query_embedding)
+
+        if query.mode not in (
+            VectorStoreQueryMode.DEFAULT,
+            VectorStoreQueryMode.HYBRID,
+        ):
+            raise ValueError(
+                "Only DEFAULT and HYBRID modes are supported for Databricks Vector Search."
+            )
+
+        if query.filters is not None:
+            filters = _to_databricks_filter(query.filters)
+        else:
+            filters = None
+
+        search_resp = self._index.similarity_search(
+            columns=self.columns,
+            query_text=query_text,
+            query_vector=query_vector,
+            filters=filters,
+            num_results=query.similarity_top_k,
+        )
+
+        columns = [
+            col["name"] for col in search_resp.get("manifest", {}).get("columns", [])
+        ]
+        top_k_nodes = []
+        top_k_ids = []
+        top_k_scores = []
+        for result in search_resp.get("result", {}).get("data_array", []):
+            node_id = result[columns.index(self._primary_key)]
+            text_content = result[columns.index(self.text_column)]
+            metadata = {
+                col: value
+                for col, value in zip(columns[:-1], result[:-1])
+                if col not in [self._primary_key, self.text_column]
+            }
+            metadata[self._primary_key] = node_id
+            score = result[-1]
+            node = TextNode(
+                text=text_content, id_=node_id, metadata=metadata
+            )  # TODO: start_char, end_char, relationships? See https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/vector_stores/llama-index-vector-stores-pinecone/llama_index/vector_stores/pinecone/base.py
+
+            top_k_ids.append(node_id)
+            top_k_nodes.append(node)
+            top_k_scores.append(score)
+
+        return VectorStoreQueryResult(
+            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
+        )
+
+    @property
+    def client(self) -> Any:
+        """Return VectorStoreIndex."""
+        return self._index
+
+    # The remaining utilities (and snippets of the above) are taken from
+    # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/databricks_vector_search.py
+    def _index_schema(self) -> Optional[dict]:
+        """Return the index schema as a dictionary.
+        Return None if no schema found.
+        """
+        if self._is_direct_access_index():
+            schema_json = self._direct_access_index_spec.get("schema_json")
+            if schema_json is not None:
+                return json.loads(schema_json)
+        return None
+
+    def _embedding_vector_column_name(self) -> Optional[str]:
+        """Return the name of the embedding vector column.
+        None if the index is not a self-managed embedding index.
+        """
+        return self._embedding_vector_column().get("name")
+
+    def _embedding_vector_column(self) -> dict:
+        """Return the embedding vector column configs as a dictionary.
+        Empty if the index is not a self-managed embedding index.
+        """
+        index_spec = (
+            self._delta_sync_index_spec
+            if self._is_delta_sync_index()
+            else self._direct_access_index_spec
+        )
+        return next(iter(index_spec.get("embedding_vector_columns") or []), {})
+
+    def _embedding_source_column_name(self) -> Optional[str]:
+        """Return the name of the embedding source column.
+        None if the index is not a Databricks-managed embedding index.
+        """
+        return self._embedding_source_column().get("name")
+
+    def _embedding_source_column(self) -> dict:
+        """Return the embedding source column configs as a dictionary.
+        Empty if the index is not a Databricks-managed embedding index.
+        """
+        return next(
+            iter(self._delta_sync_index_spec.get("embedding_source_columns") or []),
+            {},
+        )
+
+    def _is_delta_sync_index(self) -> bool:
+        """Return True if the index is a delta-sync index."""
+        return self._index_type == _DatabricksIndexType.DELTA_SYNC
+
+    def _is_direct_access_index(self) -> bool:
+        """Return True if the index is a direct-access index."""
+        return self._index_type == _DatabricksIndexType.DIRECT_ACCESS
+
+    def _is_databricks_managed_embeddings(self) -> bool:
+        """Return True if the embeddings are managed by Databricks Vector Search."""
+        return (
+            self._is_delta_sync_index()
+            and self._embedding_source_column_name() is not None
+        )
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml
new file mode 100644
index 000000000..0b45006cd
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml
@@ -0,0 +1,62 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.llamahub]
+contains_example = false
+import_path = "llama_index.vector_stores.databricks"
+
+[tool.llamahub.class_authors]
+DatabricksVectorSearch = "NickhilN"
+
+[tool.mypy]
+disallow_untyped_defs = true
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.8"
+
+[tool.poetry]
+authors = ["Alberto Da Costa <alberto@bamelevate.com>", "Nickhil Nabar <nickhil@bamelevate.com"]
+description = "llama-index vector_stores databricks vector search integration"
+license = "MIT"
+name = "llama-index-vector-stores-databricks"
+readme = "README.md"
+version = "0.1.1"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<3.12"
+llama-index-core = "^0.10.1"
+databricks-vectorsearch = "^0.21"
+
+[tool.poetry.group.dev.dependencies]
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"
+types-setuptools = "67.1.0.0"
+
+[tool.poetry.group.dev.dependencies.black]
+extras = ["jupyter"]
+version = "<=23.9.1,>=23.7.0"
+
+[tool.poetry.group.dev.dependencies.codespell]
+extras = ["toml"]
+version = ">=v2.2.6"
+
+[[tool.poetry.packages]]
+include = "llama_index/"
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/BUILD
new file mode 100644
index 000000000..dabf212d7
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/BUILD
@@ -0,0 +1 @@
+python_tests()
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/test_vector_stores_databricks_vector_search.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/test_vector_stores_databricks_vector_search.py
new file mode 100644
index 000000000..e8555354d
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/tests/test_vector_stores_databricks_vector_search.py
@@ -0,0 +1,7 @@
+from llama_index.core.vector_stores.types import BasePydanticVectorStore
+from llama_index.vector_stores.databricks import DatabricksVectorSearch
+
+
+def test_class():
+    names_of_base_classes = [b.__name__ for b in DatabricksVectorSearch.__mro__]
+    assert BasePydanticVectorStore.__name__ in names_of_base_classes
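+
+
+def test_filter_translation():
+    """Minimal check of the pure filter-translation helper in base.py."""
+    from llama_index.core.vector_stores.types import (
+        MetadataFilter,
+        MetadataFilters,
+    )
+
+    from llama_index.vector_stores.databricks.base import _to_databricks_filter
+
+    filters = MetadataFilters(
+        filters=[MetadataFilter(key="genre", value="fiction")]
+    )
+    # EQ filters keep the bare column name and wrap the value in a list
+    assert _to_databricks_filter(filters) == {"genre": ["fiction"]}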
-- 
GitLab