From b2ab9a643c274a8488fd5d6a49e7632a23e7a483 Mon Sep 17 00:00:00 2001
From: nbyrneKX <114394452+nbyrneKX@users.noreply.github.com>
Date: Thu, 21 Mar 2024 21:07:06 +0000
Subject: [PATCH] KDB.AI (#11967)

---
 .../KDBAI_Advanced_RAG_Demo.ipynb             | 627 ++++++++++++++++++
 .../llama-index-vector-stores-kdbai/BUILD     |   3 +
 .../llama-index-vector-stores-kdbai/Makefile  |  17 +
 .../llama-index-vector-stores-kdbai/README.md |   1 +
 .../llama_index/vector_stores/kdbai/BUILD     |   1 +
 .../vector_stores/kdbai/__init__.py           |   4 +
 .../llama_index/vector_stores/kdbai/base.py   | 214 ++++++
 .../llama_index/vector_stores/kdbai/utils.py  |  22 +
 .../pyproject.toml                            |  67 ++
 9 files changed, 956 insertions(+)
 create mode 100644 docs/examples/vector_stores/KDBAI_Advanced_RAG_Demo.ipynb
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/BUILD
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/Makefile
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/README.md
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/BUILD
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/__init__.py
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py
 create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml

diff --git a/docs/examples/vector_stores/KDBAI_Advanced_RAG_Demo.ipynb b/docs/examples/vector_stores/KDBAI_Advanced_RAG_Demo.ipynb
new file mode 100644
index 0000000000..f7757314f3
--- /dev/null
+++ b/docs/examples/vector_stores/KDBAI_Advanced_RAG_Demo.ipynb
@@ -0,0 +1,627 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cb212650-86b7-4298-8bbd-c20a5227fbf0",
+   "metadata": {},
+   "source": [
+    "# Advanced RAG with temporal filters using LlamaIndex and KDB.AI vector store\n",
+    "\n",
+    "> [KDB.AI](https://kdb.ai/) is a powerful knowledge-based vector database and search engine that allows you to build scalable, reliable AI applications, using real-time data, by providing advanced search, recommendation and personalization.\n",
+    "\n",
+    "This example demonstrates how to use KDB.AI to run semantic search, summarization and analysis of financial regulations around some specific moment in time.\n",
+    "\n",
+    "To access your end point and API keys, sign up to KDB.AI here.\n",
+    "\n",
+    "To set up your development environment, follow the instructions on the KDB.AI pre-requisites page.\n",
+    "\n",
+    "The following examples demonstrate some of the ways you can interact with KDB.AI through LlamaIndex."
+   ]
+  },
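+  {
+   "cell_type": "markdown",
+   "id": "a1b2c3d4-e5f6-4a7b-8c9d-0e1f2a3b4c5d",
+   "metadata": {},
+   "source": [
+    "A note on the temporal filters used below: KDB.AI metadata filters are lists of `(operator, column, value)` triples applied at search time. For example, the following restricts a search to documents published before the 2008 crisis (this is the same syntax used in the query engine cells later in this notebook):\n",
+    "\n",
+    "```python\n",
+    "filter = [(\"<\", \"publication_date\", \"2008-09-15\")]\n",
+    "```"
+   ]
+  },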
+  {
+   "cell_type": "markdown",
+   "id": "32f36ddd-aa4d-4284-a236-be3028758ae2",
+   "metadata": {},
+   "source": [
+    "## Install dependencies with Pip"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1530006e-dbcb-4783-8f7a-5155f4edc1d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip install llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai\n",
+    "# %pip install kdbai_client pandas"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68ba14b7-1208-4494-93ec-ce1930b7bf5b",
+   "metadata": {},
+   "source": [
+    "## Import dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4ca21f7-819c-4abb-8479-e7c6f3175f34",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "OpenAI API key:  ········\n"
+     ]
+    }
+   ],
+   "source": [
+    "from getpass import getpass\n",
+    "import re\n",
+    "import os\n",
+    "import shutil\n",
+    "import time\n",
+    "import urllib\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from llama_index.core import (\n",
+    "    Settings,\n",
+    "    SimpleDirectoryReader,\n",
+    "    ServiceContext,\n",
+    "    StorageContext,\n",
+    "    VectorStoreIndex,\n",
+    ")\n",
+    "from llama_index.core.node_parser import SentenceSplitter\n",
+    "from llama_index.core.retrievers import VectorIndexRetriever\n",
+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
+    "from llama_index.llms.openai import OpenAI\n",
+    "from llama_index.vector_stores.kdbai import KDBAIVectorStore\n",
+    "\n",
+    "import pykx as kx\n",
+    "import kdbai_client as kdbai\n",
+    "\n",
+    "OUTDIR = \"pdf\"\n",
+    "RESET = True\n",
+    "\n",
+    "# LLM = 'gpt-3.5-turbo'\n",
+    "LLM = \"gpt-4-turbo-preview\"  # Expensive !!!\n",
+    "EMBEDDING = \"sentence-transformers/all-mpnet-base-v2\"\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ca073f5-3d84-4b0e-8684-1396c1311fb8",
+   "metadata": {},
+   "source": [
+    "## Create KDB.AI session and table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a5bb2fb-4ffc-4bfd-b825-36c2cf50367c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "KDBAI_ENDPOINT = \"http://localhost:8082\"\n",
+    "KDBAI_API_KEY = None\n",
+    "KDBAI_TABLE_NAME = \"reports\"\n",
+    "\n",
+    "session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)\n",
+    "\n",
+    "if KDBAI_TABLE_NAME in session.list():\n",
+    "    session.table(KDBAI_TABLE_NAME).drop()\n",
+    "\n",
+    "schema = dict(\n",
+    "    columns=[\n",
+    "        dict(name=\"document_id\", pytype=\"bytes\"),\n",
+    "        dict(name=\"text\", pytype=\"bytes\"),\n",
+    "        dict(\n",
+    "            name=\"embedding\",\n",
+    "            vectorIndex=dict(type=\"flat\", metric=\"L2\", dims=768),\n",
+    "        ),\n",
+    "        dict(name=\"title\", pytype=\"bytes\"),\n",
+    "        dict(name=\"publication_date\", pytype=\"datetime64[ns]\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "table = session.create_table(KDBAI_TABLE_NAME, schema)"
+   ]
+  },
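+  {
+   "cell_type": "markdown",
+   "id": "b7c8d9e0-f1a2-4b3c-8d4e-5f6a7b8c9d0e",
+   "metadata": {},
+   "source": [
+    "Note: `dims=768` matches the output dimension of the `sentence-transformers/all-mpnet-base-v2` embedding model used below. If you swap the embedding model, update the vector index dimensions to match."
+   ]
+  },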
+  {
+   "cell_type": "markdown",
+   "id": "a208460c-2c87-4a3f-9926-65d6dcc4b45d",
+   "metadata": {},
+   "source": [
+    "## Financial reports urls and metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6143a9ec-7d48-4f61-bb86-a1de427a0279",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "INPUT_URLS = [\n",
+    "    \"https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf\",\n",
+    "    \"https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf\",\n",
+    "]\n",
+    "\n",
+    "METADATA = {\n",
+    "    \"pdf/PLAW-106publ102.pdf\": {\n",
+    "        \"title\": \"GRAMM–LEACH–BLILEY ACT, 1999\",\n",
+    "        \"publication_date\": pd.to_datetime(\"1999-11-12\"),\n",
+    "    },\n",
+    "    \"pdf/PLAW-111publ203.pdf\": {\n",
+    "        \"title\": \"DODD-FRANK WALL STREET REFORM AND CONSUMER PROTECTION ACT, 2010\",\n",
+    "        \"publication_date\": pd.to_datetime(\"2010-07-21\"),\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e1e6c6c5-f151-4c01-a9a1-ab1d540402eb",
+   "metadata": {},
+   "source": [
+    "## Download PDF files locally"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ee36757-b0b7-4478-a71a-601220048a05",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf...\n",
+      "Downloading https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf...\n",
+      "CPU times: user 33 ms, sys: 25.4 ms, total: 58.4 ms\n",
+      "Wall time: 6.09 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "CHUNK_SIZE = 512 * 1024\n",
+    "\n",
+    "\n",
+    "def download_file(url):\n",
+    "    print(\"Downloading %s...\" % url)\n",
+    "    out = os.path.join(OUTDIR, os.path.basename(url))\n",
+    "    try:\n",
+    "        response = urllib.request.urlopen(url)\n",
+    "    except urllib.error.URLError as e:\n",
+    "        logging.exception(\"Failed to download %s !\" % url)\n",
+    "    else:\n",
+    "        with open(out, \"wb\") as f:\n",
+    "            while True:\n",
+    "                chunk = response.read(CHUNK_SIZE)\n",
+    "                if chunk:\n",
+    "                    f.write(chunk)\n",
+    "                else:\n",
+    "                    break\n",
+    "    return out\n",
+    "\n",
+    "\n",
+    "if RESET:\n",
+    "    if os.path.exists(OUTDIR):\n",
+    "        shutil.rmtree(OUTDIR)\n",
+    "    os.mkdir(OUTDIR)\n",
+    "\n",
+    "    local_files = [download_file(x) for x in INPUT_URLS]\n",
+    "    local_files[:10]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10d52ad8-b9bd-459e-9ce4-c370982a149f",
+   "metadata": {},
+   "source": [
+    "## Load local PDF files with LlamaIndex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9714258b-f58a-4964-9d3f-0298a98b87e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 3.94 s, sys: 22.5 ms, total: 3.96 s\n",
+      "Wall time: 3.96 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "994"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "\n",
+    "def get_metadata(filepath):\n",
+    "    return METADATA[filepath]\n",
+    "\n",
+    "\n",
+    "documents = SimpleDirectoryReader(\n",
+    "    input_files=local_files,\n",
+    "    file_metadata=get_metadata,\n",
+    ")\n",
+    "\n",
+    "docs = documents.load_data()\n",
+    "len(docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f3ba953-4034-4421-acf0-dbac33dfed67",
+   "metadata": {},
+   "source": [
+    "## Setup LlamaIndex RAG pipeline using KDB.AI vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ed27c4b-a979-4d4f-9f17-7c6bd9844d9a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1min 23s, sys: 3min 3s, total: 4min 27s\n",
+      "Wall time: 32.4 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "embed_model = HuggingFaceEmbedding(model_name=EMBEDDING)\n",
+    "llm = OpenAI(temperature=0, model=LLM)\n",
+    "vector_store = KDBAIVectorStore(table)\n",
+    "service_context = ServiceContext.from_defaults(\n",
+    "    embed_model=embed_model, llm=llm\n",
+    ")\n",
+    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+    "index = VectorStoreIndex.from_documents(\n",
+    "    docs,\n",
+    "    service_context=service_context,\n",
+    "    storage_context=storage_context,\n",
+    "    transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a0ca610-4038-41c2-a1f9-5c8af9d764ce",
+   "metadata": {},
+   "source": [
+    "## Setup the LlamaIndex Query Engine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7d29b82-c1b8-4880-869e-dabc8bdedaa4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 24.1 ms, sys: 3.74 ms, total: 27.9 ms\n",
+      "Wall time: 26.6 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.\n",
+    "# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.\n",
+    "K = 100\n",
+    "\n",
+    "query_engine = index.as_query_engine(\n",
+    "    similarity_top_k=K,\n",
+    "    filter=[(\"<\", \"publication_date\", \"2008-09-15\")],\n",
+    "    sort_by=\"publication_date\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e442d238-03cf-41d1-9ea4-9d435bb30278",
+   "metadata": {},
+   "source": [
+    "## Before the 2008 crisis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d791e0e-67b0-4179-a4c4-a1f5fd6d765b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Before the 2008 financial crisis, the main financial regulation in the US included a variety of laws and regulatory measures, but one of the most significant frameworks was established by the Gramm-Leach-Bliley Act of 1999. This act repealed parts of the Glass-Steagall Act of 1933, allowing banks to offer a broader range of financial services, including investment, commercial banking, and insurance services. Other regulatory measures and entities, such as the Securities and Exchange Commission (SEC) and laws like the Sarbanes-Oxley Act of 2002, also played key roles in the financial regulatory landscape prior to the crisis.\n",
+      "CPU times: user 202 ms, sys: 53.5 ms, total: 256 ms\n",
+      "Wall time: 18.1 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = query_engine.query(\n",
+    "    \"\"\"\n",
+    "What was the main financial regulation in the US before the 2008 financial crisis ?\n",
+    "\"\"\"\n",
+    ")\n",
+    "print(result.response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee5c52ed-fc1a-4f4f-8cc4-efbb4e5a067f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The Gramm-Leach-Bliley Act of 1999, also known as the Financial Services Modernization Act, aimed to modernize financial services by removing barriers between banking, securities, and insurance companies, allowing them to offer each other's services. While the Act contributed to financial services integration and competition, its effectiveness in preventing crises like that of 2008 is debatable due to its strengths and weaknesses in regulating the US stock market.\n",
+      "\n",
+      "Strengths:\n",
+      "1. Enhanced Competition: By allowing financial institutions to merge and offer a broader range of services, the Act fostered competition, innovation, and efficiency in the financial sector.\n",
+      "2. Functional Regulation: The Act maintained that activities within financial institutions would be regulated by the appropriate functional regulator (e.g., securities activities by the SEC), aiming for expertise-based oversight.\n",
+      "\n",
+      "Weaknesses:\n",
+      "1. Increased Systemic Risk: The Act's facilitation of larger, more complex financial institutions may have contributed to systemic risk, as failures of these institutions could have more significant impacts on the financial system.\n",
+      "2. Regulatory Gaps and Oversight Challenges: The integration of different financial services under one roof made it challenging for regulators to oversee and manage the risks of these conglomerates effectively. The Act did not fully address the need for a systemic risk regulator or enhance oversight of the shadow banking system, which played a significant role in the 2008 crisis.\n",
+      "3. Weakened Consumer Privacy Protections: While the Act included provisions for protecting consumers' personal financial information, critics argue that it also allowed for increased sharing of this information among financial entities, potentially undermining consumer privacy.\n",
+      "\n",
+      "In summary, while the Gramm-Leach-Bliley Act of 1999 had the potential to foster innovation and efficiency in the financial sector by breaking down barriers between different types of financial services, its weaknesses in addressing systemic risk and regulatory oversight challenges may have limited its effectiveness in preventing financial crises like that of 2008.\n",
+      "CPU times: user 177 ms, sys: 45.6 ms, total: 223 ms\n",
+      "Wall time: 31.6 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = query_engine.query(\n",
+    "    \"\"\"\n",
+    "Is the Gramm-Leach-Bliley Act of 1999 enough to prevent the 2008 crisis. Search the document and explain its strenghts and weaknesses to regulate the US stock market.\n",
+    "\"\"\"\n",
+    ")\n",
+    "print(result.response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5c6b52ee-2086-4e50-8e88-6ad6920cc8bc",
+   "metadata": {},
+   "source": [
+    "## After the 2008 crisis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37753e54-959b-43a3-b596-7cef0e94d4ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 217 µs, sys: 99 µs, total: 316 µs\n",
+      "Wall time: 320 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.\n",
+    "# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.\n",
+    "K = 100\n",
+    "\n",
+    "query_engine = index.as_query_engine(\n",
+    "    similarity_top_k=K,\n",
+    "    filter=[(\">=\", \"publication_date\", \"2008-09-15\")],\n",
+    "    sort_by=\"publication_date\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "445ebab3-4431-4f75-a07d-c998c98b7cfd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I'm unable to provide an answer based on the given instructions.\n",
+      "CPU times: user 151 ms, sys: 22 ms, total: 173 ms\n",
+      "Wall time: 12.7 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = query_engine.query(\n",
+    "    \"\"\"\n",
+    "What happened on the 15th of September 2008 ? Answer from your own knowledge only.\n",
+    "\"\"\"\n",
+    ")\n",
+    "print(result.response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a05c539e-85c1-4592-808b-07d68e68e032",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The Dodd-Frank Wall Street Reform and Consumer Protection Act, 2010.\n",
+      "CPU times: user 184 ms, sys: 23.1 ms, total: 207 ms\n",
+      "Wall time: 17.1 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = query_engine.query(\n",
+    "    \"\"\"\n",
+    "What was the new US financial regulation enacted after the 2008 crisis to increase the market regulation and to improve consumer sentiment ?\n",
+    "\"\"\"\n",
+    ")\n",
+    "print(result.response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f06802cf-c241-4131-8a5d-529ea3933e59",
+   "metadata": {},
+   "source": [
+    "## In depth analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67c2240b-7b0d-4bd8-8c19-fcf7e5ba429c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 381 µs, sys: 2 µs, total: 383 µs\n",
+      "Wall time: 399 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.\n",
+    "# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.\n",
+    "K = 100\n",
+    "\n",
+    "query_engine = index.as_query_engine(\n",
+    "    similarity_top_k=K, sort_by=\"publication_date\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5fcb92b-7e2f-4945-82c7-08bffd20a052",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Before the 2008 financial crisis, the US financial system was characterized by deregulation and an increase in complex financial products such as mortgage-backed securities and derivatives. The Gramm-Leach-Bliley Act of 1999 repealed the Glass-Steagall Act, allowing banks to engage in investment activities, which led to increased risk-taking. The lack of transparency and understanding of these complex financial products, coupled with inadequate oversight, contributed to the financial crisis.\n",
+      "\n",
+      "After the 2008 crisis, the Dodd-Frank Wall Street Reform and Consumer Protection Act was enacted in 2010 to address the regulatory gaps and weaknesses revealed by the crisis. The Act aimed to increase transparency, protect consumers, and prevent the occurrence of a similar crisis. Key provisions included the creation of the Financial Stability Oversight Council to monitor systemic risk, the establishment of the Consumer Financial Protection Bureau to protect consumers from abusive financial practices, and the introduction of the Volcker Rule to limit speculative investments by banks. Additionally, the Act imposed stricter capital requirements and introduced mechanisms for the orderly liquidation of failing financial institutions to prevent bailouts.\n",
+      "\n",
+      "To ensure that a similar crisis does not happen again, it is crucial to maintain vigilant regulatory oversight, promote transparency in financial markets, and ensure that financial institutions have robust risk management practices in place. Continuous monitoring of systemic risks and the ability to adapt regulations in response to evolving financial products and practices are also essential.\n",
+      "\n",
+      "This analysis is based on the context provided and my own knowledge of the US financial regulations before and after the 2008 crisis.\n",
+      "CPU times: user 1.11 s, sys: 1.99 s, total: 3.1 s\n",
+      "Wall time: 29.8 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = query_engine.query(\n",
+    "    \"\"\"\n",
+    "Analyse the US financial regulations before and after the 2008 crisis and produce a report of all related arguments to explain what happened, and to ensure that does not happen again.\n",
+    "Use both the provided context and your own knowledge but do mention explicitely which one you use.\n",
+    "\"\"\"\n",
+    ")\n",
+    "print(result.response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/BUILD
new file mode 100644
index 0000000000..0896ca890d
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/Makefile
new file mode 100644
index 0000000000..b9eab05aa3
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/README.md
new file mode 100644
index 0000000000..b34a3d40e0
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/README.md
@@ -0,0 +1 @@
+# LlamaIndex Vector_Stores Integration: KDB.AI
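+
+A minimal usage sketch (assumes a running KDB.AI server and an already existing table; the endpoint and table name below are placeholders):
+
+```python
+import kdbai_client as kdbai
+
+from llama_index.vector_stores.kdbai import KDBAIVectorStore
+
+# Connect to a KDB.AI server and open an existing table.
+session = kdbai.Session(endpoint="http://localhost:8082", api_key=None)
+table = session.table("reports")
+
+# Wrap the table as a LlamaIndex vector store.
+vector_store = KDBAIVectorStore(table)
+```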
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/BUILD
new file mode 100644
index 0000000000..db46e8d6c9
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/__init__.py
new file mode 100644
index 0000000000..2a2bbd60a2
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/__init__.py
@@ -0,0 +1,4 @@
+from llama_index.vector_stores.kdbai.base import KDBAIVectorStore
+
+
+__all__ = ["KDBAIVectorStore"]
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py
new file mode 100644
index 0000000000..006cd6b26a
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py
@@ -0,0 +1,214 @@
+"""KDB.AI vector store index.
+
+An index that is built within KDB.AI.
+
+"""
+
+import logging
+from typing import Any, List, Callable, Optional
+
+import pandas as pd
+
+from llama_index.core.bridge.pydantic import PrivateAttr
+from llama_index.core.schema import BaseNode, TextNode
+from llama_index.core.vector_stores.types import (
+    BasePydanticVectorStore,
+    VectorStoreQuery,
+    VectorStoreQueryResult,
+)
+from llama_index.vector_stores.kdbai.utils import default_sparse_encoder
+
+DEFAULT_COLUMN_NAMES = ["document_id", "text", "embedding"]
+
+DEFAULT_BATCH_SIZE = 100
+
+
+# INITIALISE LOGGER AND SET FORMAT
+logger = logging.getLogger(__name__)
+
+
+# MATCH THE METADATA COLUMN DATA TYPE TO ITS PYTYPE
+def convert_metadata_col(column, value):
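+    """Cast a metadata value to the pytype declared in the table schema."""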
+    try:
+        if column["pytype"] == "str":
+            return str(value)
+        elif column["pytype"] == "bytes":
+            return value.encode("utf-8")
+        elif column["pytype"] == "datetime64[ns]":
+            return pd.to_datetime(value)
+        elif column["pytype"] == "timedelta64[ns]":
+            return pd.to_timedelta(value)
+        return value.astype(column["pytype"])
+    except Exception as e:
+        logger.error(
+            f"Failed to convert column {column['name']} to type {column['pytype']}: {e}"
+        )
+
+
+class KDBAIVectorStore(BasePydanticVectorStore):
+    """The KDBAI Vector Store.
+
+    In this vector store we store the text, its embedding and
+    its metadata in a KDB.AI vector store table. This implementation
+    allows the use of an already existing table.
+
+    Args:
+        table (kdbai.Table): The KDB.AI table to use as storage.
+        batch_size (int, optional): batch size to insert data.
+            Default is 100.
+
+    Returns:
+        KDBAIVectorStore: Vectorstore that supports add and query.
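+
+    Example:
+        A minimal sketch, assuming a running KDB.AI server and an
+        existing table named ``reports``::
+
+            import kdbai_client as kdbai
+
+            session = kdbai.Session(endpoint="http://localhost:8082")
+            table = session.table("reports")
+            vector_store = KDBAIVectorStore(table)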
+    """
+
+    stores_text: bool = True
+    flat_metadata: bool = True
+
+    hybrid_search: bool = False
+    batch_size: int
+
+    _table: Any = PrivateAttr()
+    _sparse_encoder: Optional[Callable] = PrivateAttr()
+
+    def __init__(
+        self,
+        table: Any = None,
+        hybrid_search: bool = False,
+        sparse_encoder: Optional[Callable] = None,
+        batch_size: int = DEFAULT_BATCH_SIZE,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize params."""
+        try:
+            import kdbai_client as kdbai
+
+            logger.info("KDBAI client version: " + kdbai.__version__)
+
+        except ImportError:
+            raise ValueError(
+                "Could not import kdbai_client package."
+                "Please add it to the dependencies."
+            )
+
+        if table is None:
+            raise ValueError("Must provide an existing KDB.AI table.")
+        else:
+            self._table = table
+
+        if hybrid_search:
+            if sparse_encoder is None:
+                self._sparse_encoder = default_sparse_encoder
+            else:
+                self._sparse_encoder = sparse_encoder
+
+        super().__init__(batch_size=batch_size, hybrid_search=hybrid_search)
+
+    @property
+    def client(self) -> Any:
+        """Return KDB.AI client."""
+        return self._table
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "KDBAIVectorStore"
+
+    def add(
+        self,
+        nodes: List[BaseNode],
+        **add_kwargs: Any,
+    ) -> List[str]:
+        """Add nodes to the KDBAI Vector Store.
+
+        Args:
+            nodes (List[BaseNode]): List of nodes to be added.
+
+        Returns:
+            List[str]: List of document IDs that were added.
+        """
+        df = pd.DataFrame()
+        docs = []
+        schema = self._table.schema()["columns"]
+        if self.hybrid_search:
+            schema = [item for item in schema if item["name"] != "sparseVectors"]
+
+        try:
+            for node in nodes:
+                doc = {
+                    "document_id": node.node_id.encode("utf-8"),
+                    "text": node.text.encode("utf-8"),
+                    "embedding": node.embedding,
+                }
+
+                if self.hybrid_search:
+                    doc["sparseVectors"] = self._sparse_encoder([node.get_content()])
+
+                # handle extra columns
+                if len(schema) > len(DEFAULT_COLUMN_NAMES):
+                    for column in schema[len(DEFAULT_COLUMN_NAMES) :]:
+                        try:
+                            doc[column["name"]] = convert_metadata_col(
+                                column, node.metadata[column["name"]]
+                            )
+                        except Exception as e:
+                            logger.error(
+                                f"Error writing column {column['name']} as type {column['pytype']}: {e}."
+                            )
+
+                docs.append(doc)
+
+            df = pd.DataFrame(docs)
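+            # Insert in batches of batch_size rows: ceil(len(df) / batch_size) batches.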
+            for i in range((len(df) - 1) // self.batch_size + 1):
+                batch = df.iloc[i * self.batch_size : (i + 1) * self.batch_size]
+                try:
+                    self._table.insert(batch, warn=False)
+                    logger.info(f"inserted batch {i}")
+                except Exception as e:
+                    logger.exception(
+                        f"Failed to insert batch {i} of documents into the datastore: {e}"
+                    )
+
+            return list(df["document_id"])
+
+        except Exception as e:
+            logger.error(f"Error preparing data for KDB.AI: {e}.")
+
+    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
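+        """Query the KDB.AI table.
+
+        Args:
+            query (VectorStoreQuery): query embedding, top k, and optional
+                metadata filters passed through to the KDB.AI search.
+
+        Returns:
+            VectorStoreQueryResult: matching nodes, ids and scores
+                (nearest-neighbor distances).
+        """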
+        filter = query.filters if query.filters is not None else []
+
+        if self.hybrid_search:
+            alpha = query.alpha if query.alpha is not None else 0.5
+            sparse_vectors = self._sparse_encoder([query.query_str])
+            results = self._table.hybrid_search(
+                dense_vectors=[query.query_embedding],
+                sparse_vectors=sparse_vectors,
+                n=query.similarity_top_k,
+                filter=filter,
+                alpha=alpha,
+            )[0]
+        else:
+            results = self._table.search(
+                vectors=[query.query_embedding], n=query.similarity_top_k, filter=filter
+            )[0]
+
+        top_k_nodes = []
+        top_k_ids = []
+        top_k_scores = []
+
+        for result in results.to_dict(orient="records"):
+            metadata = {
+                x: result[x]
+                for x in result
+                if x not in (*DEFAULT_COLUMN_NAMES, "__nn_distance")
+            }
+            node = TextNode(
+                text=result["text"], id_=result["document_id"], metadata=metadata
+            )
+            top_k_ids.append(result["document_id"])
+            top_k_nodes.append(node)
+            top_k_scores.append(result["__nn_distance"])
+
+        return VectorStoreQueryResult(
+            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
+        )
+
+    def delete(self, **delete_kwargs: Any) -> None:
+        raise NotImplementedError("Delete is not yet implemented for KDBAIVectorStore.")
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py
new file mode 100644
index 0000000000..f690dba786
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py
@@ -0,0 +1,22 @@
+from typing import List, Dict
+
+
+def default_sparse_encoder(texts: List[str]) -> List[Dict[int, int]]:
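+    """Encode each text as a sparse vector: a dict mapping BERT WordPiece
+    token ids to term counts (a simple bag-of-words encoding used for
+    hybrid search when no custom sparse encoder is supplied).
+    """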
+    try:
+        from transformers import BertTokenizer
+        from collections import Counter
+    except ImportError:
+        raise ImportError(
+            "Could not import transformers library. "
+            'Please install transformers with `pip install "transformers"`'
+        )
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    results = []
+    for text in texts:
+        tokenized_text = tokenizer(text, padding=True, truncation=True, max_length=512)[
+            "input_ids"
+        ]
+        sparse_encoding = dict(Counter(tokenized_text))
+        results.append(sparse_encoding)
+    return results
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml
new file mode 100644
index 0000000000..f3c9586617
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml
@@ -0,0 +1,67 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+# Feel free to un-skip examples, and experimental, you will just need to
+# work through many typos (--write-changes and --interactive will help)
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.llamahub]
+contains_example = false
+import_path = "llama_index.vector_stores.kdbai"
+
+[tool.llamahub.class_authors]
+KDBAIVectorStore = "llama-index"
+
+[tool.mypy]
+disallow_untyped_defs = true
+# Remove venv skip when integrated with pre-commit
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = ">=3.8.1,<4.0"
+
+[tool.poetry]
+authors = ["Your Name <you@example.com>"]
+description = "llama-index vector_stores kdbai integration"
+exclude = ["**/BUILD"]
+license = "MIT"
+name = "llama-index-vector-stores-kdbai"
+readme = "README.md"
+version = "0.1.3"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+llama-index-core = "^0.10.0"
+pykx = "^2.1.1"
+kdbai-client = "^0.1.2"
+
+[tool.poetry.group.dev.dependencies]
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"
+types-setuptools = "67.1.0.0"
+
+[tool.poetry.group.dev.dependencies.black]
+extras = ["jupyter"]
+version = "<=23.9.1,>=23.7.0"
+
+[tool.poetry.group.dev.dependencies.codespell]
+extras = ["toml"]
+version = ">=v2.2.6"
+
+[[tool.poetry.packages]]
+include = "llama_index/"
-- 
GitLab