From 09cc05f1d84c9c811e94b466e4c7b2372b3dac3b Mon Sep 17 00:00:00 2001
From: jamescalam <james.briggs@hotmail.com>
Date: Sun, 24 Nov 2024 19:51:10 +0100
Subject: [PATCH] feat: integrating Pinecone with hybrid

---
 docs/examples/pinecone-hybrid.ipynb | 463 ++++++++++++++++++++++++++++
 semantic_router/index/pinecone.py   |  25 +-
 2 files changed, 482 insertions(+), 6 deletions(-)
 create mode 100644 docs/examples/pinecone-hybrid.ipynb

diff --git a/docs/examples/pinecone-hybrid.ipynb b/docs/examples/pinecone-hybrid.ipynb
new file mode 100644
index 00000000..f18eef6e
--- /dev/null
+++ b/docs/examples/pinecone-hybrid.ipynb
@@ -0,0 +1,463 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aurelio-labs/semantic-router/blob/main/docs/examples/pinecone-hybrid.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/aurelio-labs/semantic-router/blob/main/docs/examples/pinecone-hybrid.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using PineconeIndex for Hybrid Routes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Hybrid indexes combine both sparse and dense encodings to produce more accurate results. The dense encoder allows us to search based on semantic meaning, while the sparse encoder allows us to search based on text matches. Merging both dense and sparse into a single hybrid retrieval step allows us to step up our performance beyond what dense-only or sparse-only could achieve."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Getting Started"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We start by installing semantic-router. Support for the new `AurelioSparseEncoder` parameter was added in `semantic-router==0.1.0`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qU \"semantic-router[pinecone]==0.1.0\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We start by defining a dictionary mapping routes to example phrases that should trigger those routes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/jamesbriggs/Library/Caches/pypoetry/virtualenvs/semantic-router-C1zr4a78-py3.12/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from semantic_router import Route\n",
+    "\n",
+    "politics = Route(\n",
+    "    name=\"politics\",\n",
+    "    utterances=[\n",
+    "        \"isn't politics the best thing ever\",\n",
+    "        \"why don't you tell me about your political opinions\",\n",
+    "        \"don't you just love the president\",\n",
+    "        \"don't you just hate the president\",\n",
+    "        \"they're going to destroy this country!\",\n",
+    "        \"they will save the country!\",\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's define another for good measure:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chitchat = Route(\n",
+    "    name=\"chitchat\",\n",
+    "    utterances=[\n",
+    "        \"how's the weather today?\",\n",
+    "        \"how are things going?\",\n",
+    "        \"lovely weather today\",\n",
+    "        \"the weather is horrendous\",\n",
+    "        \"let's go to the chippy\",\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "routes = [politics, chitchat]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we initialize our embedding models. We are going to use a hybrid index which requires both a dense and sparse encoder. For the sparse encoder we will use the pretrained `bm25` model from the Aurelio Platform and OpenAI's `text-embedding-3-small` for the dense encoder.\n",
+    "\n",
+    "To get an API key for the Aurelio Platform, we head to the [Aurelio Platform](https://platform.aurelio.ai/settings/api-keys)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "from semantic_router.encoders.aurelio import AurelioSparseEncoder\n",
+    "\n",
+    "os.environ[\"AURELIO_API_KEY\"] = os.getenv(\"AURELIO_API_KEY\") or getpass(\n",
+    "    \"Enter Aurelio API Key: \"\n",
+    ")\n",
+    "\n",
+    "sparse_encoder = AurelioSparseEncoder(name=\"bm25\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Sparse encoders return dictionaries containing the indices and values of the non-zero elements in the sparse matrix."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from semantic_router.encoders import OpenAIEncoder\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = os.getenv(\"OPENAI_API_KEY\") or getpass(\n",
+    "    \"Enter OpenAI API Key: \"\n",
+    ")\n",
+    "\n",
+    "encoder = OpenAIEncoder(\n",
+    "    name=\"text-embedding-3-small\", score_threshold=0.3\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now have both our sparse and dense encoders. When using both sparse and dense encoders we need to initialize an index that supports hybrid, such as the `HybridLocalIndex` or `PineconeIndex`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-11-24 19:41:05 - pinecone_plugin_interface.logging - INFO - discover_namespace_packages.py:12 - discover_subpackages() - Discovering subpackages in _NamespacePath(['/Users/jamesbriggs/Library/Caches/pypoetry/virtualenvs/semantic-router-C1zr4a78-py3.12/lib/python3.12/site-packages/pinecone_plugins'])\n",
+      "2024-11-24 19:41:05 - pinecone_plugin_interface.logging - INFO - discover_plugins.py:9 - discover_plugins() - Looking for plugins in pinecone_plugins.inference\n",
+      "2024-11-24 19:41:05 - pinecone_plugin_interface.logging - INFO - installation.py:10 - install_plugins() - Installing plugin inference into Pinecone\n"
+     ]
+    }
+   ],
+   "source": [
+    "from semantic_router.index import PineconeIndex\n",
+    "\n",
+    "os.environ[\"PINECONE_API_KEY\"] = os.getenv(\"PINECONE_API_KEY\") or getpass(\n",
+    "    \"Enter Pinecone API Key: \"\n",
+    ")\n",
+    "\n",
+    "index = PineconeIndex(\n",
+    "    index_name=\"hybrid-test\",\n",
+    "    dimensions=1536,\n",
+    "    metric=\"dotproduct\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we define the `HybridRouter`. When called, the router will consume text (a query) and output the category (`Route`) it belongs to — to initialize a `HybridRouter` we need an `encoder`, a `sparse_encoder`, our `routes`, and the hybrid `index` we just defined."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-11-24 19:41:15 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "2024-11-24 19:41:17 - semantic_router.utils.logger - WARNING - pinecone.py:247 - add() - TEMP | add:\n",
+      "politics: isn't politics the best thing ever\n",
+      "politics: why don't you tell me about your political opinions\n",
+      "politics: don't you just love the president\n",
+      "politics: don't you just hate the president\n",
+      "politics: they're going to destroy this country!\n",
+      "politics: they will save the country!\n",
+      "2024-11-24 19:41:17 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "2024-11-24 19:41:18 - semantic_router.utils.logger - WARNING - pinecone.py:247 - add() - TEMP | add:\n",
+      "chitchat: how's the weather today?\n",
+      "chitchat: how are things going?\n",
+      "chitchat: lovely weather today\n",
+      "chitchat: the weather is horrendous\n",
+      "chitchat: let's go to the chippy\n"
+     ]
+    }
+   ],
+   "source": [
+    "from semantic_router.routers import HybridRouter\n",
+    "\n",
+    "router = HybridRouter(\n",
+    "    encoder=encoder,\n",
+    "    sparse_encoder=sparse_encoder,\n",
+    "    routes=routes,\n",
+    "    index=index,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's see if our local and remote instances are synchronized..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-11-24 19:42:06 - semantic_router.utils.logger - WARNING - pinecone.py:424 - _read_hash() - Configuration for hash parameter not found in index.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "router.is_synced()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It seems like our `router` is not synchronized, meaning there are differences between the utterances in our local `HybridRouter` and the remote `PineconeIndex`. We can view the differences by calling `get_utterance_diff()`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['- chitchat: how are things going?',\n",
+       " \"- chitchat: how's the weather today?\",\n",
+       " \"- chitchat: let's go to the chippy\",\n",
+       " '- chitchat: lovely weather today',\n",
+       " '- chitchat: the weather is horrendous',\n",
+       " \"- politics: don't you just hate the president\",\n",
+       " \"- politics: don't you just love the president\",\n",
+       " \"- politics: isn't politics the best thing ever\",\n",
+       " '- politics: they will save the country!',\n",
+       " \"- politics: they're going to destroy this country!\",\n",
+       " \"- politics: why don't you tell me about your political opinions\"]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "router.get_utterance_diff()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "From this, we can see that every utterance is preceded by a `-`, meaning it is unique to the local `HybridRouter`. So it seems our `PineconeIndex` is missing all utterances. We can confirm this further by calling `router.index.get_utterances()` to see all utterances in the remote `PineconeIndex`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "router.index.get_utterances()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As expected, we have no utterances in the remote `PineconeIndex`. The reason for this is that when initializing our `HybridRouter` we did not specify an `auto_sync` parameter, so `auto_sync` defaulted to `None`. When `auto_sync=None` no synchronization is performed during initialization. Let's try again with `auto_sync=\"local\"`, meaning take what we have locally and overwrite the remote `PineconeIndex` with these local values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-11-24 19:48:29 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "2024-11-24 19:48:31 - semantic_router.utils.logger - WARNING - pinecone.py:247 - add() - TEMP | add:\n",
+      "politics: isn't politics the best thing ever\n",
+      "politics: why don't you tell me about your political opinions\n",
+      "politics: don't you just love the president\n",
+      "politics: don't you just hate the president\n",
+      "politics: they're going to destroy this country!\n",
+      "politics: they will save the country!\n",
+      "2024-11-24 19:48:31 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "2024-11-24 19:48:32 - semantic_router.utils.logger - WARNING - pinecone.py:247 - add() - TEMP | add:\n",
+      "chitchat: how's the weather today?\n",
+      "chitchat: how are things going?\n",
+      "chitchat: lovely weather today\n",
+      "chitchat: the weather is horrendous\n",
+      "chitchat: let's go to the chippy\n"
+     ]
+    }
+   ],
+   "source": [
+    "router = HybridRouter(\n",
+    "    encoder=encoder,\n",
+    "    sparse_encoder=sparse_encoder,\n",
+    "    routes=routes,\n",
+    "    index=index,\n",
+    "    auto_sync=\"local\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's check our sync state:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "router.is_synced()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "... NEED TO FINISH HERE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "router(\"I'm interested in learning about llama 2\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this case, we return `None` because no matches were identified. We always recommend optimizing your `RouteLayer` for the best performance; you can see how in [this notebook](https://github.com/aurelio-labs/semantic-router/blob/main/docs/06-threshold-optimization.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "decision-layer",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py
index 0a3856d8..2d432d33 100644
--- a/semantic_router/index/pinecone.py
+++ b/semantic_router/index/pinecone.py
@@ -22,6 +22,7 @@ def clean_route_name(route_name: str) -> str:
 class PineconeRecord(BaseModel):
     id: str = ""
     values: List[float]
+    sparse_values: Optional[dict[int, float]] = None
     route: str
     utterance: str
     function_schema: str = "{}"
@@ -42,11 +43,17 @@ class PineconeRecord(BaseModel):
         )
 
     def to_dict(self):
-        return {
+        d = {
             "id": self.id,
             "values": self.values,
             "metadata": self.metadata,
         }
+        if self.sparse_values:
+            d["sparse_values"] = {
+                "indices": list(self.sparse_values.keys()),
+                "values": list(self.sparse_values.values()),
+            }
+        return d
 
 
 class PineconeIndex(BaseIndex):
@@ -54,7 +61,7 @@ class PineconeIndex(BaseIndex):
     api_key: Optional[str] = None
     index_name: str = "index"
     dimensions: Union[int, None] = None
-    metric: str = "cosine"
+    metric: str = "dotproduct"
     cloud: str = "aws"
     region: str = "us-west-2"
     host: str = ""
@@ -70,7 +77,7 @@ class PineconeIndex(BaseIndex):
         api_key: Optional[str] = None,
         index_name: str = "index",
         dimensions: Optional[int] = None,
-        metric: str = "cosine",
+        metric: str = "dotproduct",
         cloud: str = "aws",
         region: str = "us-west-2",
         host: str = "",
@@ -233,6 +240,7 @@ class PineconeIndex(BaseIndex):
         function_schemas: Optional[List[Dict[str, Any]]] = None,
         metadata_list: List[Dict[str, Any]] = [],
         batch_size: int = 100,
+        sparse_embeddings: Optional[List[dict[int, float]]] = None,
     ):
         """Add vectors to Pinecone in batches."""
         temp = "\n".join([f"{x[0]}: {x[1]}" for x in zip(routes, utterances)])
@@ -240,17 +248,22 @@ class PineconeIndex(BaseIndex):
         if self.index is None:
             self.dimensions = self.dimensions or len(embeddings[0])
             self.index = self._init_index(force_create=True)
+        if function_schemas is None:
+            function_schemas = [None] * len(embeddings)
+        if sparse_embeddings is None:
+            sparse_embeddings = [None] * len(embeddings)
 
         vectors_to_upsert = [
             PineconeRecord(
                 values=vector,
+                sparse_values=sparse_dict,
                 route=route,
                 utterance=utterance,
                 function_schema=json.dumps(function_schema),
                 metadata=metadata,
             ).to_dict()
-            for vector, route, utterance, function_schema, metadata in zip(
-                embeddings, routes, utterances, function_schemas, metadata_list  # type: ignore
+            for vector, route, utterance, function_schema, metadata, sparse_dict in zip(
+                embeddings, routes, utterances, function_schemas, metadata_list, sparse_embeddings  # type: ignore
             )
         ]
 
@@ -523,7 +536,7 @@ class PineconeIndex(BaseIndex):
         dimension: int,
         cloud: str,
         region: str,
-        metric: str = "cosine",
+        metric: str = "dotproduct",
     ):
         params = {
             "name": name,
-- 
GitLab