From 805807f839a4415792608ae953af76e17aa82a20 Mon Sep 17 00:00:00 2001
From: jamescalam <james.briggs@hotmail.com>
Date: Wed, 27 Nov 2024 15:43:36 +0100
Subject: [PATCH] feat: optimized sparse embedding interface

---
 Makefile                              |  4 +-
 docs/encoders/aurelio-bm25.ipynb      |  4 +-
 docs/examples/hybrid-router.ipynb     |  4 +-
 docs/examples/pinecone-hybrid.ipynb   | 54 +++++++++++-----------
 semantic_router/encoders/aurelio.py   | 13 ++++--
 semantic_router/encoders/base.py      |  2 +-
 semantic_router/encoders/tfidf.py     |  7 ++-
 semantic_router/index/hybrid_local.py | 12 +++--
 semantic_router/index/pinecone.py     |  4 +-
 semantic_router/routers/base.py       |  5 +-
 semantic_router/routers/hybrid.py     |  8 ++--
 semantic_router/routers/semantic.py   |  5 +-
 semantic_router/schema.py             | 66 +++++++++++++++++----------
 tests/unit/test_sync.py               |  8 +++-
 14 files changed, 109 insertions(+), 87 deletions(-)

diff --git a/Makefile b/Makefile
index 8283b96d..979a97e5 100644
--- a/Makefile
+++ b/Makefile
@@ -7,8 +7,8 @@ lint: PYTHON_FILES=.
 lint_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$')
 
 lint lint_diff:
-	poetry run black --target-version py39 -l 88 $(PYTHON_FILES) --check
-	poetry run ruff .
+	poetry run black --target-version py311 -l 88 $(PYTHON_FILES) --check
+	poetry run ruff check .
 	poetry run mypy $(PYTHON_FILES)
 
 test:
diff --git a/docs/encoders/aurelio-bm25.ipynb b/docs/encoders/aurelio-bm25.ipynb
index e74f1e1b..9e4b7852 100644
--- a/docs/encoders/aurelio-bm25.ipynb
+++ b/docs/encoders/aurelio-bm25.ipynb
@@ -153,9 +153,7 @@
     "    \"Enter OpenAI API Key: \"\n",
     ")\n",
     "\n",
-    "encoder = OpenAIEncoder(\n",
-    "    name=\"text-embedding-3-small\", score_threshold=0.3\n",
-    ")"
+    "encoder = OpenAIEncoder(name=\"text-embedding-3-small\", score_threshold=0.3)"
    ]
   },
   {
diff --git a/docs/examples/hybrid-router.ipynb b/docs/examples/hybrid-router.ipynb
index e52c5752..71e57ca7 100644
--- a/docs/examples/hybrid-router.ipynb
+++ b/docs/examples/hybrid-router.ipynb
@@ -155,9 +155,7 @@
     "from semantic_router.routers import HybridRouter\n",
     "\n",
     "router = HybridRouter(\n",
-    "    encoder=dense_encoder,\n",
-    "    sparse_encoder=sparse_encoder,\n",
-    "    routes=routes\n",
+    "    encoder=dense_encoder, sparse_encoder=sparse_encoder, routes=routes\n",
     ")"
    ]
   },
diff --git a/docs/examples/pinecone-hybrid.ipynb b/docs/examples/pinecone-hybrid.ipynb
index 134b6e0d..b354907f 100644
--- a/docs/examples/pinecone-hybrid.ipynb
+++ b/docs/examples/pinecone-hybrid.ipynb
@@ -53,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -90,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -143,7 +143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -153,9 +153,7 @@
     "    \"Enter OpenAI API Key: \"\n",
     ")\n",
     "\n",
-    "encoder = OpenAIEncoder(\n",
-    "    name=\"text-embedding-3-small\", score_threshold=0.3\n",
-    ")"
+    "encoder = OpenAIEncoder(name=\"text-embedding-3-small\", score_threshold=0.3)"
    ]
   },
   {
@@ -167,16 +165,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-11-26 22:34:54 - pinecone_plugin_interface.logging - INFO - discover_namespace_packages.py:12 - discover_subpackages() - Discovering subpackages in _NamespacePath(['/Users/jamesbriggs/Library/Caches/pypoetry/virtualenvs/semantic-router-C1zr4a78-py3.12/lib/python3.12/site-packages/pinecone_plugins'])\n",
-      "2024-11-26 22:34:54 - pinecone_plugin_interface.logging - INFO - discover_plugins.py:9 - discover_plugins() - Looking for plugins in pinecone_plugins.inference\n",
-      "2024-11-26 22:34:54 - pinecone_plugin_interface.logging - INFO - installation.py:10 - install_plugins() - Installing plugin inference into Pinecone\n"
+      "2024-11-27 15:41:32 - pinecone_plugin_interface.logging - INFO - discover_namespace_packages.py:12 - discover_subpackages() - Discovering subpackages in _NamespacePath(['/Users/jamesbriggs/Library/Caches/pypoetry/virtualenvs/semantic-router-C1zr4a78-py3.12/lib/python3.12/site-packages/pinecone_plugins'])\n",
+      "2024-11-27 15:41:32 - pinecone_plugin_interface.logging - INFO - discover_plugins.py:9 - discover_plugins() - Looking for plugins in pinecone_plugins.inference\n",
+      "2024-11-27 15:41:32 - pinecone_plugin_interface.logging - INFO - installation.py:10 - install_plugins() - Installing plugin inference into Pinecone\n"
      ]
     }
    ],
@@ -203,7 +201,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -226,16 +224,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "True"
+       "False"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -253,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -272,7 +270,7 @@
        " \"  politics: why don't you tell me about your political opinions\"]"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -290,7 +288,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -309,7 +307,7 @@
        " Utterance(route='politics', utterance=\"why don't you tell me about your political opinions\", function_schemas=None, metadata={}, diff_tag=' ')]"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -327,7 +325,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -349,7 +347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -358,7 +356,7 @@
        "True"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -369,7 +367,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -388,7 +386,7 @@
        " \"  politics: why don't you tell me about your political opinions\"]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -406,14 +404,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-11-26 22:35:56 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
+      "2024-11-27 15:42:03 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
      ]
     },
     {
@@ -422,7 +420,7 @@
        "RouteChoice(name=None, function_call=None, similarity_score=None)"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -440,7 +438,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-11-26 22:35:20 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
+      "2024-11-27 15:42:06 - httpx - INFO - _client.py:1013 - _send_single_request() - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
      ]
     },
     {
diff --git a/semantic_router/encoders/aurelio.py b/semantic_router/encoders/aurelio.py
index d226e3d3..8824b2f1 100644
--- a/semantic_router/encoders/aurelio.py
+++ b/semantic_router/encoders/aurelio.py
@@ -12,16 +12,17 @@ class AurelioSparseEncoder(SparseEncoder):
     model: Optional[Any] = None
     idx_mapping: Optional[Dict[int, int]] = None
     client: AurelioClient = Field(default_factory=AurelioClient, exclude=True)
-    async_client: AsyncAurelioClient = Field(default_factory=AsyncAurelioClient, exclude=True)
+    async_client: AsyncAurelioClient = Field(
+        default_factory=AsyncAurelioClient, exclude=True
+    )
     type: str = "sparse"
 
     def __init__(
         self,
         name: str = "bm25",
-        score_threshold: float = 1.0,
         api_key: Optional[str] = None,
     ):
-        super().__init__(name=name, score_threshold=score_threshold)
+        super().__init__(name=name)
         if api_key is None:
             api_key = os.getenv("AURELIO_API_KEY")
         if api_key is None:
@@ -33,9 +34,11 @@ class AurelioSparseEncoder(SparseEncoder):
         res: EmbeddingResponse = self.client.embedding(input=docs, model=self.name)
         embeds = [SparseEmbedding.from_aurelio(r.embedding) for r in res.data]
         return embeds
-    
+
     async def acall(self, docs: list[str]) -> list[SparseEmbedding]:
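+        # async variant of __call__, embedding via the AsyncAurelioClient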
-        res: EmbeddingResponse = await self.async_client.embedding(input=docs, model=self.name)
+        res: EmbeddingResponse = await self.async_client.embedding(
+            input=docs, model=self.name
+        )
         embeds = [SparseEmbedding.from_aurelio(r.embedding) for r in res.data]
         return embeds
 
diff --git a/semantic_router/encoders/base.py b/semantic_router/encoders/base.py
index f2cee15d..1bcf3d9d 100644
--- a/semantic_router/encoders/base.py
+++ b/semantic_router/encoders/base.py
@@ -35,4 +35,4 @@ class SparseEncoder(BaseModel):
         raise NotImplementedError("Subclasses must implement this method")
 
     def acall(self, docs: List[str]) -> Coroutine[Any, Any, List[SparseEmbedding]]:
-        raise NotImplementedError("Subclasses must implement this method")
\ No newline at end of file
+        raise NotImplementedError("Subclasses must implement this method")
diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py
index 873d900a..1bec7b9f 100644
--- a/semantic_router/encoders/tfidf.py
+++ b/semantic_router/encoders/tfidf.py
@@ -14,9 +14,8 @@ class TfidfEncoder(SparseEncoder):
     idf: ndarray = np.array([])
     word_index: Dict = {}
 
-    def __init__(self, name: str = "tfidf", score_threshold: float = 0.82):
-        # TODO default score_threshold not thoroughly tested, should optimize
-        super().__init__(name=name, score_threshold=score_threshold)
+    def __init__(self, name: str = "tfidf"):
+        super().__init__(name=name)
         self.word_index = {}
         self.idf = np.array([])
 
@@ -29,7 +28,7 @@ class TfidfEncoder(SparseEncoder):
         docs = [self._preprocess(doc) for doc in docs]
         tf = self._compute_tf(docs)
         tfidf = tf * self.idf
-        return tfidf.tolist()
+        return tfidf
 
     def fit(self, routes: List[Route]):
         docs = []
diff --git a/semantic_router/index/hybrid_local.py b/semantic_router/index/hybrid_local.py
index 28a3d83b..f2821422 100644
--- a/semantic_router/index/hybrid_local.py
+++ b/semantic_router/index/hybrid_local.py
@@ -66,15 +66,19 @@ class HybridLocalIndex(LocalIndex):
             "dimensions": self.index.shape[1] if self.index is not None else 0,
             "vectors": self.index.shape[0] if self.index is not None else 0,
         }
-    
-    def _sparse_dot_product(self, vec_a: dict[int, float], vec_b: dict[int, float]) -> float:
+
+    def _sparse_dot_product(
+        self, vec_a: dict[int, float], vec_b: dict[int, float]
+    ) -> float:
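+        # vec_a and vec_b map sparse dimension index -> value; only indices
+        # present in both vectors contribute to the dot product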
         # switch vecs to ensure first is smallest for more efficiency
         if len(vec_a) > len(vec_b):
             vec_a, vec_b = vec_b, vec_a
         return sum(vec_a[i] * vec_b.get(i, 0) for i in vec_a)
-    
+
     def _sparse_index_dot_product(self, vec_a: dict[int, float]) -> list[float]:
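+        # score the query's sparse vector against every stored sparse vector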
-        dot_products = [self._sparse_dot_product(vec_a, vec_b) for vec_b in self.sparse_index]
+        dot_products = [
+            self._sparse_dot_product(vec_a, vec_b) for vec_b in self.sparse_index
+        ]
         return dot_products
 
     def query(
diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py
index 5eb0aecb..bb6ed3ef 100644
--- a/semantic_router/index/pinecone.py
+++ b/semantic_router/index/pinecone.py
@@ -652,7 +652,9 @@ class PineconeIndex(BaseIndex):
             )
 
     def __len__(self):
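+        # count the vectors stored under this index's namespace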
-        namespace_stats = self.index.describe_index_stats()["namespaces"].get(self.namespace)
+        namespace_stats = self.index.describe_index_stats()["namespaces"].get(
+            self.namespace
+        )
         if namespace_stats:
             return namespace_stats["vector_count"]
         else:
diff --git a/semantic_router/routers/base.py b/semantic_router/routers/base.py
index 1af3e38b..087eb57c 100644
--- a/semantic_router/routers/base.py
+++ b/semantic_router/routers/base.py
@@ -4,7 +4,7 @@ import os
 import random
 import hashlib
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from pydantic.v1 import BaseModel, Field, validator
+from pydantic.v1 import BaseModel, Field
 
 import numpy as np
 import yaml  # type: ignore
@@ -380,8 +380,7 @@ class BaseRouter(BaseModel):
             self.index = index
 
     def _init_index_state(self):
-        """Initializes an index (where required) and runs auto_sync if active.
-        """
+        """Initializes an index (where required) and runs auto_sync if active."""
         # initialize index now, check if we need dimensions
         if self.index.dimensions is None:
             dims = len(self.encoder(["test"])[0])
diff --git a/semantic_router/routers/hybrid.py b/semantic_router/routers/hybrid.py
index f8ec89cd..0feb8379 100644
--- a/semantic_router/routers/hybrid.py
+++ b/semantic_router/routers/hybrid.py
@@ -60,7 +60,7 @@ class HybridRouter(BaseRouter):
         # run initialize index now if auto sync is active
         if self.auto_sync:
             self._init_index_state()
-    
+
     def _set_sparse_encoder(self, sparse_encoder: Optional[DenseEncoder]):
         if sparse_encoder is None:
             logger.warning("No sparse_encoder provided. Using default BM25Encoder.")
@@ -126,7 +126,7 @@ class HybridRouter(BaseRouter):
             vector=np.array(vector) if isinstance(vector, list) else vector,
             top_k=self.top_k,
             route_filter=route_filter,
-            sparse_vector=sparse_vector[0]
+            sparse_vector=sparse_vector[0],
         )
         top_class, top_class_scores = self._semantic_classify(
             list(zip(scores, route_names))
@@ -142,7 +142,9 @@ class HybridRouter(BaseRouter):
         scaled_dense = np.array(dense) * self.alpha
         scaled_sparse = []
         for sparse_dict in sparse:
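+            # sparse values are scaled by (1 - alpha), complementing the dense
+            # vector's alpha weighting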
-            scaled_sparse.append({k: v * (1 - self.alpha) for k, v in sparse_dict.items()})
+            scaled_sparse.append(
+                {k: v * (1 - self.alpha) for k, v in sparse_dict.items()}
+            )
         return scaled_dense, scaled_sparse
 
     def _set_aggregation_method(self, aggregation: str = "sum"):
diff --git a/semantic_router/routers/semantic.py b/semantic_router/routers/semantic.py
index e8a7db14..8a21fdf2 100644
--- a/semantic_router/routers/semantic.py
+++ b/semantic_router/routers/semantic.py
@@ -1,15 +1,12 @@
 import json
 import random
 from typing import Any, Dict, List, Optional, Tuple, Union
-from pydantic.v1 import validator, Field
 
 import numpy as np
 from tqdm.auto import tqdm
 
-from semantic_router.encoders import AutoEncoder, DenseEncoder, OpenAIEncoder
+from semantic_router.encoders import AutoEncoder, DenseEncoder
 from semantic_router.index.base import BaseIndex
-from semantic_router.index.local import LocalIndex
-from semantic_router.index.pinecone import PineconeIndex
 from semantic_router.llms import BaseLLM, OpenAILLM
 from semantic_router.route import Route
 from semantic_router.routers.base import BaseRouter, RouterConfig
diff --git a/semantic_router/schema.py b/semantic_router/schema.py
index 2d00572f..507ea349 100644
--- a/semantic_router/schema.py
+++ b/semantic_router/schema.py
@@ -1,11 +1,13 @@
 from datetime import datetime
 from difflib import Differ
 from enum import Enum
+import numpy as np
 from typing import List, Optional, Union, Any, Dict, Tuple
 from pydantic.v1 import BaseModel, Field
 from semantic_router.utils.logger import logger
 from aurelio_sdk.schema import BM25Embedding
 
+
 class EncoderType(Enum):
     AURELIO = "aurelio"
     AZURE = "azure"
@@ -406,37 +408,53 @@ class Metric(Enum):
     MANHATTAN = "manhattan"
 
 
-class SparseValue(BaseModel):
-    index: int
-    value: float
-
-
 class SparseEmbedding(BaseModel):
-    embedding: List[SparseValue]
+    """Sparse embedding interface. Primarily uses numpy operations for faster
+    operations.
+    """
+    embedding: np.ndarray
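+    # embedding layout: (n, 2) array, column 0 = index positions, column 1 = values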
 
-    def to_dict(self):
-        return {x.index: x.value for x in self.embedding}
+    class Config:
+        arbitrary_types_allowed = True
+
+    @classmethod
+    def from_array(cls, array: np.ndarray):
+        if array.ndim != 2 or array.shape[1] != 2:
+            raise ValueError(
+                f"Expected a 2D array with 2 columns, got a {array.ndim}D array with {array.shape[1]} columns. "
+                "Column 0 should contain index positions, and column 1 should contain respective values."
+            )
+        return cls(embedding=array)
     
-    def to_pinecone(self):
-        return {
-            "indices": [x.index for x in self.embedding],
-            "values": [x.value for x in self.embedding],
-        }
+    @classmethod
+    def from_aurelio(cls, embedding: BM25Embedding):
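+        # BM25Embedding provides parallel indices/values lists; stack them into
+        # the (n, 2) column format expected by from_array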
+        arr = np.array([embedding.indices, embedding.values]).T
+        return cls.from_array(arr)
     
     @classmethod
     def from_dict(cls, sparse_dict: dict):
-        return cls(embedding=[SparseValue(index=i, value=v) for i, v in sparse_dict.items()])
+        arr = np.array([list(sparse_dict.keys()), list(sparse_dict.values())]).T
+        return cls.from_array(arr)
     
-    @classmethod
-    def from_aurelio(cls, embedding: BM25Embedding):
-        return cls(embedding=[
-            SparseValue(
-                index=i,
-                value=v
-            ) for i, v in zip(embedding.indices, embedding.values)
-        ])
+    def to_dict(self):
+        return {
+            i: v
+            for i, v in zip(self.embedding[:, 0].astype(int), self.embedding[:, 1])
+        }
+
+    def to_pinecone(self):
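+        # Pinecone sparse vectors use parallel "indices" and "values" lists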
+        return {
+            "indices": self.embedding[:, 0].astype(int).tolist(),
+            "values": self.embedding[:, 1].tolist(),
+        }
     
     # dictionary interface
     def items(self):
-        return [(x.index, x.value) for x in self.embedding]
-
+        return [
+            (i, v)
+            for i, v in zip(self.embedding[:, 0].astype(int), self.embedding[:, 1])
+        ]
diff --git a/tests/unit/test_sync.py b/tests/unit/test_sync.py
index 8405add9..ea4b8d41 100644
--- a/tests/unit/test_sync.py
+++ b/tests/unit/test_sync.py
@@ -223,7 +223,9 @@ class TestSemanticRouter:
         _ = SemanticRouter(
             encoder=openai_encoder, routes=routes, index=index, auto_sync="local"
         )
-        route_layer = SemanticRouter(encoder=openai_encoder, routes=routes_2, index=index)
+        route_layer = SemanticRouter(
+            encoder=openai_encoder, routes=routes_2, index=index
+        )
         if index_cls is PineconeIndex:
             time.sleep(PINECONE_SLEEP)  # allow for index to be populated
         assert route_layer.is_synced() is False
@@ -236,7 +238,9 @@ class TestSemanticRouter:
         _ = SemanticRouter(
             encoder=openai_encoder, routes=routes, index=index, auto_sync="local"
         )
-        route_layer_2 = SemanticRouter(encoder=openai_encoder, routes=routes_2, index=index)
+        route_layer_2 = SemanticRouter(
+            encoder=openai_encoder, routes=routes_2, index=index
+        )
         if index_cls is PineconeIndex:
             time.sleep(PINECONE_SLEEP)  # allow for index to be populated
         diff = route_layer_2.get_utterance_diff(include_metadata=True)
-- 
GitLab