Skip to content
Snippets Groups Projects
Commit 6bbafa4c authored by jamescalam's avatar jamescalam
Browse files

feat: compatibility for tfidf

parent a96f6b85
No related branches found
No related tags found
No related merge requests found
from typing import Any, Coroutine, List, Optional from typing import Any, Coroutine, List, Optional
from pydantic.v1 import BaseModel, Field, validator from pydantic.v1 import BaseModel, Field, validator
import numpy as np
from semantic_router.schema import SparseEmbedding from semantic_router.schema import SparseEmbedding
...@@ -34,5 +35,19 @@ class SparseEncoder(BaseModel): ...@@ -34,5 +35,19 @@ class SparseEncoder(BaseModel):
def __call__(self, docs: List[str]) -> List[SparseEmbedding]: def __call__(self, docs: List[str]) -> List[SparseEmbedding]:
raise NotImplementedError("Subclasses must implement this method") raise NotImplementedError("Subclasses must implement this method")
def acall(self, docs: List[str]) -> Coroutine[Any, Any, List[SparseEmbedding]]: async def acall(self, docs: List[str]) -> Coroutine[Any, Any, List[SparseEmbedding]]:
raise NotImplementedError("Subclasses must implement this method") raise NotImplementedError("Subclasses must implement this method")
def _array_to_sparse_embeddings(self, sparse_arrays: np.ndarray) -> List[SparseEmbedding]:
    """Convert a dense 2D array of sparse vectors into compact sparse embeddings.

    Each row of ``sparse_arrays`` is treated as one vector that may contain
    zero-values. For every row, the non-zero entries are packed into a
    two-column compact array (column 0: index position, column 1: value)
    and passed to ``SparseEmbedding.from_compact_array``.

    :param sparse_arrays: 2D array with one (mostly zero) vector per row.
    :return: Exactly one ``SparseEmbedding`` per input row, in row order.
        A row without any non-zero value yields an embedding built from an
        empty ``(0, 2)`` compact array.
    :raises ValueError: If ``sparse_arrays`` is not 2-dimensional.
    """
    if sparse_arrays.ndim != 2:
        raise ValueError(f"Expected a 2D array, got a {sparse_arrays.ndim}D array.")
    embeddings = []
    # Iterate over the actual rows instead of deriving the row count from the
    # largest row index that has a non-zero value: that approach raises on an
    # all-zero input (max of an empty array) and silently drops trailing
    # all-zero rows, which misaligns the embeddings with the input documents.
    for row in sparse_arrays:
        # positions of the non-zero values within this row
        positions = np.flatnonzero(row)
        # (n_nonzero, 2) compact array: [index position, value]
        compact = np.column_stack((positions, row[positions]))
        embeddings.append(SparseEmbedding.from_compact_array(compact))
    return embeddings
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from semantic_router.encoders import SparseEncoder from semantic_router.encoders.tfidf import TfidfEncoder
from semantic_router.utils.logger import logger from semantic_router.utils.logger import logger
from semantic_router.schema import SparseEmbedding
class BM25Encoder(SparseEncoder): class BM25Encoder(TfidfEncoder):
model: Optional[Any] = None model: Optional[Any] = None
idx_mapping: Optional[Dict[int, int]] = None idx_mapping: Optional[Dict[int, int]] = None
type: str = "sparse" type: str = "sparse"
...@@ -12,10 +13,9 @@ class BM25Encoder(SparseEncoder): ...@@ -12,10 +13,9 @@ class BM25Encoder(SparseEncoder):
def __init__( def __init__(
self, self,
name: str = "bm25", name: str = "bm25",
score_threshold: float = 0.82,
use_default_params: bool = True, use_default_params: bool = True,
): ):
super().__init__(name=name, score_threshold=score_threshold) super().__init__(name=name)
try: try:
from pinecone_text.sparse import BM25Encoder as encoder from pinecone_text.sparse import BM25Encoder as encoder
except ImportError: except ImportError:
...@@ -40,12 +40,13 @@ class BM25Encoder(SparseEncoder): ...@@ -40,12 +40,13 @@ class BM25Encoder(SparseEncoder):
else: else:
raise TypeError("Expected a dictionary for 'doc_freq'") raise TypeError("Expected a dictionary for 'doc_freq'")
def __call__(self, docs: List[str]) -> List[List[float]]: def __call__(self, docs: List[str]) -> list[SparseEmbedding]:
if self.model is None or self.idx_mapping is None: if self.model is None or self.idx_mapping is None:
raise ValueError("Model or index mapping is not initialized.") raise ValueError("Model or index mapping is not initialized.")
if len(docs) == 1: if len(docs) == 1:
sparse_dicts = self.model.encode_queries(docs) sparse_dicts = self.model.encode_queries(docs)
elif len(docs) > 1: elif len(docs) > 1:
print(docs)
sparse_dicts = self.model.encode_documents(docs) sparse_dicts = self.model.encode_documents(docs)
else: else:
raise ValueError("No documents to encode.") raise ValueError("No documents to encode.")
...@@ -60,8 +61,3 @@ class BM25Encoder(SparseEncoder): ...@@ -60,8 +61,3 @@ class BM25Encoder(SparseEncoder):
embeds[i][position] = val embeds[i][position] = val
return embeds return embeds
def fit(self, docs: List[str]):
if self.model is None:
raise ValueError("Model is not initialized.")
self.model.fit(docs)
self._set_idx_mapping()
...@@ -8,6 +8,7 @@ from numpy.linalg import norm ...@@ -8,6 +8,7 @@ from numpy.linalg import norm
from semantic_router.encoders import SparseEncoder from semantic_router.encoders import SparseEncoder
from semantic_router.route import Route from semantic_router.route import Route
from semantic_router.schema import SparseEmbedding
class TfidfEncoder(SparseEncoder): class TfidfEncoder(SparseEncoder):
...@@ -19,7 +20,7 @@ class TfidfEncoder(SparseEncoder): ...@@ -19,7 +20,7 @@ class TfidfEncoder(SparseEncoder):
self.word_index = {} self.word_index = {}
self.idf = np.array([]) self.idf = np.array([])
def __call__(self, docs: List[str]) -> List[List[float]]: def __call__(self, docs: List[str]) -> list[SparseEmbedding]:
if len(self.word_index) == 0 or self.idf.size == 0: if len(self.word_index) == 0 or self.idf.size == 0:
raise ValueError("Vectorizer is not initialized.") raise ValueError("Vectorizer is not initialized.")
if len(docs) == 0: if len(docs) == 0:
...@@ -28,7 +29,7 @@ class TfidfEncoder(SparseEncoder): ...@@ -28,7 +29,7 @@ class TfidfEncoder(SparseEncoder):
docs = [self._preprocess(doc) for doc in docs] docs = [self._preprocess(doc) for doc in docs]
tf = self._compute_tf(docs) tf = self._compute_tf(docs)
tfidf = tf * self.idf tfidf = tf * self.idf
return tfidf return self._array_to_sparse_embeddings(tfidf)
def fit(self, routes: List[Route]): def fit(self, routes: List[Route]):
docs = [] docs = []
......
...@@ -419,23 +419,28 @@ class SparseEmbedding(BaseModel): ...@@ -419,23 +419,28 @@ class SparseEmbedding(BaseModel):
arbitrary_types_allowed = True arbitrary_types_allowed = True
@classmethod @classmethod
def from_array(cls, array: np.ndarray): def from_compact_array(cls, array: np.ndarray):
if array.ndim != 2 or array.shape[1] != 2: if array.ndim != 2 or array.shape[1] != 2:
raise ValueError( raise ValueError(
f"Expected a 2D array with 2 columns, got a {array.ndim}D array with {array.shape[1]} columns. " f"Expected a 2D array with 2 columns, got a {array.ndim}D array with {array.shape[1]} columns. "
"Column 0 should contain index positions, and column 1 should contain respective values." "Column 0 should contain index positions, and column 1 should contain respective values."
) )
return cls(embedding=array) return cls(embedding=array)
@classmethod
def from_array(cls, array: np.ndarray):
    """Build a sparse embedding from a single sparse vector which contains zero-values.

    The non-zero entries of ``array`` are packed into a two-column compact
    array (column 0: index position, column 1: value) and handed to
    ``from_compact_array``.

    :param array: 1D vector that may contain zero-values.
    :return: The embedding created by ``from_compact_array``.
    :raises ValueError: If ``array`` is not 1-dimensional.
    """
    # NOTE(review): previously this method had no body and returned None;
    # the 1D-only contract is taken from the docstring ("a single sparse
    # vector") - confirm against callers.
    if array.ndim != 1:
        raise ValueError(f"Expected a 1D array, got a {array.ndim}D array.")
    # positions of the non-zero values in the vector
    positions = np.flatnonzero(array)
    return cls.from_compact_array(np.column_stack((positions, array[positions])))
@classmethod @classmethod
def from_aurelio(cls, embedding: BM25Embedding): def from_aurelio(cls, embedding: BM25Embedding):
arr = np.array([embedding.indices, embedding.values]).T arr = np.array([embedding.indices, embedding.values]).T
return cls.from_array(arr) return cls.from_compact_array(arr)
@classmethod @classmethod
def from_dict(cls, sparse_dict: dict): def from_dict(cls, sparse_dict: dict):
arr = np.array([list(sparse_dict.keys()), list(sparse_dict.values())]).T arr = np.array([list(sparse_dict.keys()), list(sparse_dict.values())]).T
return cls.from_array(arr) return cls.from_compact_array(arr)
def to_dict(self): def to_dict(self):
return { return {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment