From 6bbafa4c17262ba3a16378fe9313cbf1bdf557cb Mon Sep 17 00:00:00 2001 From: jamescalam <james.briggs@hotmail.com> Date: Wed, 27 Nov 2024 18:59:12 +0100 Subject: [PATCH] feat: compatibility for tfidf --- semantic_router/encoders/base.py | 17 ++++++++++++++++- semantic_router/encoders/bm25.py | 16 ++++++---------- semantic_router/encoders/tfidf.py | 5 +++-- semantic_router/schema.py | 11 ++++++++--- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/semantic_router/encoders/base.py b/semantic_router/encoders/base.py index 1bcf3d9d..dcfe9aa3 100644 --- a/semantic_router/encoders/base.py +++ b/semantic_router/encoders/base.py @@ -1,6 +1,7 @@ from typing import Any, Coroutine, List, Optional from pydantic.v1 import BaseModel, Field, validator +import numpy as np from semantic_router.schema import SparseEmbedding @@ -34,5 +35,19 @@ class SparseEncoder(BaseModel): def __call__(self, docs: List[str]) -> List[SparseEmbedding]: raise NotImplementedError("Subclasses must implement this method") - def acall(self, docs: List[str]) -> Coroutine[Any, Any, List[SparseEmbedding]]: + async def acall(self, docs: List[str]) -> Coroutine[Any, Any, List[SparseEmbedding]]: raise NotImplementedError("Subclasses must implement this method") + + def _array_to_sparse_embeddings(self, sparse_arrays: np.ndarray) -> List[SparseEmbedding]: + """Consumes a 2D array whose rows are sparse vectors containing zero-values and returns a list of SparseEmbedding objects, each built from a compact (column index, value) array of one row's non-zero entries. Raises ValueError if the input is not 2D. NOTE(review): the list only runs up to the last row that has a non-zero value, so trailing all-zero rows get no embedding - confirm callers expect this. 
+ """ + if sparse_arrays.ndim != 2: + raise ValueError(f"Expected a 2D array, got a {sparse_arrays.ndim}D array.") + # get coordinates of non-zero values + coords = np.nonzero(sparse_arrays) + # create compact array with one (row, column, value) triple per non-zero entry + compact_array = np.array([coords[0], coords[1], sparse_arrays[coords]]).T + arr_range = range(compact_array[:, 0].max().astype(int) + 1) + arrs = [compact_array[compact_array[:, 0] == i, :][:, 1:3] for i in arr_range] + return [SparseEmbedding.from_compact_array(arr) for arr in arrs] + diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py index 4eac26e7..0ec36499 100644 --- a/semantic_router/encoders/bm25.py +++ b/semantic_router/encoders/bm25.py @@ -1,10 +1,11 @@ from typing import Any, Dict, List, Optional -from semantic_router.encoders import SparseEncoder +from semantic_router.encoders.tfidf import TfidfEncoder from semantic_router.utils.logger import logger +from semantic_router.schema import SparseEmbedding -class BM25Encoder(SparseEncoder): +class BM25Encoder(TfidfEncoder): model: Optional[Any] = None idx_mapping: Optional[Dict[int, int]] = None type: str = "sparse" @@ -12,10 +13,9 @@ class BM25Encoder(SparseEncoder): def __init__( self, name: str = "bm25", - score_threshold: float = 0.82, use_default_params: bool = True, ): - super().__init__(name=name, score_threshold=score_threshold) + super().__init__(name=name) try: from pinecone_text.sparse import BM25Encoder as encoder except ImportError: @@ -40,12 +40,13 @@ class BM25Encoder(SparseEncoder): else: raise TypeError("Expected a dictionary for 'doc_freq'") - def __call__(self, docs: List[str]) -> List[List[float]]: + def __call__(self, docs: List[str]) -> list[SparseEmbedding]: if self.model is None or self.idx_mapping is None: raise ValueError("Model or index mapping is not initialized.") if len(docs) == 1: sparse_dicts = self.model.encode_queries(docs) elif len(docs) > 1: + print(docs) sparse_dicts = self.model.encode_documents(docs) else: raise ValueError("No 
documents to encode.") @@ -60,8 +61,3 @@ class BM25Encoder(SparseEncoder): embeds[i][position] = val return embeds - def fit(self, docs: List[str]): - if self.model is None: - raise ValueError("Model is not initialized.") - self.model.fit(docs) - self._set_idx_mapping() diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py index 1bec7b9f..d9d97a47 100644 --- a/semantic_router/encoders/tfidf.py +++ b/semantic_router/encoders/tfidf.py @@ -8,6 +8,7 @@ from numpy.linalg import norm from semantic_router.encoders import SparseEncoder from semantic_router.route import Route +from semantic_router.schema import SparseEmbedding class TfidfEncoder(SparseEncoder): @@ -19,7 +20,7 @@ class TfidfEncoder(SparseEncoder): self.word_index = {} self.idf = np.array([]) - def __call__(self, docs: List[str]) -> List[List[float]]: + def __call__(self, docs: List[str]) -> list[SparseEmbedding]: if len(self.word_index) == 0 or self.idf.size == 0: raise ValueError("Vectorizer is not initialized.") if len(docs) == 0: @@ -28,7 +29,7 @@ class TfidfEncoder(SparseEncoder): docs = [self._preprocess(doc) for doc in docs] tf = self._compute_tf(docs) tfidf = tf * self.idf - return tfidf + return self._array_to_sparse_embeddings(tfidf) def fit(self, routes: List[Route]): docs = [] diff --git a/semantic_router/schema.py b/semantic_router/schema.py index 9596af50..616416e7 100644 --- a/semantic_router/schema.py +++ b/semantic_router/schema.py @@ -419,23 +419,28 @@ class SparseEmbedding(BaseModel): arbitrary_types_allowed = True @classmethod - def from_array(cls, array: np.ndarray): + def from_compact_array(cls, array: np.ndarray): if array.ndim != 2 or array.shape[1] != 2: raise ValueError( f"Expected a 2D array with 2 columns, got a {array.ndim}D array with {array.shape[1]} columns. " "Column 0 should contain index positions, and column 1 should contain respective values." 
) return cls(embedding=array) + + @classmethod + def from_array(cls, array: np.ndarray): + """Consumes a single sparse vector which contains zero-values. NOTE(review): this patch adds only the docstring, so the method has no body yet and calling it returns None - implementation still to be written. + """ @classmethod def from_aurelio(cls, embedding: BM25Embedding): arr = np.array([embedding.indices, embedding.values]).T - return cls.from_array(arr) + return cls.from_compact_array(arr) @classmethod def from_dict(cls, sparse_dict: dict): arr = np.array([list(sparse_dict.keys()), list(sparse_dict.values())]).T - return cls.from_array(arr) + return cls.from_compact_array(arr) def to_dict(self): return { -- GitLab