From e80eaa40b28c11b71bf79cc164e0ccaf3edb74c7 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 12:42:12 -0800 Subject: [PATCH] lint and del tfidf --- semantic_router/__init__.py | 2 +- semantic_router/encoders/bm25.py | 3 ++- semantic_router/encoders/tfidf.py | 37 ------------------------------- semantic_router/layer.py | 28 +++++++++-------------- semantic_router/rankers/cohere.py | 10 ++++----- 5 files changed, 18 insertions(+), 62 deletions(-) delete mode 100644 semantic_router/encoders/tfidf.py diff --git a/semantic_router/__init__.py b/semantic_router/__init__.py index ac1e314e..734906f8 100644 --- a/semantic_router/__init__.py +++ b/semantic_router/__init__.py @@ -1,3 +1,3 @@ from .layer import DecisionLayer, HybridDecisionLayer -__all__ = ["DecisionLayer", "HybridDecisionLayer"] \ No newline at end of file +__all__ = ["DecisionLayer", "HybridDecisionLayer"] diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py index 0f3985be..0d498197 100644 --- a/semantic_router/encoders/bm25.py +++ b/semantic_router/encoders/bm25.py @@ -12,7 +12,8 @@ class BM25Encoder(BaseEncoder): # initialize BM25 encoder with default params (trained on MSMarco) self.model = encoder.default() self.idx_mapping = { - idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) + idx: i + for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) } def __call__(self, docs: list[str]) -> list[list[float]]: diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py deleted file mode 100644 index 5dc7f34d..00000000 --- a/semantic_router/encoders/tfidf.py +++ /dev/null @@ -1,37 +0,0 @@ -from functools import partial - -from sklearn.feature_extraction.text import TfidfVectorizer - -from semantic_router.encoders import BaseEncoder - - -class TfidfEncoder(BaseEncoder): - model: encoder | None = None - - def __init__(self, name: str = "bm25"): - super().__init__(name=name) - # initialize BM25 encoder with default params (trained on MSMarco) - self.model = encoder.default() - self.idx_mapping = { - idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) - } - - def __call__(self, docs: list[str]) -> list[list[float]]: - if len(docs) == 1: - sparse_dicts = self.model.encode_query(docs[0]) - elif len(docs) > 1: - sparse_dicts = self.model.encode_documents(docs) - else: - raise ValueError("No documents to encode.") - # convert sparse dict to sparse vector - embeds = [0.0] * len(self.idx_mapping) - for output in sparse_dicts: - indices = output["indices"] - values = output["values"] - for idx, val in zip(indices, values): - position = self.idx_mapping[idx] - embeds[position] = val - return embeds - - def fit(self, docs: list[str]): - self.model.fit(docs) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 0c00e916..adff961c 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -6,7 +6,7 @@ from semantic_router.encoders import ( BaseEncoder, CohereEncoder, OpenAIEncoder, - BM25Encoder + BM25Encoder, ) from semantic_router.linear import similarity_matrix, top_scores from semantic_router.schema import Decision @@ -114,10 +114,7 @@ class HybridDecisionLayer: score_threshold = 0.82 def __init__( - self, - encoder: BaseEncoder, - decisions: list[Decision] = [], - alpha: float = 0.3 + self, encoder: BaseEncoder, decisions: list[Decision] = [], alpha: float = 0.3 ): self.encoder = encoder self.sparse_encoder = BM25Encoder() @@ -149,9 +146,7 @@ class HybridDecisionLayer: def _add_decision(self, decision: Decision): # create embeddings - dense_embeds = np.array( - self.encoder(decision.utterances) - ) # * self.alpha + dense_embeds = np.array(self.encoder(decision.utterances)) # * self.alpha sparse_embeds = np.array( self.sparse_encoder(decision.utterances) ) # * (1 - self.alpha) @@ -163,10 +158,9 @@ class HybridDecisionLayer: else: str_arr = np.array([decision.name] * len(decision.utterances)) self.categories = np.concatenate([self.categories, str_arr]) - self.utterances = np.concatenate([ - self.utterances, - np.array(decision.utterances) - ]) + self.utterances = np.concatenate( + [self.utterances, np.array(decision.utterances)] + ) # create utterance array (the dense index) if self.index is None: self.index = dense_embeds @@ -176,9 +170,7 @@ class HybridDecisionLayer: if self.sparse_index is None: self.sparse_index = sparse_embeds else: - self.sparse_index = np.concatenate([ - self.sparse_index, sparse_embeds - ]) + self.sparse_index = np.concatenate([self.sparse_index, sparse_embeds]) def _query(self, text: str, top_k: int = 5): """Given some text, encodes and searches the index vector space to @@ -202,7 +194,7 @@ class HybridDecisionLayer: sparse_norm = norm(self.sparse_index, axis=1) xq_s_norm = norm(xq_s.T) sim_s = np.dot(self.sparse_index, xq_s.T) / (sparse_norm * xq_s_norm) - total_sim = (sim_d + sim_s) + total_sim = sim_d + sim_s # get indices of top_k records top_k = min(top_k, total_sim.shape[0]) idx = np.argpartition(total_sim, -top_k)[-top_k:] @@ -214,7 +206,7 @@ class HybridDecisionLayer: ] else: return [] - + def _convex_scaling(self, dense: list[float], sparse: list[float]): # scale sparse and dense vecs dense = np.array(dense) * self.alpha @@ -244,4 +236,4 @@ class HybridDecisionLayer: if scores: return max(scores) > threshold else: - return False \ No newline at end of file + return False diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py index e79608b8..7e6e8ad6 100644 --- a/semantic_router/rankers/cohere.py +++ b/semantic_router/rankers/cohere.py @@ -9,9 +9,10 @@ class CohereRanker(BaseRanker): client: cohere.Client | None def __init__( - self, name: str = "rerank-english-v2.0", + self, + name: str = "rerank-english-v2.0", top_n: int = 5, - cohere_api_key: str | None = None + cohere_api_key: str | None = None, ): super().__init__(name=name, top_n=top_n) cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY") @@ -22,10 +23,9 @@ class CohereRanker(BaseRanker): def __call__(self, query: str, docs: list[str]) -> list[str]: # get top_n results results = self.client.rerank( - query=query, documents=docs, top_n=self.top_n, - model=self.name + query=query, documents=docs, top_n=self.top_n, model=self.name ) # get indices of entries that are ranked highest by cohere top_idx = [r.index for r in results] top_docs = [docs[i] for i in top_idx] - return top_idx, top_docs \ No newline at end of file + return top_idx, top_docs -- GitLab