From e80eaa40b28c11b71bf79cc164e0ccaf3edb74c7 Mon Sep 17 00:00:00 2001
From: James Briggs <35938317+jamescalam@users.noreply.github.com>
Date: Sat, 2 Dec 2023 12:42:12 -0800
Subject: [PATCH] lint and del tfidf

---
 semantic_router/__init__.py       |  2 +-
 semantic_router/encoders/bm25.py  |  3 ++-
 semantic_router/encoders/tfidf.py | 37 -------------------------------
 semantic_router/layer.py          | 28 +++++++++--------------
 semantic_router/rankers/cohere.py | 10 ++++-----
 5 files changed, 18 insertions(+), 62 deletions(-)
 delete mode 100644 semantic_router/encoders/tfidf.py

diff --git a/semantic_router/__init__.py b/semantic_router/__init__.py
index ac1e314e..734906f8 100644
--- a/semantic_router/__init__.py
+++ b/semantic_router/__init__.py
@@ -1,3 +1,3 @@
 from .layer import DecisionLayer, HybridDecisionLayer
 
-__all__ = ["DecisionLayer", "HybridDecisionLayer"]
\ No newline at end of file
+__all__ = ["DecisionLayer", "HybridDecisionLayer"]
diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py
index 0f3985be..0d498197 100644
--- a/semantic_router/encoders/bm25.py
+++ b/semantic_router/encoders/bm25.py
@@ -12,7 +12,8 @@ class BM25Encoder(BaseEncoder):
         # initialize BM25 encoder with default params (trained on MSMarco)
         self.model = encoder.default()
         self.idx_mapping = {
-            idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"])
+            idx: i
+            for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"])
         }
 
     def __call__(self, docs: list[str]) -> list[list[float]]:
diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py
deleted file mode 100644
index 5dc7f34d..00000000
--- a/semantic_router/encoders/tfidf.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from functools import partial
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-from semantic_router.encoders import BaseEncoder
-
-
-class TfidfEncoder(BaseEncoder):
-    model: encoder | None = None
-
-    def __init__(self, name: str = "bm25"):
-        super().__init__(name=name)
-        # initialize BM25 encoder with default params (trained on MSMarco)
-        self.model = encoder.default()
-        self.idx_mapping = {
-            idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"])
-        }
-
-    def __call__(self, docs: list[str]) -> list[list[float]]:
-        if len(docs) == 1:
-            sparse_dicts = self.model.encode_query(docs[0])
-        elif len(docs) > 1:
-            sparse_dicts = self.model.encode_documents(docs)
-        else:
-            raise ValueError("No documents to encode.")
-        # convert sparse dict to sparse vector
-        embeds = [0.0] * len(self.idx_mapping)
-        for output in sparse_dicts:
-            indices = output["indices"]
-            values = output["values"]
-            for idx, val in zip(indices, values):
-                position = self.idx_mapping[idx]
-                embeds[position] = val
-        return embeds
-
-    def fit(self, docs: list[str]):
-        self.model.fit(docs)
diff --git a/semantic_router/layer.py b/semantic_router/layer.py
index 0c00e916..adff961c 100644
--- a/semantic_router/layer.py
+++ b/semantic_router/layer.py
@@ -6,7 +6,7 @@ from semantic_router.encoders import (
     BaseEncoder,
     CohereEncoder,
     OpenAIEncoder,
-    BM25Encoder
+    BM25Encoder,
 )
 from semantic_router.linear import similarity_matrix, top_scores
 from semantic_router.schema import Decision
@@ -114,10 +114,7 @@ class HybridDecisionLayer:
     score_threshold = 0.82
 
     def __init__(
-        self,
-        encoder: BaseEncoder,
-        decisions: list[Decision] = [],
-        alpha: float = 0.3
+        self, encoder: BaseEncoder, decisions: list[Decision] = [], alpha: float = 0.3
     ):
         self.encoder = encoder
         self.sparse_encoder = BM25Encoder()
@@ -149,9 +146,7 @@ class HybridDecisionLayer:
 
     def _add_decision(self, decision: Decision):
         # create embeddings
-        dense_embeds = np.array(
-            self.encoder(decision.utterances)
-        )  # * self.alpha
+        dense_embeds = np.array(self.encoder(decision.utterances))  # * self.alpha
         sparse_embeds = np.array(
             self.sparse_encoder(decision.utterances)
         )  # * (1 - self.alpha)
@@ -163,10 +158,9 @@ class HybridDecisionLayer:
         else:
             str_arr = np.array([decision.name] * len(decision.utterances))
             self.categories = np.concatenate([self.categories, str_arr])
-            self.utterances = np.concatenate([
-                self.utterances,
-                np.array(decision.utterances)
-            ])
+            self.utterances = np.concatenate(
+                [self.utterances, np.array(decision.utterances)]
+            )
         # create utterance array (the dense index)
         if self.index is None:
             self.index = dense_embeds
@@ -176,9 +170,7 @@ class HybridDecisionLayer:
         if self.sparse_index is None:
             self.sparse_index = sparse_embeds
         else:
-            self.sparse_index = np.concatenate([
-                self.sparse_index, sparse_embeds
-            ])
+            self.sparse_index = np.concatenate([self.sparse_index, sparse_embeds])
 
     def _query(self, text: str, top_k: int = 5):
         """Given some text, encodes and searches the index vector space to
@@ -202,7 +194,7 @@ class HybridDecisionLayer:
             sparse_norm = norm(self.sparse_index, axis=1)
             xq_s_norm = norm(xq_s.T)
             sim_s = np.dot(self.sparse_index, xq_s.T) / (sparse_norm * xq_s_norm)
-            total_sim = (sim_d + sim_s)
+            total_sim = sim_d + sim_s
             # get indices of top_k records
             top_k = min(top_k, total_sim.shape[0])
             idx = np.argpartition(total_sim, -top_k)[-top_k:]
@@ -214,7 +206,7 @@ class HybridDecisionLayer:
             ]
         else:
             return []
-        
+
     def _convex_scaling(self, dense: list[float], sparse: list[float]):
         # scale sparse and dense vecs
         dense = np.array(dense) * self.alpha
@@ -244,4 +236,4 @@ class HybridDecisionLayer:
         if scores:
             return max(scores) > threshold
         else:
-            return False
\ No newline at end of file
+            return False
diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py
index e79608b8..7e6e8ad6 100644
--- a/semantic_router/rankers/cohere.py
+++ b/semantic_router/rankers/cohere.py
@@ -9,9 +9,10 @@ class CohereRanker(BaseRanker):
     client: cohere.Client | None
 
     def __init__(
-        self, name: str = "rerank-english-v2.0",
+        self,
+        name: str = "rerank-english-v2.0",
         top_n: int = 5,
-        cohere_api_key: str | None = None
+        cohere_api_key: str | None = None,
     ):
         super().__init__(name=name, top_n=top_n)
         cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY")
@@ -22,10 +23,9 @@ class CohereRanker(BaseRanker):
     def __call__(self, query: str, docs: list[str]) -> list[str]:
         # get top_n results
         results = self.client.rerank(
-            query=query, documents=docs, top_n=self.top_n,
-            model=self.name
+            query=query, documents=docs, top_n=self.top_n, model=self.name
         )
         # get indices of entries that are ranked highest by cohere
         top_idx = [r.index for r in results]
         top_docs = [docs[i] for i in top_idx]
-        return top_idx, top_docs
\ No newline at end of file
+        return top_idx, top_docs
-- 
GitLab