Skip to content
Snippets Groups Projects
Unverified Commit e80eaa40 authored by James Briggs's avatar James Briggs
Browse files

lint and del tfidf

parent 7ca32aab
No related branches found
No related tags found
No related merge requests found
from .layer import DecisionLayer, HybridDecisionLayer from .layer import DecisionLayer, HybridDecisionLayer
__all__ = ["DecisionLayer", "HybridDecisionLayer"] __all__ = ["DecisionLayer", "HybridDecisionLayer"]
\ No newline at end of file
...@@ -12,7 +12,8 @@ class BM25Encoder(BaseEncoder): ...@@ -12,7 +12,8 @@ class BM25Encoder(BaseEncoder):
# initialize BM25 encoder with default params (trained on MSMarco) # initialize BM25 encoder with default params (trained on MSMarco)
self.model = encoder.default() self.model = encoder.default()
self.idx_mapping = { self.idx_mapping = {
idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) idx: i
for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"])
} }
def __call__(self, docs: list[str]) -> list[list[float]]: def __call__(self, docs: list[str]) -> list[list[float]]:
......
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer
from semantic_router.encoders import BaseEncoder
class TfidfEncoder(BaseEncoder):
    """Sparse text encoder producing fixed-length sparse-expanded vectors.

    NOTE(review): despite the class name, this is a copy of the BM25
    encoder — the default ``name`` is ``"bm25"``, the model comes from
    ``encoder.default()`` (trained on MSMarco), and the TF-IDF imports at
    the top of the file are never used. Confirm which model this class is
    actually meant to wrap.
    """

    # NOTE(review): `encoder` is not imported in this module — both this
    # annotation and `encoder.default()` below will raise NameError unless
    # the name is provided elsewhere. TODO: confirm the intended type.
    model: encoder | None = None

    def __init__(self, name: str = "bm25"):
        """Initialize the default sparse model and its index mapping.

        :param name: encoder identifier forwarded to BaseEncoder
            (defaults to "bm25" — see the class-level note).
        """
        super().__init__(name=name)
        # initialize encoder with default params (trained on MSMarco)
        self.model = encoder.default()
        # Map the model's arbitrary sparse vocabulary indices onto a dense
        # 0..V-1 range so each document can be expanded into a plain
        # fixed-length list of floats.
        self.idx_mapping = {
            idx: i
            for i, idx in enumerate(
                self.model.get_params()["doc_freq"]["indices"]
            )
        }

    def __call__(self, docs: list[str]) -> list[list[float]]:
        """Encode documents into one fixed-length sparse vector each.

        :param docs: texts to encode; must be non-empty.
        :return: one vector of length ``len(self.idx_mapping)`` per doc.
        :raises ValueError: if ``docs`` is empty.
        """
        # Guard clause: reject empty input up front (same ValueError as the
        # original's trailing else-branch).
        if not docs:
            raise ValueError("No documents to encode.")
        if len(docs) == 1:
            sparse = self.model.encode_query(docs[0])
            # NOTE(review): normalize to a list of {"indices", "values"}
            # dicts — the original iterated encode_query's result directly,
            # which only works if it already matches encode_documents'
            # shape; confirm against the model's API.
            sparse_dicts = sparse if isinstance(sparse, list) else [sparse]
        else:
            sparse_dicts = self.model.encode_documents(docs)
        # Fix: the original allocated ONE shared vector and let every
        # document overwrite it, returning a flat list[float] that merged
        # all documents together. Build a separate vector per document so
        # the return value matches the declared list[list[float]].
        embeds = []
        for output in sparse_dicts:
            vec = [0.0] * len(self.idx_mapping)
            for idx, val in zip(output["indices"], output["values"]):
                vec[self.idx_mapping[idx]] = val
            embeds.append(vec)
        return embeds

    def fit(self, docs: list[str]):
        """Fit the underlying sparse model on ``docs``."""
        self.model.fit(docs)
...@@ -6,7 +6,7 @@ from semantic_router.encoders import ( ...@@ -6,7 +6,7 @@ from semantic_router.encoders import (
BaseEncoder, BaseEncoder,
CohereEncoder, CohereEncoder,
OpenAIEncoder, OpenAIEncoder,
BM25Encoder BM25Encoder,
) )
from semantic_router.linear import similarity_matrix, top_scores from semantic_router.linear import similarity_matrix, top_scores
from semantic_router.schema import Decision from semantic_router.schema import Decision
...@@ -114,10 +114,7 @@ class HybridDecisionLayer: ...@@ -114,10 +114,7 @@ class HybridDecisionLayer:
score_threshold = 0.82 score_threshold = 0.82
def __init__( def __init__(
self, self, encoder: BaseEncoder, decisions: list[Decision] = [], alpha: float = 0.3
encoder: BaseEncoder,
decisions: list[Decision] = [],
alpha: float = 0.3
): ):
self.encoder = encoder self.encoder = encoder
self.sparse_encoder = BM25Encoder() self.sparse_encoder = BM25Encoder()
...@@ -149,9 +146,7 @@ class HybridDecisionLayer: ...@@ -149,9 +146,7 @@ class HybridDecisionLayer:
def _add_decision(self, decision: Decision): def _add_decision(self, decision: Decision):
# create embeddings # create embeddings
dense_embeds = np.array( dense_embeds = np.array(self.encoder(decision.utterances)) # * self.alpha
self.encoder(decision.utterances)
) # * self.alpha
sparse_embeds = np.array( sparse_embeds = np.array(
self.sparse_encoder(decision.utterances) self.sparse_encoder(decision.utterances)
) # * (1 - self.alpha) ) # * (1 - self.alpha)
...@@ -163,10 +158,9 @@ class HybridDecisionLayer: ...@@ -163,10 +158,9 @@ class HybridDecisionLayer:
else: else:
str_arr = np.array([decision.name] * len(decision.utterances)) str_arr = np.array([decision.name] * len(decision.utterances))
self.categories = np.concatenate([self.categories, str_arr]) self.categories = np.concatenate([self.categories, str_arr])
self.utterances = np.concatenate([ self.utterances = np.concatenate(
self.utterances, [self.utterances, np.array(decision.utterances)]
np.array(decision.utterances) )
])
# create utterance array (the dense index) # create utterance array (the dense index)
if self.index is None: if self.index is None:
self.index = dense_embeds self.index = dense_embeds
...@@ -176,9 +170,7 @@ class HybridDecisionLayer: ...@@ -176,9 +170,7 @@ class HybridDecisionLayer:
if self.sparse_index is None: if self.sparse_index is None:
self.sparse_index = sparse_embeds self.sparse_index = sparse_embeds
else: else:
self.sparse_index = np.concatenate([ self.sparse_index = np.concatenate([self.sparse_index, sparse_embeds])
self.sparse_index, sparse_embeds
])
def _query(self, text: str, top_k: int = 5): def _query(self, text: str, top_k: int = 5):
"""Given some text, encodes and searches the index vector space to """Given some text, encodes and searches the index vector space to
...@@ -202,7 +194,7 @@ class HybridDecisionLayer: ...@@ -202,7 +194,7 @@ class HybridDecisionLayer:
sparse_norm = norm(self.sparse_index, axis=1) sparse_norm = norm(self.sparse_index, axis=1)
xq_s_norm = norm(xq_s.T) xq_s_norm = norm(xq_s.T)
sim_s = np.dot(self.sparse_index, xq_s.T) / (sparse_norm * xq_s_norm) sim_s = np.dot(self.sparse_index, xq_s.T) / (sparse_norm * xq_s_norm)
total_sim = (sim_d + sim_s) total_sim = sim_d + sim_s
# get indices of top_k records # get indices of top_k records
top_k = min(top_k, total_sim.shape[0]) top_k = min(top_k, total_sim.shape[0])
idx = np.argpartition(total_sim, -top_k)[-top_k:] idx = np.argpartition(total_sim, -top_k)[-top_k:]
...@@ -214,7 +206,7 @@ class HybridDecisionLayer: ...@@ -214,7 +206,7 @@ class HybridDecisionLayer:
] ]
else: else:
return [] return []
def _convex_scaling(self, dense: list[float], sparse: list[float]): def _convex_scaling(self, dense: list[float], sparse: list[float]):
# scale sparse and dense vecs # scale sparse and dense vecs
dense = np.array(dense) * self.alpha dense = np.array(dense) * self.alpha
...@@ -244,4 +236,4 @@ class HybridDecisionLayer: ...@@ -244,4 +236,4 @@ class HybridDecisionLayer:
if scores: if scores:
return max(scores) > threshold return max(scores) > threshold
else: else:
return False return False
\ No newline at end of file
...@@ -9,9 +9,10 @@ class CohereRanker(BaseRanker): ...@@ -9,9 +9,10 @@ class CohereRanker(BaseRanker):
client: cohere.Client | None client: cohere.Client | None
def __init__( def __init__(
self, name: str = "rerank-english-v2.0", self,
name: str = "rerank-english-v2.0",
top_n: int = 5, top_n: int = 5,
cohere_api_key: str | None = None cohere_api_key: str | None = None,
): ):
super().__init__(name=name, top_n=top_n) super().__init__(name=name, top_n=top_n)
cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY") cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY")
...@@ -22,10 +23,9 @@ class CohereRanker(BaseRanker): ...@@ -22,10 +23,9 @@ class CohereRanker(BaseRanker):
def __call__(self, query: str, docs: list[str]) -> list[str]: def __call__(self, query: str, docs: list[str]) -> list[str]:
# get top_n results # get top_n results
results = self.client.rerank( results = self.client.rerank(
query=query, documents=docs, top_n=self.top_n, query=query, documents=docs, top_n=self.top_n, model=self.name
model=self.name
) )
# get indices of entries that are ranked highest by cohere # get indices of entries that are ranked highest by cohere
top_idx = [r.index for r in results] top_idx = [r.index for r in results]
top_docs = [docs[i] for i in top_idx] top_docs = [docs[i] for i in top_idx]
return top_idx, top_docs return top_idx, top_docs
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment