diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py
index a7ac9136c08fb1717dfd1615f5b30c3ad263b767..b865c17098c108b17c8f056772a96dcaebbf076e 100644
--- a/semantic_router/encoders/tfidf.py
+++ b/semantic_router/encoders/tfidf.py
@@ -3,8 +3,6 @@ from collections import Counter
 from typing import Dict, List
 
 import numpy as np
-from numpy import ndarray
-from numpy.linalg import norm
 
 from semantic_router.encoders import SparseEncoder
 from semantic_router.route import Route
@@ -12,7 +10,8 @@ from semantic_router.schema import SparseEmbedding
 
 
 class TfidfEncoder(SparseEncoder):
-    idf: ndarray = np.array([])
+    idf: np.ndarray = np.array([])
+    # TODO: add option to use default params like with BM25Encoder
     word_index: Dict = {}
 
     def __init__(self, name: str | None = None):
@@ -39,14 +38,16 @@ class TfidfEncoder(SparseEncoder):
             for doc in route.utterances:
                 docs.append(self._preprocess(doc))  # type: ignore
         self.word_index = self._build_word_index(docs)
+        if len(self.word_index) == 0:
+            raise ValueError(f"Too little data to fit {self.__class__.__name__}.")
         self.idf = self._compute_idf(docs)
 
     def _build_word_index(self, docs: List[str]) -> Dict:
         words = set()
         for doc in docs:
             for word in doc.split():
                 words.add(word)
         word_index = {word: i for i, word in enumerate(words)}
         return word_index
 
     def _compute_tf(self, docs: List[str]) -> np.ndarray:
@@ -59,7 +60,7 @@ class TfidfEncoder(SparseEncoder):
             if word in self.word_index:
                 tf[i, self.word_index[word]] = count
         # L2 normalization
-        tf = tf / norm(tf, axis=1, keepdims=True)
+        tf = tf / np.linalg.norm(tf, axis=1, keepdims=True)
         return tf
 
     def _compute_idf(self, docs: List[str]) -> np.ndarray:
diff --git a/semantic_router/index/hybrid_local.py b/semantic_router/index/hybrid_local.py
index e2a75778bb97848ae1ee22ab0e958b5ff4c54469..2a5a43d5f023a6e9665b6082be104b731e8680f1 100644
--- a/semantic_router/index/hybrid_local.py
+++ b/semantic_router/index/hybrid_local.py
@@ -29,9 +29,9 @@ class HybridLocalIndex(LocalIndex):
         if sparse_embeddings is None:
             raise ValueError("Sparse embeddings are required for HybridLocalIndex.")
         if function_schemas is not None:
-            raise ValueError("Function schemas are not supported for HybridLocalIndex.")
+            logger.warning("Function schemas are not supported for HybridLocalIndex.")
         if metadata_list:
-            raise ValueError("Metadata is not supported for HybridLocalIndex.")
+            logger.warning("Metadata is not supported for HybridLocalIndex.")
         embeds = np.array(embeddings)
         routes_arr = np.array(routes)
         if isinstance(utterances[0], str):
diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py
index 25df1c47dd4f567e51d6b07de4b4bd6b8eef3794..303b47157533fbe71a74231ce550a2485d1f8e30 100644
--- a/semantic_router/index/pinecone.py
+++ b/semantic_router/index/pinecone.py
@@ -250,7 +250,6 @@ class PineconeIndex(BaseIndex):
             function_schemas = [{}] * len(embeddings)
         if sparse_embeddings is None:
             sparse_embeddings = [{}] * len(embeddings)
-
         vectors_to_upsert = [
             PineconeRecord(
                 values=vector,
diff --git a/semantic_router/routers/base.py b/semantic_router/routers/base.py
index f21131c8140864f43f3bd5bb9e147acd41194b90..615d46999417852be8ba18ad6d5758f5fef6bdd4 100644
--- a/semantic_router/routers/base.py
+++ b/semantic_router/routers/base.py
@@ -718,40 +718,13 @@ class BaseRouter(BaseModel):
         else:
             raise ValueError(f"{type(encoder)} not supported for loading from config.")
 
-    def add(self, route: Route):
-        """Add a route to the local SemanticRouter and index.
+    def add(self, routes: List[Route] | Route):
+        """Add one or more routes to the local router and index.
 
-        :param route: The route to add.
-        :type route: Route
+        :param routes: The route(s) to add.
+        :type routes: List[Route] | Route
         """
-        current_local_hash = self._get_hash()
-        current_remote_hash = self.index._read_hash()
-        if current_remote_hash.value == "":
-            # if remote hash is empty, the index is to be initialized
-            current_remote_hash = current_local_hash
-        embedded_utterances = self.encoder(route.utterances)
-        self.index.add(
-            embeddings=embedded_utterances,
-            routes=[route.name] * len(route.utterances),
-            utterances=route.utterances,
-            function_schemas=(
-                route.function_schemas * len(route.utterances)
-                if route.function_schemas
-                else [{}] * len(route.utterances)
-            ),
-            metadata_list=[route.metadata if route.metadata else {}]
-            * len(route.utterances),
-        )
-
-        self.routes.append(route)
-        if current_local_hash.value == current_remote_hash.value:
-            self._write_hash()  # update current hash in index
-        else:
-            logger.warning(
-                "Local and remote route layers were not aligned. Remote hash "
-                "not updated. Use `SemanticRouter.get_utterance_diff()` to see "
-                "details."
-            )
+        raise NotImplementedError("This method must be implemented by subclasses.")
 
     def list_route_names(self) -> List[str]:
         return [route.name for route in self.routes]
@@ -854,43 +827,6 @@ class BaseRouter(BaseModel):
             route = route_mapping[route_name]
             self.routes.append(route)
 
-    def _add_routes(self, routes: List[Route]):
-        current_local_hash = self._get_hash()
-        current_remote_hash = self.index._read_hash()
-        if current_remote_hash.value == "":
-            # if remote hash is empty, the index is to be initialized
-            current_remote_hash = current_local_hash
-
-        if not routes:
-            logger.warning("No routes provided to add.")
-            return
-        # create embeddings for all routes
-        route_names, all_utterances, all_function_schemas, all_metadata = (
-            self._extract_routes_details(routes, include_metadata=True)
-        )
-        embedded_utterances = self.encoder(all_utterances)
-        try:
-            # Batch insertion into the index
-            self.index.add(
-                embeddings=embedded_utterances,
-                routes=route_names,
-                utterances=all_utterances,
-                function_schemas=all_function_schemas,
-                metadata_list=all_metadata,
-            )
-        except Exception as e:
-            logger.error(f"Failed to add routes to the index: {e}")
-            raise Exception("Indexing error occurred") from e
-
-        if current_local_hash.value == current_remote_hash.value:
-            self._write_hash()  # update current hash in index
-        else:
-            logger.warning(
-                "Local and remote route layers were not aligned. Remote hash "
-                f"not updated. Use `{self.__class__.__name__}.get_utterance_diff()` "
-                "to see details."
-            )
-
     def _get_hash(self) -> ConfigParameter:
         config = self.to_config()
         return config.get_hash()
diff --git a/semantic_router/routers/hybrid.py b/semantic_router/routers/hybrid.py
index f07429c3d518cbaa2bf0e34a75fd4b0ba254e19a..f74e3e6bea7d431bd3dd530859039dcc3c2c7e34 100644
--- a/semantic_router/routers/hybrid.py
+++ b/semantic_router/routers/hybrid.py
@@ -57,12 +57,53 @@ class HybridRouter(BaseRouter):
         # fit sparse encoder if needed
         if isinstance(self.sparse_encoder, TfidfEncoder) and hasattr(
             self.sparse_encoder, "fit"
-        ):
+        ) and self.routes:
            self.sparse_encoder.fit(self.routes)
         # run initialize index now if auto sync is active
         if self.auto_sync:
             self._init_index_state()
 
+    def add(self, routes: List[Route] | Route):
+        """Add one or more routes to the local HybridRouter and index.
+
+        :param routes: The route(s) to add.
+        :type routes: List[Route] | Route
+        """
+        # TODO: merge into single method within BaseRouter
+        current_local_hash = self._get_hash()
+        current_remote_hash = self.index._read_hash()
+        if current_remote_hash.value == "":
+            # if remote hash is empty, the index is to be initialized
+            current_remote_hash = current_local_hash
+        if isinstance(routes, Route):
+            routes = [routes]
+        # create embeddings for all routes
+        route_names, all_utterances, all_function_schemas, all_metadata = (
+            self._extract_routes_details(routes, include_metadata=True)
+        )
+        # TODO: to merge, self._encode should probably output a special
+        # TODO Embedding type that can be either dense or hybrid
+        dense_emb, sparse_emb = self._encode(all_utterances)
+        logger.debug(f"{sparse_emb=}")
+        self.index.add(
+            embeddings=dense_emb.tolist(),
+            routes=route_names,
+            utterances=all_utterances,
+            function_schemas=all_function_schemas,
+            metadata_list=all_metadata,
+            sparse_embeddings=sparse_emb,  # type: ignore
+        )
+
+        self.routes.extend(routes)
+        if current_local_hash.value == current_remote_hash.value:
+            self._write_hash()  # update current hash in index
+        else:
+            logger.warning(
+                "Local and remote route layers were not aligned. Remote hash "
+                f"not updated. Use `{self.__class__.__name__}.get_utterance_diff()` "
+                "to see details."
+            )
+
     def _get_index(self, index: Optional[BaseIndex]) -> BaseIndex:
         if index is None:
             logger.warning("No index provided. Using default HybridLocalIndex.")
@@ -93,6 +134,8 @@ class HybridRouter(BaseRouter):
         xq_s = self.sparse_encoder(text)
         # xq_s = np.squeeze(xq_s)
         # convex scaling
+        logger.debug(f"{self.sparse_encoder.__class__.__name__=}")
+        logger.debug(f"_encode: {xq_d.shape=}, {xq_s=}")
         xq_d, xq_s = self._convex_scaling(dense=xq_d, sparse=xq_s)
         return xq_d, xq_s
 
@@ -113,6 +156,7 @@ class HybridRouter(BaseRouter):
         # create dense query vector
         xq_d = np.array(dense_vec)
         # convex scaling
+        logger.debug(f"_async_encode: {xq_d.shape=}, {xq_s=}")
         xq_d, xq_s = self._convex_scaling(dense=xq_d, sparse=xq_s)
         return xq_d, xq_s
 
@@ -139,7 +183,7 @@ class HybridRouter(BaseRouter):
         )
         if sparse_vector is None:
             raise ValueError("Sparse vector is required for HybridLocalIndex.")
-        vector_arr = vector_arr if vector_arr else np.array(vector)
+        vector_arr = vector_arr if vector_arr is not None else np.array(vector)
         # TODO: add alpha as a parameter
         scores, route_names = self.index.query(
             vector=vector_arr,
diff --git a/semantic_router/routers/semantic.py b/semantic_router/routers/semantic.py
index 64ccbaf2d24c35bed9e7bb2c003c495d1fe59e18..94c3e179461a236ae737e6cc315f3956c1918ade 100644
--- a/semantic_router/routers/semantic.py
+++ b/semantic_router/routers/semantic.py
@@ -5,6 +5,7 @@ import numpy as np
 from semantic_router.encoders import DenseEncoder
 from semantic_router.index.base import BaseIndex
 from semantic_router.llms import BaseLLM
+from semantic_router.utils.logger import logger
 from semantic_router.route import Route
 from semantic_router.routers.base import BaseRouter
 
@@ -45,3 +46,39 @@ class SemanticRouter(BaseRouter):
         xq = np.array(await self.encoder.acall(docs=text))
         xq = np.squeeze(xq)  # Reduce to 1d array.
         return xq
+
+    def add(self, routes: List[Route] | Route):
+        """Add one or more routes to the local SemanticRouter and index.
+
+        :param routes: The route(s) to add.
+        :type routes: List[Route] | Route
+        """
+        current_local_hash = self._get_hash()
+        current_remote_hash = self.index._read_hash()
+        if current_remote_hash.value == "":
+            # if remote hash is empty, the index is to be initialized
+            current_remote_hash = current_local_hash
+        if isinstance(routes, Route):
+            routes = [routes]
+        # create embeddings for all routes
+        route_names, all_utterances, all_function_schemas, all_metadata = (
+            self._extract_routes_details(routes, include_metadata=True)
+        )
+        dense_emb = self._encode(all_utterances)
+        self.index.add(
+            embeddings=dense_emb.tolist(),
+            routes=route_names,
+            utterances=all_utterances,
+            function_schemas=all_function_schemas,
+            metadata_list=all_metadata,
+        )
+
+        self.routes.extend(routes)
+        if current_local_hash.value == current_remote_hash.value:
+            self._write_hash()  # update current hash in index
+        else:
+            logger.warning(
+                "Local and remote route layers were not aligned. Remote hash "
+                f"not updated. Use `{self.__class__.__name__}.get_utterance_diff()` "
+                "to see details."
+            )
\ No newline at end of file
diff --git a/tests/unit/test_hybrid_layer.py b/tests/unit/test_hybrid_layer.py
index f3fbe6dab8e2dc2aa577850ad128cb543167c920..aadad86aef5ab82dd4b07f68243517ed7d15eff9 100644
--- a/tests/unit/test_hybrid_layer.py
+++ b/tests/unit/test_hybrid_layer.py
@@ -54,32 +54,42 @@ def azure_encoder(mocker):
         model="test_model",
     )
 
-
-def bm25_encoder(mocker):
-    mocker.patch.object(BM25Encoder, "__call__", side_effect=mock_encoder_call)
+@pytest.fixture
+def bm25_encoder():
+    # mocker.patch.object(BM25Encoder, "__call__", side_effect=mock_encoder_call)
     return BM25Encoder(name="test-bm25-encoder")
 
 
 @pytest.fixture
-def tfidf_encoder(mocker):
-    mocker.patch.object(TfidfEncoder, "__call__", side_effect=mock_encoder_call)
+def tfidf_encoder():
+    # mocker.patch.object(TfidfEncoder, "__call__", side_effect=mock_encoder_call)
     return TfidfEncoder(name="test-tfidf-encoder")
 
 
 @pytest.fixture
 def routes():
     return [
-        Route(name="Route 1", utterances=["Hello", "Hi"]),
-        Route(name="Route 2", utterances=["Goodbye", "Bye", "Au revoir"]),
+        Route(name="Route 1", utterances=[
+            "Hello we need this text to be a little longer for our sparse encoders",
+            "In this case they need to learn from recurring tokens, ie words."
+        ]),
+        Route(name="Route 2", utterances=[
+            "We give ourselves several examples from our encoders to learn from.",
+            "But given this is only an example we don't need too many",
+            "Just enough to test that our sparse encoders work as expected"
+        ]),
     ]
 
 
-sparse_encoder = BM25Encoder(use_default_params=False)
+sparse_encoder = TfidfEncoder()
 sparse_encoder.fit(
     [
         Route(
             name="Route 1",
-            utterances=["The quick brown fox", "jumps over the lazy dog"],
+            utterances=[
+                "The quick brown fox jumps over the lazy dog",
+                "some other useful text containing words like fox and dog"
+            ],
         ),
         Route(name="Route 2", utterances=["Hello, world!"]),
     ]
@@ -95,13 +105,13 @@ class TestHybridRouter:
             top_k=10,
             alpha=0.8,
         )
-        assert route_layer.index is not None and route_layer.categories is not None
+        assert route_layer.index is not None and route_layer.routes is not None
         assert openai_encoder.score_threshold == 0.3
         assert route_layer.score_threshold == 0.3
         assert route_layer.top_k == 10
         assert route_layer.alpha == 0.8
-        assert len(route_layer.index) == 5
-        assert len(set(route_layer.categories)) == 2
+        assert route_layer.index.route_names is None
+        assert len(route_layer.routes) == 2
 
     def test_initialization_different_encoders(self, cohere_encoder, openai_encoder):
         route_layer_cohere = HybridRouter(
@@ -114,25 +124,23 @@ class TestHybridRouter:
         )
         assert route_layer_openai.score_threshold == 0.3
 
-    def test_add_route(self, openai_encoder):
+    def test_add_route(self, openai_encoder, routes):
         route_layer = HybridRouter(
             encoder=openai_encoder, sparse_encoder=sparse_encoder
         )
-        route = Route(name="Route 3", utterances=["Yes", "No"])
-        route_layer._add_routes([route])
-        assert route_layer.index is not None and route_layer.categories is not None
-        assert len(route_layer.index) == 2
-        assert len(set(route_layer.categories)) == 1
+        route_layer.add(routes=routes[0])
+        assert route_layer.index is not None, "route_layer.index is None"
+        assert route_layer.routes is not None, "route_layer.routes is None"
+        assert len(route_layer.routes) == 1, "route_layer.routes is not 1"
 
     def test_add_multiple_routes(self, openai_encoder, routes):
         route_layer = HybridRouter(
             encoder=openai_encoder, sparse_encoder=sparse_encoder
         )
-        for route in routes:
-            route_layer.add(route)
-        assert route_layer.index is not None and route_layer.categories is not None
-        assert len(route_layer.index) == 5
-        assert len(set(route_layer.categories)) == 2
+        route_layer.add(routes=routes)
+        assert route_layer.index is not None, "route_layer.index is None"
+        assert route_layer.routes is not None, "route_layer.routes is None"
+        assert len(route_layer.routes) == 2, "route_layer.routes is not 2"
 
     def test_query_and_classification(self, openai_encoder, routes):
         route_layer = HybridRouter(
@@ -145,6 +153,14 @@ class TestHybridRouter:
         route_layer = HybridRouter(
             encoder=openai_encoder, sparse_encoder=sparse_encoder
         )
+        assert isinstance(
+            route_layer.sparse_encoder,
+            (BM25Encoder, TfidfEncoder),
+        ), (
+            "route_layer.sparse_encoder is "
+            f"{route_layer.sparse_encoder.__class__.__name__} "
+            "not BM25Encoder or TfidfEncoder"
+        )
         assert route_layer("Anything") is None
 
     def test_semantic_classify(self, openai_encoder, routes):
@@ -192,12 +208,14 @@ class TestHybridRouter:
             sparse_encoder=tfidf_encoder,
             routes=routes[:-1],
         )
-        hybrid_route_layer.add(routes[-1])
+        hybrid_route_layer.add(routes=routes[-1])
         all_utterances = [
             utterance for route in routes for utterance in route.utterances
         ]
-        assert hybrid_route_layer.sparse_index is not None
-        assert len(hybrid_route_layer.sparse_index) == len(all_utterances)
+        assert hybrid_route_layer.index.sparse_index is not None, "sparse_index is None"
+        assert len(hybrid_route_layer.index.sparse_index) == len(
+            all_utterances
+        ), "sparse_index length mismatch"
 
     def test_setting_aggregation_methods(self, openai_encoder, routes):
         for agg in ["sum", "mean", "max"]:
diff --git a/tests/unit/test_router.py b/tests/unit/test_router.py
index 62b49fcd6b1888d720c309ae96291fb1b3781f6a..ef36e0ab4b4180b3ed5e5746d984691efd2b5150 100644
--- a/tests/unit/test_router.py
+++ b/tests/unit/test_router.py
@@ -281,7 +281,7 @@ class TestSemanticRouter:
         assert route_layer.index.get_utterances() == []
 
         # Add route1 and check
-        route_layer.add(route=routes[0])
+        route_layer.add(routes=routes[0])
         if index_cls is PineconeIndex:
             time.sleep(PINECONE_SLEEP)  # allow for index to be populated
         assert route_layer.routes == [routes[0]]
@@ -289,7 +289,7 @@ class TestSemanticRouter:
         assert len(route_layer.index.get_utterances()) == 2
 
         # Add route2 and check
-        route_layer.add(route=routes[1])
+        route_layer.add(routes=routes[1])
         if index_cls is PineconeIndex:
             time.sleep(PINECONE_SLEEP)  # allow for index to be populated
         assert route_layer.routes == [routes[0], routes[1]]
@@ -354,7 +354,7 @@ class TestSemanticRouter:
         )
         if index_cls is PineconeIndex:
             time.sleep(PINECONE_SLEEP)
-        route_layer._add_routes(routes=routes)
+        route_layer.add(routes=routes)
         if index_cls is PineconeIndex:
             time.sleep(PINECONE_SLEEP)  # allow for index to be populated
         assert route_layer.index is not None
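
Note on the TfidfEncoder changes: fit() now refuses to build an empty vocabulary, and _compute_tf L2-normalises each document row before it is scaled by the idf vector. A standalone sketch of that computation in plain numpy (not library code; the exact idf formula lives in _compute_idf, which this diff does not show, so the log form below is an assumption):

import numpy as np

docs = ["hello world", "hello there world", "goodbye world"]
vocab = {w: i for i, w in enumerate(sorted({w for d in docs for w in d.split()}))}

# term frequencies, one row per document, L2-normalised as in _compute_tf
tf = np.zeros((len(docs), len(vocab)))
for row, doc in enumerate(docs):
    for word in doc.split():
        tf[row, vocab[word]] += 1
tf = tf / np.linalg.norm(tf, axis=1, keepdims=True)

# inverse document frequency; assumed smoothing, the library's formula may differ
df = np.array([sum(word in d.split() for d in docs) for word in vocab])
idf = np.log(len(docs) / df) + 1

print(tf * idf)  # dense view of the tf-idf weights the encoder emits sparsely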
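The `is not None` change in the HybridRouter query path is a real bug fix, not a style tweak: truth-testing a multi-element numpy array raises, so the old `vector_arr if vector_arr else np.array(vector)` expression failed whenever a vector had already been computed. Minimal demonstration:

import numpy as np

vector_arr = np.array([0.1, 0.2, 0.3])
try:
    _ = vector_arr if vector_arr else None  # old pattern
except ValueError as err:
    print(err)  # "The truth value of an array with more than one element is ambiguous..."
_ = vector_arr if vector_arr is not None else None  # new pattern, no exception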
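With BaseRouter.add now raising NotImplementedError, adding routes goes through the concrete routers. Both SemanticRouter.add and HybridRouter.add accept either a single Route or a list under the `routes` keyword, embed all utterances in one batch, and only write the config hash when the local and remote layers were already aligned. Illustrative usage (the router construction is commented out because it needs real encoder credentials; the route contents and `dense_encoder` name are placeholders, not from this diff):

from semantic_router.route import Route

route_a = Route(name="greeting", utterances=["hi there", "hello"])
route_b = Route(name="farewell", utterances=["bye", "see you later"])

# router = HybridRouter(encoder=dense_encoder, sparse_encoder=TfidfEncoder())
# router.add(routes=route_a)             # a single Route is wrapped into a list
# router.add(routes=[route_a, route_b])  # a list is embedded and upserted in one batch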
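_encode and _async_encode both pass the dense and sparse query vectors through _convex_scaling before the index query, and the tests above construct the router with alpha=0.8. The usual convex-combination weighting looks like the sketch below; it simplifies both sides to dense arrays, whereas the library's sparse side is a SparseEmbedding, so treat it as an assumption rather than the implementation:

import numpy as np

def convex_scale(dense: np.ndarray, sparse: np.ndarray, alpha: float = 0.8):
    # alpha weights the dense contribution, (1 - alpha) the sparse one
    return alpha * dense, (1 - alpha) * sparse

dense_q, sparse_q = convex_scale(np.array([0.3, 0.9]), np.array([0.0, 2.0]))
print(dense_q, sparse_q)  # approximately [0.24 0.72] and [0.  0.4]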
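test_setting_aggregation_methods loops over the "sum", "mean" and "max" options; per route these presumably reduce the scores of that route's matched utterances to a single route score, along these lines (a sketch of the idea, not the library's internals):

import numpy as np

matched_scores = [0.2, 0.7, 0.4]  # hypothetical similarity scores for one route
print({agg: float(getattr(np, agg)(matched_scores)) for agg in ("sum", "mean", "max")})
# {'sum': 1.3..., 'mean': 0.43..., 'max': 0.7}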