From f3b2535d928ecfb6049735d5dab6a0e57954cfc9 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sun, 11 Feb 2024 15:00:17 +0400 Subject: [PATCH] modularization and cleanup --- semantic_router/index/base.py | 6 ++--- semantic_router/index/local.py | 34 ++++++++++++++++++++----- semantic_router/index/pinecone.py | 8 +++--- semantic_router/layer.py | 42 +++++++++++++------------------ 4 files changed, 52 insertions(+), 38 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 4a7764e3..598f0167 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -25,9 +25,9 @@ class BaseIndex(BaseModel): """ raise NotImplementedError("This method should be implemented by subclasses.") - def delete(self, indices_to_remove: List[int]): + def delete(self, route_name: str): """ - Remove items from the index by their indices. + Deletes route by route name. This method should be implemented by subclasses. """ raise NotImplementedError("This method should be implemented by subclasses.") @@ -39,7 +39,7 @@ class BaseIndex(BaseModel): """ raise NotImplementedError("This method should be implemented by subclasses.") - def query(self, query_vector: Any, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]: + def query(self, vector: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]: """ Search the index for the query_vector and return top_k results. This method should be implemented by subclasses. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index 27e39704..72c474ae 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -15,17 +15,35 @@ class LocalIndex(BaseIndex): def add(self, embeddings: List[List[float]], routes: List[str], utterances: List[str]): embeds = np.array(embeddings) # type: ignore + routes_arr = np.array(routes) + utterances_arr = np.array(utterances) if self.index is None: self.index = embeds # type: ignore + self.routes = routes_arr + self.utterances = utterances_arr else: self.index = np.concatenate([self.index, embeds]) + self.routes = np.concatenate([self.routes, routes_arr]) + self.utterances = np.concatenate([self.utterances, utterances_arr]) - def delete(self, indices_to_remove: List[int]): + def _get_indices_for_route(self, route_name: str): + """Gets an array of indices for a specific route. """ - Remove all items of a specific category from the index. + idx = [ + i for i, route in enumerate(self.routes) + if route == route_name + ] + return idx + + def delete(self, route_name: str): + """ + Delete all records of a specific route from the index. """ if self.index is not None: - self.index = np.delete(self.index, indices_to_remove, axis=0) + delete_idx = self._get_indices_for_route(route_name=route_name) + self.index = np.delete(self.index, delete_idx, axis=0) + self.routes = np.delete(self.routes, delete_idx, axis=0) + self.utterances = np.delete(self.utterances, delete_idx, axis=0) def describe(self): return { @@ -34,14 +52,18 @@ class LocalIndex(BaseIndex): "vectors": self.index.shape[0] if self.index is not None else 0 } - def query(self, query_vector: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, List[str]]: + def query(self, vector: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, List[str]]: """ Search the index for the query and return top_k results. """ if self.index is None: raise ValueError("Index is not populated.") - sim = similarity_matrix(query_vector, self.index) - return top_scores(sim, top_k) + sim = similarity_matrix(vector, self.index) + # extract the index values of top scoring vectors + scores, idx = top_scores(sim, top_k) + # get routes from index values + route_names = self.routes[idx].copy() + return scores, route_names def delete_index(self): """ diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 792381e2..2f6d5611 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -16,7 +16,7 @@ class PineconeRecord(BaseModel): def to_dict(self): return { - "id": self.id, + "id": f"{self.route}#{self.id}", "values": self.values, "metadata": { "sr_route": self.route, @@ -105,7 +105,7 @@ class PineconeIndex(BaseIndex): vectors_to_upsert.append(record.to_dict()) self.index.upsert(vectors=vectors_to_upsert) - def delete(self, ids_to_remove: List[str]): + def delete(self, route_name: str): self.index.delete(ids=ids_to_remove) def delete_all(self): @@ -119,8 +119,8 @@ class PineconeIndex(BaseIndex): "vectors": stats["total_vector_count"] } - def query(self, query_vector: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, List[str]]: - query_vector_list = query_vector.tolist() + def query(self, vector: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, List[str]]: + query_vector_list = vector.tolist() results = self.index.query( vector=[query_vector_list], top_k=top_k, diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 89d084e8..21d208f1 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -152,7 +152,6 @@ class LayerConfig: class RouteLayer: - categories: Optional[np.ndarray] = None score_threshold: float encoder: BaseEncoder index: BaseIndex @@ -166,7 +165,6 @@ class RouteLayer: ): logger.info("local") self.index: BaseIndex = index - self.categories = None if encoder is None: logger.warning( "No encoder provided. Using default OpenAIEncoder. Ensure " @@ -208,7 +206,7 @@ class RouteLayer: vector_arr = self._encode(text=text) else: vector_arr = np.array(vector) - # get relevant utterances + # get relevant results (scores and routes) results = self._retrieve(xq=vector_arr) # decide most relevant routes top_class, top_class_scores = self._semantic_classify(results) @@ -285,24 +283,23 @@ class RouteLayer: def list_route_names(self) -> List[str]: return [route.name for route in self.routes] - - def remove(self, name: str): - if name not in [route.name for route in self.routes]: - err_msg = f"Route `{name}` not found" + + def update(self, route_name: str, utterances: List[str]): + raise NotImplementedError("This method has not yet been implemented.") + + def delete(self, route_name: str): + """Deletes a route given a specific route name. + + :param route_name: the name of the route to be deleted + :type str: + """ + if route_name not in [route.name for route in self.routes]: + err_msg = f"Route `{route_name}` not found" logger.error(err_msg) raise ValueError(err_msg) else: - self.routes = [route for route in self.routes if route.name != name] - logger.info(f"Removed route `{name}`") - # Also remove from index and categories - if self.categories is not None and self.index.is_index_populated(): - indices_to_remove = [ - i - for i, route_name in enumerate(self.categories) - if route_name == name - ] - self.index.remove(indices_to_remove) - self.categories = np.delete(self.categories, indices_to_remove, axis=0) + self.routes = [route for route in self.routes if route.name != route_name] + self.index.delete(route_name=route_name) def _add_routes(self, routes: List[Route]): # create embeddings for all routes @@ -326,13 +323,8 @@ class RouteLayer: def _retrieve(self, xq: Any, top_k: int = 5) -> List[dict]: """Given a query vector, retrieve the top_k most similar records.""" - # calculate similarity matrix - if self.index.type == "local": - scores, idx = self.index.query(xq, top_k) - # get the utterance categories (route names) - routes = self.categories[idx] if self.categories is not None else [] - elif self.index.type == "pinecone": - scores, routes = self.index.query(xq, top_k) + # get scores and routes + scores, routes = self.index.query(vector=xq, top_k=top_k) return [{"route": d, "score": s.item()} for d, s in zip(routes, scores)] def _semantic_classify(self, query_results: List[dict]) -> Tuple[str, List[float]]: -- GitLab