From 417a2b21b5747a460825e92ea4025c939c9d9eef Mon Sep 17 00:00:00 2001
From: Siraj R Aizlewood <siraj@aurelio.ai>
Date: Thu, 1 Feb 2024 14:15:24 +0400
Subject: [PATCH] Removed Unused Splitter and Created New Splitter Module

---
Review notes: the running_avg_sim.py hunk was revised (docstrings, empty-input
guard, pairwise similarity instead of the full matrix, running sum instead of
repeated np.mean, grouped imports, trailing newline). The blob id on that
file's index line therefore no longer matches the content.

 semantic_router/splitters/consecutive_sim.py | 99 --------------------
 semantic_router/splitters/running_avg_sim.py | 95 +++++++++++++++++++
 2 files changed, 95 insertions(+), 99 deletions(-)
 create mode 100644 semantic_router/splitters/running_avg_sim.py

diff --git a/semantic_router/splitters/consecutive_sim.py b/semantic_router/splitters/consecutive_sim.py
index b61610e5..a9038750 100644
--- a/semantic_router/splitters/consecutive_sim.py
+++ b/semantic_router/splitters/consecutive_sim.py
@@ -44,103 +44,4 @@ class ConsecutiveSimSplitter(BaseSplitter):
                 curr_split_start_idx = idx
                 curr_split_num += 1
         splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
-        return splits
-
-
-class ConsecutiveAvgSimSplitter(BaseSplitter):
-    def __init__(
-        self,
-        encoder: BaseEncoder,
-        name: str = "consecutive_similarity_splitter",
-        similarity_threshold: float = 0.45,
-        drop_threshold: float = 0.1 # Additional parameter to control the drop threshold
-    ):
-        super().__init__(
-            name=name,
-            similarity_threshold=similarity_threshold,
-            encoder=encoder
-        )
-
-    def __call__(self, docs: List[str], drop_threshold):
-        doc_embeds = self.encoder(docs)
-        norm_embeds = doc_embeds / np.linalg.norm(doc_embeds, axis=1, keepdims=True)
-        sim_matrix = np.matmul(norm_embeds, norm_embeds.T)
-        total_docs = len(docs)
-        splits = []
-        curr_split_start_idx = 0
-
-        # Calculate similarity scores between consecutive documents
-        sim_scores = [sim_matrix[i][i+1] for i in range(total_docs - 1)]
-
-        # Calculate running average of similarity scores
-        running_avg = [np.mean(sim_scores[:i+1]) for i in range(len(sim_scores))]
-
-        for idx, curr_sim_score in enumerate(sim_scores):
-            # Check for a significant drop in similarity compared to the running average
-            if idx > 0 and (running_avg[idx-1] - curr_sim_score) > drop_threshold:
-                splits.append(
-                    DocumentSplit(
-                        docs=list(docs[curr_split_start_idx:idx+1]), # Include current doc in the split
-                        is_triggered=True,
-                        triggered_score=curr_sim_score,
-                    )
-                )
-                curr_split_start_idx = idx + 1 # Update the start index for the next split
-
-        # Add the last split
-        if curr_split_start_idx < total_docs:
-            splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
-
-        return splits
-
-
-class ConsecutiveAvgSimSplitter2(BaseSplitter):
-    def __init__(
-        self,
-        encoder: BaseEncoder,
-        name: str = "consecutive_similarity_splitter",
-        similarity_threshold: float = 0.45,
-        drop_threshold: float = 0.1 # Additional parameter to control the drop threshold
-    ):
-        super().__init__(
-            name=name,
-            similarity_threshold=similarity_threshold,
-            encoder=encoder
-        )
-
-    def __call__(self, docs: List[str], drop_threshold):
-        doc_embeds = self.encoder(docs)
-        norm_embeds = doc_embeds / np.linalg.norm(doc_embeds, axis=1, keepdims=True)
-        sim_matrix = np.matmul(norm_embeds, norm_embeds.T)
-        total_docs = len(docs)
-        splits = []
-        curr_split_start_idx = 0
-
-        # Initialize an empty list to store similarity scores for the current topic segment
-        segment_sim_scores = []
-
-        for idx in range(total_docs - 1):
-            curr_sim_score = sim_matrix[idx][idx + 1]
-            segment_sim_scores.append(curr_sim_score)
-
-            # Calculate running average of similarity scores for the current segment
-            running_avg = np.mean(segment_sim_scores)
-
-            # Check for a significant drop in similarity compared to the running average
-            if idx > 0 and (running_avg - curr_sim_score) > drop_threshold:
-                splits.append(
-                    DocumentSplit(
-                        docs=list(docs[curr_split_start_idx:idx + 1]), # Include current doc in the split
-                        is_triggered=True,
-                        triggered_score=curr_sim_score,
-                    )
-                )
-                curr_split_start_idx = idx + 1
-                # Reset the similarity scores for the new segment
-                segment_sim_scores = [curr_sim_score]
-
-        # Add the last split
-        if curr_split_start_idx < total_docs:
-            splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
-
         return splits
\ No newline at end of file
diff --git a/semantic_router/splitters/running_avg_sim.py b/semantic_router/splitters/running_avg_sim.py
new file mode 100644
index 00000000..dd05c7e2
--- /dev/null
+++ b/semantic_router/splitters/running_avg_sim.py
@@ -0,0 +1,95 @@
+"""Split documents where similarity drops below a running average."""
+
+from typing import List
+
+import numpy as np
+
+from semantic_router.encoders import BaseEncoder
+from semantic_router.schema import DocumentSplit
+from semantic_router.splitters.base import BaseSplitter
+
+
+class RunningAvgSimSplitter(BaseSplitter):
+    """Cut a document sequence where neighbour similarity drops sharply.
+
+    Each document is embedded and compared with its successor by cosine
+    similarity. A split is triggered when that similarity falls more than
+    ``similarity_threshold`` below the running average of the similarities
+    seen so far in the current segment.
+    """
+
+    def __init__(
+        self,
+        encoder: BaseEncoder,
+        # NOTE(review): default name is the same string the consecutive
+        # similarity splitters used; kept as committed, consider renaming.
+        name: str = "consecutive_similarity_splitter",
+        similarity_threshold: float = 0.04,
+    ):
+        """Pass the encoder and the drop threshold to the base splitter.
+
+        Args:
+            encoder: Called with the document list to produce one embedding
+                per document.
+            name: Identifier passed through to ``BaseSplitter``.
+            similarity_threshold: Minimum drop below the running average
+                that triggers a split.
+        """
+        super().__init__(
+            name=name,
+            similarity_threshold=similarity_threshold,
+            encoder=encoder,
+        )
+
+    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+        """Group ``docs`` into consecutive splits.
+
+        Args:
+            docs: Documents in their original order.
+
+        Returns:
+            The splits in order. Every split except the last is marked as
+            triggered and records the similarity score that caused the cut.
+            An empty ``docs`` yields an empty list.
+        """
+        if not docs:
+            # Nothing to embed; avoids normalising an empty array.
+            return []
+
+        doc_embeds = np.asarray(self.encoder(docs))
+        norm_embeds = doc_embeds / np.linalg.norm(doc_embeds, axis=1, keepdims=True)
+        # Only neighbouring documents are compared, so take the row-wise dot
+        # product of each embedding with its successor rather than building
+        # the full n x n similarity matrix.
+        sim_scores = np.sum(norm_embeds[:-1] * norm_embeds[1:], axis=1)
+
+        splits: List[DocumentSplit] = []
+        curr_split_start_idx = 0
+        # Running sum and count of the similarity scores in the current segment.
+        segment_sum = 0.0
+        segment_count = 0
+
+        for idx, curr_sim_score in enumerate(sim_scores):
+            segment_sum += curr_sim_score
+            segment_count += 1
+            # Note that the average includes the current score.
+            running_avg = segment_sum / segment_count
+
+            # A large drop below the running average marks a boundary
+            # between document idx and document idx + 1.
+            drop = running_avg - curr_sim_score
+            if idx > 0 and drop > self.similarity_threshold:
+                splits.append(
+                    DocumentSplit(
+                        docs=list(docs[curr_split_start_idx : idx + 1]),
+                        is_triggered=True,
+                        triggered_score=curr_sim_score,
+                    )
+                )
+                curr_split_start_idx = idx + 1
+                # The new segment's average is seeded with the boundary score.
+                segment_sum = curr_sim_score
+                segment_count = 1
+
+        # idx never exceeds len(docs) - 2, so at least one document remains.
+        splits.append(DocumentSplit(docs=list(docs[curr_split_start_idx:])))
+        return splits
-- 
GitLab