diff --git a/semantic_router/schema.py b/semantic_router/schema.py
index b7a3c9faf7817e48215bcebdab8f3e7140aae970..465cfaacb1ecbbc0c26bb2ee8f67f814da308e1e 100644
--- a/semantic_router/schema.py
+++ b/semantic_router/schema.py
@@ -9,6 +9,8 @@ from semantic_router.encoders import (
     OpenAIEncoder,
 )
 
+from semantic_router.utils.splitters import semantic_splitter
+
 
 class EncoderType(Enum):
     HUGGINGFACE = "huggingface"
@@ -41,3 +43,23 @@ class Encoder:
 
     def __call__(self, texts: list[str]) -> list[list[float]]:
         return self.model(texts)
+
+
+class Message(BaseModel):
+    role: str
+    content: str
+
+
+class Conversation(BaseModel):
+    messages: list[Message]
+
+    def split_by_topic(
+        self,
+        encoder: BaseEncoder,
+        threshold: float = 0.5,
+        split_method: str = "consecutive_similarity_drop",
+    ) -> dict[str, list[str]]:
+        docs = [f"{m.role}: {m.content}" for m in self.messages]
+        return semantic_splitter(
+            encoder=encoder, docs=docs, threshold=threshold, split_method=split_method
+        )
diff --git a/semantic_router/utils/splitters.py b/semantic_router/utils/splitters.py
new file mode 100644
index 0000000000000000000000000000000000000000..514ae8217ff4717c32b6909fcafc57665217410d
--- /dev/null
+++ b/semantic_router/utils/splitters.py
@@ -0,0 +1,78 @@
+import numpy as np
+from semantic_router.encoders import BaseEncoder
+
+
+def semantic_splitter(
+    encoder: BaseEncoder,
+    docs: list[str],
+    threshold: float,
+    split_method: str = "consecutive_similarity_drop",
+) -> dict[str, list[str]]:
+    """
+    Splits a list of documents based on changes in semantic similarity.
+
+    Method 1: "consecutive_similarity_drop" - starts a new split whenever the
+    similarity between two consecutive documents drops below the threshold.
+    Method 2: "cumulative_similarity_drop" - starts a new split whenever the
+    similarity between the documents accumulated in the current split and the
+    next document drops below the threshold.
+
+    Args:
+        encoder (BaseEncoder): Encoder for document embeddings.
+        docs (list[str]): Documents to split.
+        threshold (float): The similarity value below which a new split starts.
+        split_method (str): The method to use for splitting.
+
+    Returns:
+        dict[str, list[str]]: Splits with their corresponding documents.
+    """
+    total_docs = len(docs)
+    splits = {}
+    curr_split_start_idx = 0
+    curr_split_num = 1
+
+    if split_method == "consecutive_similarity_drop":
+        # Embed every document once, normalize, and take pairwise dot products
+        # so that sim_matrix[i][j] is the cosine similarity of docs i and j.
+        doc_embeds = np.array(encoder(docs))
+        norm_embeds = doc_embeds / np.linalg.norm(doc_embeds, axis=1, keepdims=True)
+        sim_matrix = np.matmul(norm_embeds, norm_embeds.T)
+
+        for idx in range(1, total_docs):
+            if sim_matrix[idx - 1][idx] < threshold:
+                splits[f"split {curr_split_num}"] = docs[curr_split_start_idx:idx]
+                curr_split_start_idx = idx
+                curr_split_num += 1
+
+    elif split_method == "cumulative_similarity_drop":
+        for idx in range(1, total_docs):
+            if idx + 1 < total_docs:
+                # Re-embed the current split as it grows and compare it with
+                # the next document via cosine similarity.
+                curr_split_docs = "\n".join(docs[curr_split_start_idx : idx + 1])
+                next_doc = docs[idx + 1]
+
+                curr_split_docs_embed = encoder([curr_split_docs])[0]
+                next_doc_embed = encoder([next_doc])[0]
+
+                similarity = np.dot(curr_split_docs_embed, next_doc_embed) / (
+                    np.linalg.norm(curr_split_docs_embed)
+                    * np.linalg.norm(next_doc_embed)
+                )
+
+                if similarity < threshold:
+                    splits[f"split {curr_split_num}"] = docs[
+                        curr_split_start_idx : idx + 1
+                    ]
+                    curr_split_start_idx = idx + 1
+                    curr_split_num += 1
+
+    else:
+        raise ValueError(
+            "Invalid 'split_method'. Choose either 'consecutive_similarity_drop'"
+            " or 'cumulative_similarity_drop'."
+        )
+
+    # Whatever remains after the last detected drop forms the final split.
+    splits[f"split {curr_split_num}"] = docs[curr_split_start_idx:]
+    return splits
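For reviewers, a minimal usage sketch of the new splitter. `ToyEncoder` below is a hypothetical stand-in, not part of this PR or the library: `semantic_splitter` only requires that the encoder be callable on a list of strings and return one embedding per string, so any concrete `BaseEncoder` (e.g. `OpenAIEncoder`) would take its place in practice, and `Conversation.split_by_topic` delegates to the same function after formatting each `Message` as `"role: content"`.

```python
from semantic_router.utils.splitters import semantic_splitter


class ToyEncoder:
    """Hypothetical stand-in for a BaseEncoder, for illustration only.

    Maps docs mentioning "cat" to one fixed vector and everything else to an
    orthogonal one, so the topic change is easy to see. It duck-types the
    callable interface that semantic_splitter actually uses.
    """

    def __call__(self, texts: list[str]) -> list[list[float]]:
        return [[1.0, 0.0] if "cat" in t else [0.0, 1.0] for t in texts]


docs = [
    "user: my cat keeps sleeping all day",
    "assistant: cats sleep 12-16 hours a day on average",
    "user: which stocks should I watch this week?",
]

# Consecutive similarity between the two cat docs is 1.0 (no split), while
# between docs 2 and 3 it is 0.0 (< 0.5), so a new split starts at doc 3.
splits = semantic_splitter(encoder=ToyEncoder(), docs=docs, threshold=0.5)
print(splits)
# {'split 1': ['user: my cat keeps sleeping all day',
#              'assistant: cats sleep 12-16 hours a day on average'],
#  'split 2': ['user: which stocks should I watch this week?']}
```

The same threshold semantics apply to `split_by_topic`, which just builds the `docs` list from its messages before calling `semantic_splitter`.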