Skip to content
Snippets Groups Projects
Commit d3bffff0 authored by Ismail Ashraq's avatar Ismail Ashraq
Browse files

semantic splitter

parent 412d74cf
No related branches found
No related tags found
No related merge requests found
...@@ -9,6 +9,8 @@ from semantic_router.encoders import ( ...@@ -9,6 +9,8 @@ from semantic_router.encoders import (
OpenAIEncoder, OpenAIEncoder,
) )
from semantic_router.utils.splitters import semantic_splitter
class EncoderType(Enum): class EncoderType(Enum):
HUGGINGFACE = "huggingface" HUGGINGFACE = "huggingface"
...@@ -41,3 +43,23 @@ class Encoder: ...@@ -41,3 +43,23 @@ class Encoder:
def __call__(self, texts: list[str]) -> list[list[float]]: def __call__(self, texts: list[str]) -> list[list[float]]:
return self.model(texts) return self.model(texts)
class Message(BaseModel):
    """One conversation turn; rendered as "role: content" by Conversation.split_by_topic."""
    role: str  # speaker label (free-form string, e.g. used as the "role" prefix when rendering)
    content: str  # the message text
class Conversation(BaseModel):
    """An ordered sequence of chat messages that can be segmented by topic."""

    messages: list[Message]

    def split_by_topic(
        self,
        encoder: BaseEncoder,
        threshold: float = 0.5,
        split_method: str = "consecutive_similarity_drop",
    ):
        """Segment this conversation into topical splits.

        Each message is rendered as "role: content" and the rendered lines are
        passed to ``semantic_splitter``, which groups consecutive lines whose
        embedding similarity stays at or above ``threshold``.
        """
        rendered = [f"{m.role}: {m.content}" for m in self.messages]
        return semantic_splitter(
            encoder=encoder,
            docs=rendered,
            threshold=threshold,
            split_method=split_method,
        )
import numpy as np
from semantic_router.encoders import BaseEncoder
def semantic_splitter(
    encoder: "BaseEncoder",  # string annotation: avoids a hard import at def time
    docs: list[str],
    threshold: float,
    split_method: str = "consecutive_similarity_drop",
) -> dict[str, list[str]]:
    """
    Splits a list of documents based on semantic similarity changes.

    Method 1: "consecutive_similarity_drop" - embeds every document once and
    starts a new split whenever the cosine similarity between two consecutive
    documents drops below `threshold`.

    Method 2: "cumulative_similarity_drop" - joins the documents accumulated in
    the current split, re-embeds that running text, and starts a new split when
    its cosine similarity to the next document drops below `threshold`.
    NOTE(review): with this method the earliest possible boundary is after
    index 1, so the first two documents always share a split (preserved from
    the original implementation).

    Args:
        encoder (BaseEncoder): Encoder for document embeddings.
        docs (list[str]): Documents to split.
        threshold (float): The similarity drop value that will trigger a new document split.
        split_method (str): The method to use for splitting.

    Returns:
        dict[str, list[str]]: Splits ("split 1", "split 2", ...) with their documents.

    Raises:
        ValueError: If `split_method` is not one of the supported methods.
    """
    # Validate the method up front, before any (potentially expensive) encoding.
    if split_method not in ("consecutive_similarity_drop", "cumulative_similarity_drop"):
        raise ValueError(
            "Invalid 'split_method'. Choose either 'consecutive_similarity_drop' or 'cumulative_similarity_drop'."
        )
    # Nothing to split: return no splits rather than encoding an empty batch.
    if not docs:
        return {}

    total_docs = len(docs)
    splits = {}
    curr_split_start_idx = 0
    curr_split_num = 1

    if split_method == "consecutive_similarity_drop":
        # Embed everything in one batch; encoders may return plain lists, so
        # coerce to an ndarray before the vectorised math.
        doc_embeds = np.asarray(encoder(docs), dtype=float)
        # L2-normalise rows so the dot product below is cosine similarity.
        norm_embeds = doc_embeds / np.linalg.norm(doc_embeds, axis=1, keepdims=True)
        sim_matrix = np.matmul(norm_embeds, norm_embeds.T)
        for idx in range(1, total_docs):
            # A drop between neighbours (idx-1, idx) starts a new split at idx.
            # (idx is always within the matrix: range(1, total_docs) == matrix size.)
            if sim_matrix[idx - 1][idx] < threshold:
                splits[f"split {curr_split_num}"] = docs[curr_split_start_idx:idx]
                curr_split_start_idx = idx
                curr_split_num += 1
    else:  # "cumulative_similarity_drop" — only remaining valid method
        for idx in range(1, total_docs):
            if idx + 1 < total_docs:
                # Compare the running split text against the *next* document.
                curr_split_docs = "\n".join(docs[curr_split_start_idx : idx + 1])
                next_doc = docs[idx + 1]
                curr_split_docs_embed = encoder([curr_split_docs])[0]
                next_doc_embed = encoder([next_doc])[0]
                # Cosine similarity between the running split and the next doc.
                similarity = np.dot(curr_split_docs_embed, next_doc_embed) / (
                    np.linalg.norm(curr_split_docs_embed)
                    * np.linalg.norm(next_doc_embed)
                )
                if similarity < threshold:
                    splits[f"split {curr_split_num}"] = docs[
                        curr_split_start_idx : idx + 1
                    ]
                    curr_split_start_idx = idx + 1
                    curr_split_num += 1
    # Whatever remains after the last boundary forms the final split.
    splits[f"split {curr_split_num}"] = docs[curr_split_start_idx:]
    return splits
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment