diff --git a/Makefile b/Makefile
index 3891e91208740fe4db727f61516a7cf2af6c84d7..a7c69643704677ab7650f8b350c74cad0b5c7a4e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 format:
-	poetry run black --target-version py39 .
+	poetry run black --target-version py39 -l 88 .
 	poetry run ruff --select I --fix .
 
 PYTHON_FILES=.
@@ -7,7 +7,7 @@ lint: PYTHON_FILES=.
 lint_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$')
 
 lint lint_diff:
-	poetry run black --target-version py39 $(PYTHON_FILES) --check
+	poetry run black --target-version py39 -l 88 $(PYTHON_FILES) --check
 	poetry run ruff .
 	poetry run mypy $(PYTHON_FILES)
diff --git a/semantic_router/splitters/base.py b/semantic_router/splitters/base.py
index 8e68cd04937fc04f72995a22edd309186e5de604..0514e014e34e50b03a1285395591d71f67775b17 100644
--- a/semantic_router/splitters/base.py
+++ b/semantic_router/splitters/base.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List
 
 from colorama import Fore, Style
 from pydantic.v1 import BaseModel, Extra
@@ -10,7 +10,6 @@ from semantic_router.schema import DocumentSplit
 class BaseSplitter(BaseModel):
     name: str
     encoder: BaseEncoder
-    score_threshold: Optional[float]
 
     class Config:
         extra = Extra.allow
diff --git a/semantic_router/splitters/consecutive_sim.py b/semantic_router/splitters/consecutive_sim.py
index f30bbc755061640300f861e04fc7febba57fe947..4a2e110645ae90932871c36c5eb30f537943ad6f 100644
--- a/semantic_router/splitters/consecutive_sim.py
+++ b/semantic_router/splitters/consecutive_sim.py
@@ -19,8 +19,9 @@
         name: str = "consecutive_similarity_splitter",
         score_threshold: float = 0.45,
     ):
-        super().__init__(name=name, score_threshold=score_threshold, encoder=encoder)
+        super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
+        self.score_threshold = score_threshold
 
     def __call__(self, docs: List[Any]):
         # Check if there's only a single document
diff --git a/semantic_router/splitters/cumulative_sim.py b/semantic_router/splitters/cumulative_sim.py
index f7a6475ad809a8b1eb877f592cf9ca0799941ba2..e9dd8deb2b1f24f519fab47c8fb61cc7e8f0f221 100644
--- a/semantic_router/splitters/cumulative_sim.py
+++ b/semantic_router/splitters/cumulative_sim.py
@@ -8,9 +8,9 @@
 from semantic_router.splitters.base import BaseSplitter
 
 class CumulativeSimSplitter(BaseSplitter):
-
     """
-    Called "cumulative sim" because we check the similarities of the embeddings of cumulative concatenated documents with the next document.
+    Called "cumulative sim" because we check the similarities of the
+    embeddings of cumulative concatenated documents with the next document.
     """
 
     def __init__(
@@ -19,15 +19,17 @@
         name: str = "cumulative_similarity_splitter",
         score_threshold: float = 0.45,
     ):
-        super().__init__(name=name, score_threshold=score_threshold, encoder=encoder)
+        super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
+        self.score_threshold = score_threshold
 
     def __call__(self, docs: List[str]):
         total_docs = len(docs)
         # Check if there's only a single document
         if total_docs == 1:
             raise ValueError(
-                "There is only one document provided; at least two are required to determine topics based on similarity."
+                "There is only one document provided; at least two are required "
+                "to determine topics based on similarity."
             )
         splits = []
         curr_split_start_idx = 0
@@ -35,10 +37,12 @@ class CumulativeSimSplitter(BaseSplitter):
         for idx in range(0, total_docs):
             if idx + 1 < total_docs:  # Ensure there is a next document to compare with.
                 if idx == 0:
-                    # On the first iteration, compare the first document directly to the second.
+                    # On the first iteration, compare the
+                    # first document directly to the second.
                     curr_split_docs = docs[idx]
                 else:
-                    # For subsequent iterations, compare cumulative documents up to the current one with the next.
+                    # For subsequent iterations, compare cumulative
+                    # documents up to the current one with the next.
                     curr_split_docs = "\n".join(docs[curr_split_start_idx : idx + 1])
                 next_doc = docs[idx + 1]
 
diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index 4a746f7cd9988f4418dfb73c23e868cafb61a653..0369c746dd5e0ac38f8339c03457373d648dffb1 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -9,7 +9,6 @@
 from semantic_router.splitters.utils import split_to_sentences, tiktoken_length
 from semantic_router.utils.logger import logger
 
-
 class RollingWindowSplitter(BaseSplitter):
     def __init__(
         self,
@@ -20,7 +19,7 @@
         max_split_tokens=300,
         split_tokens_tolerance=10,
         plot_splits=False,
-        name = "rolling_window_splitter",
+        name="rolling_window_splitter",
     ):
         super().__init__(name=name, encoder=encoder)
         self.calculated_threshold: float
diff --git a/semantic_router/splitters/utils.py b/semantic_router/splitters/utils.py
index 296df2ad364cc6c972e73a112b1b7fa09714e515..349c3eaac76017c53dd5425abfe65e7e2952a679 100644
--- a/semantic_router/splitters/utils.py
+++ b/semantic_router/splitters/utils.py
@@ -1,10 +1,5 @@
-from typing import List
-
 import regex
 import tiktoken
-from colorama import Fore, Style
-
-from semantic_router.schema import DocumentSplit
 
 
 def split_to_sentences(text: str) -> list[str]:
@@ -66,4 +61,3 @@ def tiktoken_length(text: str) -> int:
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text, disallowed_special=())
     return len(tokens)
-
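
Usage note: the sketch below is not part of the patch; it illustrates the behaviour these changes introduce. score_threshold is no longer a pydantic field on BaseSplitter; each splitter's __init__ now copies it onto the encoder (where the similarity checks consume it) and mirrors it onto the splitter instance. The encoder class and model name below are illustrative assumptions; any BaseEncoder subclass would work the same way.

# Minimal sketch, assuming semantic_router's OpenAIEncoder and a configured
# OPENAI_API_KEY. The model name is illustrative.
from semantic_router.encoders import OpenAIEncoder
from semantic_router.splitters.consecutive_sim import ConsecutiveSimSplitter

encoder = OpenAIEncoder(name="text-embedding-ada-002")
splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=0.45)

# The threshold is forwarded to the encoder rather than to BaseSplitter...
assert encoder.score_threshold == 0.45
# ...and is also kept on the splitter. Since BaseSplitter no longer declares
# this field, the assignment relies on the Extra.allow config shown in base.py.
assert splitter.score_threshold == 0.45

# Splitting requires at least two documents; a single document raises ValueError.
splits = splitter(["First document text.", "Second document text."])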