From ff2ed5f94090e66b606e516c5bb63cb96ae71178 Mon Sep 17 00:00:00 2001
From: Simonas <20096648+simjak@users.noreply.github.com>
Date: Fri, 23 Feb 2024 11:10:19 +0200
Subject: [PATCH] chore: lint

---
 Makefile                                     |  4 ++--
 semantic_router/splitters/base.py            |  3 +--
 semantic_router/splitters/consecutive_sim.py |  3 ++-
 semantic_router/splitters/cumulative_sim.py  | 16 ++++++++++------
 semantic_router/splitters/rolling_window.py  |  3 +--
 semantic_router/splitters/utils.py           |  6 ------
 6 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index 3891e912..a7c69643 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 format:
-	poetry run black --target-version py39 .
+	poetry run black --target-version py39 -l 88 .
 	poetry run ruff --select I --fix .
 
 PYTHON_FILES=.
@@ -7,7 +7,7 @@
 lint: PYTHON_FILES=.
 lint_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$')
 
 lint lint_diff:
-	poetry run black --target-version py39 $(PYTHON_FILES) --check
+	poetry run black --target-version py39 -l 88 $(PYTHON_FILES) --check
 	poetry run ruff .
 	poetry run mypy $(PYTHON_FILES)
diff --git a/semantic_router/splitters/base.py b/semantic_router/splitters/base.py
index 8e68cd04..0514e014 100644
--- a/semantic_router/splitters/base.py
+++ b/semantic_router/splitters/base.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List
 
 from colorama import Fore, Style
 from pydantic.v1 import BaseModel, Extra
@@ -10,7 +10,6 @@ from semantic_router.schema import DocumentSplit
 class BaseSplitter(BaseModel):
     name: str
     encoder: BaseEncoder
-    score_threshold: Optional[float]
 
     class Config:
         extra = Extra.allow
diff --git a/semantic_router/splitters/consecutive_sim.py b/semantic_router/splitters/consecutive_sim.py
index f30bbc75..4a2e1106 100644
--- a/semantic_router/splitters/consecutive_sim.py
+++ b/semantic_router/splitters/consecutive_sim.py
@@ -19,8 +19,9 @@ class ConsecutiveSimSplitter(BaseSplitter):
         name: str = "consecutive_similarity_splitter",
         score_threshold: float = 0.45,
     ):
-        super().__init__(name=name, score_threshold=score_threshold, encoder=encoder)
+        super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
+        self.score_threshold = score_threshold
 
     def __call__(self, docs: List[Any]):
         # Check if there's only a single document
diff --git a/semantic_router/splitters/cumulative_sim.py b/semantic_router/splitters/cumulative_sim.py
index f7a6475a..e9dd8deb 100644
--- a/semantic_router/splitters/cumulative_sim.py
+++ b/semantic_router/splitters/cumulative_sim.py
@@ -8,9 +8,9 @@ from semantic_router.splitters.base import BaseSplitter
 
 
 class CumulativeSimSplitter(BaseSplitter):
-    """
-    Called "cumulative sim" because we check the similarities of the embeddings of cumulative concatenated documents with the next document.
+    """
+    Called "cumulative sim" because we check the similarities of the
+    embeddings of cumulative concatenated documents with the next document.
     """
 
     def __init__(
@@ -19,15 +19,17 @@ class CumulativeSimSplitter(BaseSplitter):
         name: str = "cumulative_similarity_splitter",
         score_threshold: float = 0.45,
     ):
-        super().__init__(name=name, score_threshold=score_threshold, encoder=encoder)
+        super().__init__(name=name, encoder=encoder)
         encoder.score_threshold = score_threshold
+        self.score_threshold = score_threshold
 
     def __call__(self, docs: List[str]):
         total_docs = len(docs)
         # Check if there's only a single document
         if total_docs == 1:
             raise ValueError(
-                "There is only one document provided; at least two are required to determine topics based on similarity."
+                "There is only one document provided; at least two are required "
+                "to determine topics based on similarity."
             )
         splits = []
         curr_split_start_idx = 0
@@ -35,10 +37,12 @@
 
         for idx in range(0, total_docs):
             if idx + 1 < total_docs:  # Ensure there is a next document to compare with.
                 if idx == 0:
-                    # On the first iteration, compare the first document directly to the second.
+                    # On the first iteration, compare the
+                    # first document directly to the second.
                     curr_split_docs = docs[idx]
                 else:
-                    # For subsequent iterations, compare cumulative documents up to the current one with the next.
+                    # For subsequent iterations, compare cumulative
+                    # documents up to the current one with the next.
                     curr_split_docs = "\n".join(docs[curr_split_start_idx : idx + 1])
                 next_doc = docs[idx + 1]
diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index 4a746f7c..0369c746 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -9,7 +9,6 @@ from semantic_router.splitters.utils import split_to_sentences, tiktoken_length
 from semantic_router.utils.logger import logger
 
 
-
 class RollingWindowSplitter(BaseSplitter):
     def __init__(
         self,
@@ -20,7 +19,7 @@
         max_split_tokens=300,
         split_tokens_tolerance=10,
         plot_splits=False,
-        name = "rolling_window_splitter",
+        name="rolling_window_splitter",
     ):
         super().__init__(name=name, encoder=encoder)
         self.calculated_threshold: float
diff --git a/semantic_router/splitters/utils.py b/semantic_router/splitters/utils.py
index 296df2ad..349c3eaa 100644
--- a/semantic_router/splitters/utils.py
+++ b/semantic_router/splitters/utils.py
@@ -1,10 +1,5 @@
-from typing import List
-
 import regex
 import tiktoken
-from colorama import Fore, Style
-
-from semantic_router.schema import DocumentSplit
 
 
 def split_to_sentences(text: str) -> list[str]:
@@ -66,4 +61,3 @@ def tiktoken_length(text: str) -> int:
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text, disallowed_special=())
     return len(tokens)
-
-- 
GitLab