Commit 1ab26d73 authored by Simonas

fix: Hard split for max token size

parent 6cf55222
@@ -215,7 +215,7 @@ class RollingWindowSplitter(BaseSplitter):
             logger.debug(f"Document token count: {doc_token_count} tokens")
             # Check if current index is a split point based on similarity
             if doc_idx + 1 in split_indices:
-                if current_tokens_count + doc_token_count >= self.min_split_tokens:
+                if self.min_split_tokens <= current_tokens_count + doc_token_count < self.max_split_tokens:
                     # Include the current document before splitting
                     # if it doesn't exceed the max limit
                     current_split.append(doc)
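
The change tightens the similarity-split condition: a split point is now honoured only while the accumulated token count stays inside the [min_split_tokens, max_split_tokens) window, rather than whenever it merely exceeds the minimum, leaving oversized windows to the hard split referenced in the commit message. Below is a minimal sketch of the revised check; the threshold values and the should_split helper are illustrative assumptions, not part of the commit.

# Sketch (not the library's full implementation) of the revised condition:
# a similarity-based split is taken only when the accumulated token count
# lands inside the [min_split_tokens, max_split_tokens) window.
min_split_tokens = 100   # values assumed for illustration
max_split_tokens = 300

def should_split(current_tokens_count: int, doc_token_count: int) -> bool:
    """Return True when including the current document keeps the split
    within the configured token window, so the split point is honoured."""
    total = current_tokens_count + doc_token_count
    return min_split_tokens <= total < max_split_tokens

# Before the fix the check was only `total >= min_split_tokens`, so a split
# could still be taken after the max size had already been exceeded.
print(should_split(250, 40))   # True  -> split here (290 tokens, within window)
print(should_split(280, 60))   # False -> defer; the max-size hard split applies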