Skip to content
Snippets Groups Projects
Unverified commit 7f0909e8, authored by James Briggs and committed by GitHub
Browse files

Merge pull request #249 from aurelio-labs/simonas/splitter

fix: Hard split for max token size
parents 6cf55222 86bea989
No related branches found
No related tags found
No related merge requests found
...@@ -215,7 +215,11 @@ class RollingWindowSplitter(BaseSplitter): ...@@ -215,7 +215,11 @@ class RollingWindowSplitter(BaseSplitter):
logger.debug(f"Document token count: {doc_token_count} tokens") logger.debug(f"Document token count: {doc_token_count} tokens")
# Check if current index is a split point based on similarity # Check if current index is a split point based on similarity
if doc_idx + 1 in split_indices: if doc_idx + 1 in split_indices:
if current_tokens_count + doc_token_count >= self.min_split_tokens: if (
self.min_split_tokens
<= current_tokens_count + doc_token_count
< self.max_split_tokens
):
# Include the current document before splitting # Include the current document before splitting
# if it doesn't exceed the max limit # if it doesn't exceed the max limit
current_split.append(doc) current_split.append(doc)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment