Skip to content
Snippets Groups Projects
Unverified commit 7f0909e8, authored by James Briggs, committed by GitHub
Browse files

Merge pull request #249 from aurelio-labs/simonas/splitter

fix: Hard split for max token size
parents 6cf55222 86bea989
No related branches found
No related tags found
No related merge requests found
......@@ -215,7 +215,11 @@ class RollingWindowSplitter(BaseSplitter):
logger.debug(f"Document token count: {doc_token_count} tokens")
# Check if current index is a split point based on similarity
if doc_idx + 1 in split_indices:
-if current_tokens_count + doc_token_count >= self.min_split_tokens:
+if (
+self.min_split_tokens
+<= current_tokens_count + doc_token_count
+< self.max_split_tokens
+):
# Include the current document before splitting
# if it doesn't exceed the max limit
current_split.append(doc)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment