From e486157f418979c116eb54f8eef611f7d0bfda4d Mon Sep 17 00:00:00 2001 From: polarbear567 <269739606@qq.com> Date: Thu, 6 Mar 2025 00:21:05 +0800 Subject: [PATCH] =?UTF-8?q?add=20a=20field=20keep=5Fwhitespaces=20to=20det?= =?UTF-8?q?ermine=20whether=20to=20keep=20leading/tra=E2=80=A6=20(#17998)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/node_parser/text/token.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/llama-index-core/llama_index/core/node_parser/text/token.py b/llama-index-core/llama_index/core/node_parser/text/token.py index 48a2930269..dd3c151ad8 100644 --- a/llama-index-core/llama_index/core/node_parser/text/token.py +++ b/llama-index-core/llama_index/core/node_parser/text/token.py @@ -39,6 +39,11 @@ class TokenTextSplitter(MetadataAwareTextSplitter): default_factory=list, description="Additional separators for splitting." ) + keep_whitespaces: bool = Field( + default=False, + description="Whether to keep leading/trailing whitespaces in the chunk.", + ) + _tokenizer: Callable = PrivateAttr() _split_fns: List[Callable] = PrivateAttr() @@ -50,6 +55,7 @@ class TokenTextSplitter(MetadataAwareTextSplitter): callback_manager: Optional[CallbackManager] = None, separator: str = " ", backup_separators: Optional[List[str]] = ["\n"], + keep_whitespaces: bool = False, include_metadata: bool = True, include_prev_next_rel: bool = True, id_func: Optional[Callable[[int, Document], str]] = None, @@ -67,6 +73,7 @@ class TokenTextSplitter(MetadataAwareTextSplitter): chunk_overlap=chunk_overlap, separator=separator, backup_separators=backup_separators, + keep_whitespaces=keep_whitespaces, callback_manager=callback_manager, include_metadata=include_metadata, include_prev_next_rel=include_prev_next_rel, @@ -84,6 +91,7 @@ class TokenTextSplitter(MetadataAwareTextSplitter): separator: str = " ", backup_separators: Optional[List[str]] = ["\n"], callback_manager: Optional[CallbackManager] = None, + keep_whitespaces: bool = False, include_metadata: bool = True, include_prev_next_rel: bool = True, id_func: Optional[Callable[[int, Document], str]] = None, @@ -95,6 +103,7 @@ class TokenTextSplitter(MetadataAwareTextSplitter): chunk_overlap=chunk_overlap, separator=separator, backup_separators=backup_separators, + keep_whitespaces=keep_whitespaces, callback_manager=callback_manager, include_metadata=include_metadata, include_prev_next_rel=include_prev_next_rel, @@ -200,7 +209,11 @@ class TokenTextSplitter(MetadataAwareTextSplitter): # we need to end the current chunk and start a new one if cur_len + split_len > chunk_size: # end the previous chunk - chunk = "".join(cur_chunk).strip() + chunk = ( + "".join(cur_chunk) + if self.keep_whitespaces + else "".join(cur_chunk).strip() + ) if chunk: chunks.append(chunk) @@ -217,7 +230,9 @@ class TokenTextSplitter(MetadataAwareTextSplitter): cur_len += split_len # handle the last chunk - chunk = "".join(cur_chunk).strip() + chunk = ( + "".join(cur_chunk) if self.keep_whitespaces else "".join(cur_chunk).strip() + ) if chunk: chunks.append(chunk) -- GitLab