Unverified commit d3a861fa, authored by Noah Christiansen and committed by GitHub

added merging_separator argument to allow specifying the separator used when merging chunks (#18027)

parent d569009a
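
For orientation, the sketch below (not part of this commit) shows how the new merging_separator argument could be passed when constructing the parser. The import paths, the LanguageConfig/spaCy setup, and the Document/get_nodes_from_documents usage are assumptions drawn from the library's general documentation rather than from this diff; only merging_separator itself is introduced here.

# Hedged usage sketch, assuming the parser and LanguageConfig are exported from
# llama_index.core.node_parser and that the spaCy model "en_core_web_md" is installed.
from llama_index.core import Document
from llama_index.core.node_parser import (
    LanguageConfig,
    SemanticDoubleMergingSplitterNodeParser,
)

config = LanguageConfig(language="english", spacy_model="en_core_web_md")
splitter = SemanticDoubleMergingSplitterNodeParser(
    language_config=config,
    merging_threshold=0.8,
    max_chunk_size=1000,
    merging_range=1,
    merging_separator="\n\n",  # new in this commit: join merged chunks with a blank line instead of " "
)

nodes = splitter.get_nodes_from_documents([Document(text="Some longer text to split...")])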
@@ -74,6 +74,7 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
merging_threshold (float): sets threshold for merging whole chunks
max_chunk_size (int): maximum size of chunk (in characters)
merging_range (int): How many chunks 'ahead' beyond the nearest neighbor to be merged if similar (1 or 2 available)
+merging_separator (str): The separator to use when merging chunks. Defaults to a single space.
sentence_splitter (Optional[Callable]): splits text into sentences
"""
@@ -123,6 +124,11 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
),
)
+merging_separator: str = Field(
+default=" ",
+description="The separator to use when merging chunks. Defaults to a single space.",
+)
sentence_splitter: Callable[[str], List[str]] = Field(
default_factory=split_by_sentence_tokenizer,
description="The text splitter to use when splitting documents.",
@@ -142,6 +148,7 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
merging_threshold: Optional[float] = 0.8,
max_chunk_size: Optional[int] = 1000,
merging_range: Optional[int] = 1,
+merging_separator: Optional[str] = " ",
sentence_splitter: Optional[Callable[[str], List[str]]] = None,
original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
include_metadata: bool = True,
@@ -162,6 +169,7 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
merging_threshold=merging_threshold,
max_chunk_size=max_chunk_size,
merging_range=merging_range,
+merging_separator=merging_separator,
sentence_splitter=sentence_splitter,
original_text_metadata_key=original_text_metadata_key,
include_metadata=include_metadata,
@@ -239,14 +247,14 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
chunk_sentences = [chunk]
if len(chunk) + len(sentence) + 1 <= self.max_chunk_size:
chunk_sentences.append(sentence)
chunk = " ".join(chunk_sentences)
chunk = self.merging_separator.join(chunk_sentences)
new = False
else:
new = True
initial_chunks.append(chunk)
chunk = sentence
continue
last_sentences = " ".join(chunk_sentences[-2:])
last_sentences = self.merging_separator.join(chunk_sentences[-2:])
# new = False
elif (
@@ -261,8 +269,8 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
):
# elif nlp(last_sentences).similarity(nlp(sentence)) > self.threshold:
chunk_sentences.append(sentence)
last_sentences = " ".join(chunk_sentences[-2:])
chunk += " " + sentence
last_sentences = self.merging_separator.join(chunk_sentences[-2:])
chunk += self.merging_separator + sentence
else:
initial_chunks.append(chunk)
chunk = sentence # ""
@@ -302,7 +310,7 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
> self.merging_threshold
and len(current) + len(initial_chunks[i]) + 1 <= self.max_chunk_size
):
current += " " + initial_chunks[i]
current += self.merging_separator + initial_chunks[i]
# check if 1st and 3rd chunk are similar, if yes then merge 1st, 2nd, 3rd together
elif (
@@ -319,7 +327,12 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
+ 2
<= self.max_chunk_size
):
current += " " + initial_chunks[i] + " " + initial_chunks[i + 1]
current += (
self.merging_separator
+ initial_chunks[i]
+ self.merging_separator
+ initial_chunks[i + 1]
)
skip = 1
# check if 1st and 4th chunk are similar, if yes then merge 1st, 2nd, 3rd and 4th together
@@ -340,11 +353,11 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
<= self.max_chunk_size
):
current += (
" "
self.merging_separator
+ initial_chunks[i]
+ " "
+ self.merging_separator
+ initial_chunks[i + 1]
+ " "
+ self.merging_separator
+ initial_chunks[i + 2]
)
skip = 2
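
In short, every place that previously joined chunks with a hard-coded single space now joins them with self.merging_separator. A minimal standalone illustration of the resulting difference (a simplified stand-in, not the parser's actual merging logic):

# Illustrative only: simplified stand-in for the join performed above.
def merge_chunks(chunks, merging_separator=" "):
    return merging_separator.join(chunks)

print(merge_chunks(["First chunk.", "Second chunk."]))
# -> "First chunk. Second chunk."  (previous hard-coded behavior)
print(merge_chunks(["First chunk.", "Second chunk."], merging_separator="\n\n"))
# -> chunks separated by a blank line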