From bea4af984f00884accc3f48ed2bafa745a2fc5aa Mon Sep 17 00:00:00 2001 From: Yi Ding <yi.s.ding@gmail.com> Date: Thu, 27 Jul 2023 08:24:38 -0700 Subject: [PATCH] fix sentence splitter overlap logic --- .changeset/quick-pots-taste.md | 5 ++ .../docs/docs/api/classes/SentenceSplitter.md | 30 ++++--- apps/simple/split.ts | 17 ++++ packages/core/src/TextSplitter.ts | 90 +++++++++++-------- 4 files changed, 90 insertions(+), 52 deletions(-) create mode 100644 .changeset/quick-pots-taste.md create mode 100644 apps/simple/split.ts diff --git a/.changeset/quick-pots-taste.md b/.changeset/quick-pots-taste.md new file mode 100644 index 000000000..aa65cd8ab --- /dev/null +++ b/.changeset/quick-pots-taste.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Fixed sentence splitter overlap logic diff --git a/apps/docs/docs/api/classes/SentenceSplitter.md b/apps/docs/docs/api/classes/SentenceSplitter.md index 81d855091..329a9562b 100644 --- a/apps/docs/docs/api/classes/SentenceSplitter.md +++ b/apps/docs/docs/api/classes/SentenceSplitter.md @@ -8,6 +8,8 @@ custom_edit_url: null SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap. +One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together. + ## Constructors ### constructor @@ -27,7 +29,7 @@ SentenceSplitter is our default text splitter that supports splitting into sente #### Defined in -[TextSplitter.ts:33](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L33) +[TextSplitter.ts:35](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L35) ## Properties @@ -37,7 +39,7 @@ SentenceSplitter is our default text splitter that supports splitting into sente #### Defined in -[TextSplitter.ts:26](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L26) +[TextSplitter.ts:28](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L28) ___ @@ -47,7 +49,7 @@ ___ #### Defined in -[TextSplitter.ts:25](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L25) +[TextSplitter.ts:27](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L27) ___ @@ -57,7 +59,7 @@ ___ #### Defined in -[TextSplitter.ts:30](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L30) +[TextSplitter.ts:32](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L32) ___ @@ -67,7 +69,7 @@ ___ #### Defined in -[TextSplitter.ts:29](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L29) +[TextSplitter.ts:31](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L31) ___ @@ -77,7 +79,7 @@ ___ #### Defined in -[TextSplitter.ts:27](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L27) +[TextSplitter.ts:29](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L29) ___ @@ -87,7 +89,7 @@ ___ #### Defined in -[TextSplitter.ts:28](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L28) +[TextSplitter.ts:30](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L30) ## Methods @@ -108,7 +110,7 @@ ___ #### Defined in -[TextSplitter.ts:153](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L153) +[TextSplitter.ts:155](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L155) ___ @@ -128,7 +130,7 @@ ___ #### Defined in -[TextSplitter.ts:72](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L72) +[TextSplitter.ts:74](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L74) ___ @@ -149,7 +151,7 @@ ___ #### Defined in -[TextSplitter.ts:89](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L89) +[TextSplitter.ts:91](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L91) ___ @@ -170,7 +172,7 @@ ___ #### Defined in -[TextSplitter.ts:115](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L115) +[TextSplitter.ts:117](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L117) ___ @@ -191,7 +193,7 @@ ___ #### Defined in -[TextSplitter.ts:128](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L128) +[TextSplitter.ts:130](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L130) ___ @@ -212,7 +214,7 @@ ___ #### Defined in -[TextSplitter.ts:233](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L233) +[TextSplitter.ts:247](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L247) ___ @@ -233,4 +235,4 @@ ___ #### Defined in -[TextSplitter.ts:205](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L205) +[TextSplitter.ts:219](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L219) diff --git a/apps/simple/split.ts b/apps/simple/split.ts new file mode 100644 index 000000000..c3cedb071 --- /dev/null +++ b/apps/simple/split.ts @@ -0,0 +1,17 @@ +import fs from "fs/promises"; +import { SentenceSplitter } from "llamaindex"; + +async function main() { + const essay = await fs.readFile( + "node_modules/llamaindex/examples/abramov.txt", + "utf-8" + ); + + const textSplitter = new SentenceSplitter(); + + const chunks = textSplitter.splitTextWithOverlaps(essay); + + console.log(chunks); +} + +main(); diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index b6719c266..afa9d9d56 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -16,10 +16,12 @@ class TextSplit { } } -type SplitRep = [text: string, numTokens: number]; +type SplitRep = { text: string; numTokens: number }; /** * SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap. + * + * One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together. */ export class SentenceSplitter { private chunkSize: number; @@ -129,21 +131,21 @@ export class SentenceSplitter { sentenceSplits: string[], effectiveChunkSize: number ): SplitRep[] { - // Process entence splits + // Process sentence splits // Primarily check if any sentences exceed the chunk size. If they don't, // force split by tokenizer let newSplits: SplitRep[] = []; for (const split of sentenceSplits) { let splitTokens = this.tokenizer(split); - const split_len = splitTokens.length; - if (split_len <= effectiveChunkSize) { - newSplits.push([split, split_len]); + const splitLen = splitTokens.length; + if (splitLen <= effectiveChunkSize) { + newSplits.push({ text: split, numTokens: splitLen }); } else { - for (let i = 0; i < split_len; i += effectiveChunkSize) { + for (let i = 0; i < splitLen; i += effectiveChunkSize) { const cur_split = this.tokenizerDecoder( splitTokens.slice(i, i + effectiveChunkSize) ); - newSplits.push([cur_split, effectiveChunkSize]); + newSplits.push({ text: cur_split, numTokens: effectiveChunkSize }); } } } @@ -158,47 +160,59 @@ export class SentenceSplitter { // docs represents final list of text chunks let docs: TextSplit[] = []; - // curDocList represents the current list of sentence splits (that) + // curChunkSentences represents the current list of sentence splits (that) // will be merged into a chunk - let curDocList: string[] = []; - let bufferTokens = 0; - let curDocTokens = 0; - // curDocBuffer represents the current document buffer - let curDocBuffer: SplitRep[] = []; + let curChunkSentences: SplitRep[] = []; + let curChunkTokens = 0; for (let i = 0; i < newSentenceSplits.length; i++) { - // update buffer - curDocBuffer.push(newSentenceSplits[i]); - bufferTokens += newSentenceSplits[i][1] + 1; - - while (bufferTokens > this.chunkOverlap) { - // remove first element from curDocBuffer - let first_element = curDocBuffer.shift(); - if (first_element == undefined) { - throw new Error("first_element is undefined"); - } - bufferTokens -= first_element[1]; - bufferTokens -= 1; - } - // if adding newSentenceSplits[i] to curDocBuffer would exceed effectiveChunkSize, // then we need to add the current curDocBuffer to docs - if (curDocTokens + newSentenceSplits[i][1] > effectiveChunkSize) { + if ( + curChunkTokens + newSentenceSplits[i].numTokens > + effectiveChunkSize + ) { // push curent doc list to docs - docs.push(new TextSplit(curDocList.join(" ").trim())); - // reset docs list with buffer - curDocTokens = 0; - curDocList = []; - for (let j = 0; j < curDocBuffer.length; j++) { - curDocList.push(curDocBuffer[j][0]); - curDocTokens += curDocBuffer[j][1] + 1; + docs.push( + new TextSplit( + curChunkSentences + .map((sentence) => sentence.text) + .join(" ") + .trim() + ) + ); + + const lastChunkSentences = curChunkSentences; + + // reset docs list + curChunkTokens = 0; + curChunkSentences = []; + + // add the last sentences from the last chunk until we've hit the overlap + // do it in reverse order + for (let j = lastChunkSentences.length - 1; j >= 0; j--) { + if ( + curChunkTokens + lastChunkSentences[j].numTokens > + this.chunkOverlap + ) { + break; + } + curChunkSentences.unshift(lastChunkSentences[j]); + curChunkTokens += lastChunkSentences[j].numTokens + 1; } } - curDocList.push(newSentenceSplits[i][0]); - curDocTokens += newSentenceSplits[i][1] + 1; + curChunkSentences.push(newSentenceSplits[i]); + curChunkTokens += newSentenceSplits[i].numTokens + 1; } - docs.push(new TextSplit(curDocList.join(" ").trim())); + docs.push( + new TextSplit( + curChunkSentences + .map((sentence) => sentence.text) + .join(" ") + .trim() + ) + ); return docs; } -- GitLab