From bea4af984f00884accc3f48ed2bafa745a2fc5aa Mon Sep 17 00:00:00 2001
From: Yi Ding <yi.s.ding@gmail.com>
Date: Thu, 27 Jul 2023 08:24:38 -0700
Subject: [PATCH] fix sentence splitter overlap logic

---
 .changeset/quick-pots-taste.md                |  5 ++
 .../docs/docs/api/classes/SentenceSplitter.md | 30 ++++---
 apps/simple/split.ts                          | 17 ++++
 packages/core/src/TextSplitter.ts             | 90 +++++++++++--------
 4 files changed, 90 insertions(+), 52 deletions(-)
 create mode 100644 .changeset/quick-pots-taste.md
 create mode 100644 apps/simple/split.ts

diff --git a/.changeset/quick-pots-taste.md b/.changeset/quick-pots-taste.md
new file mode 100644
index 000000000..aa65cd8ab
--- /dev/null
+++ b/.changeset/quick-pots-taste.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Fixed sentence splitter overlap logic
diff --git a/apps/docs/docs/api/classes/SentenceSplitter.md b/apps/docs/docs/api/classes/SentenceSplitter.md
index 81d855091..329a9562b 100644
--- a/apps/docs/docs/api/classes/SentenceSplitter.md
+++ b/apps/docs/docs/api/classes/SentenceSplitter.md
@@ -8,6 +8,8 @@ custom_edit_url: null
 
 SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap.
 
+One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together.
+
 ## Constructors
 
 ### constructor
@@ -27,7 +29,7 @@ SentenceSplitter is our default text splitter that supports splitting into sente
 
 #### Defined in
 
-[TextSplitter.ts:33](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L33)
+[TextSplitter.ts:35](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L35)
 
 ## Properties
 
@@ -37,7 +39,7 @@ SentenceSplitter is our default text splitter that supports splitting into sente
 
 #### Defined in
 
-[TextSplitter.ts:26](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L26)
+[TextSplitter.ts:28](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L28)
 
 ___
 
@@ -47,7 +49,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:25](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L25)
+[TextSplitter.ts:27](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L27)
 
 ___
 
@@ -57,7 +59,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:30](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L30)
+[TextSplitter.ts:32](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L32)
 
 ___
 
@@ -67,7 +69,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:29](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L29)
+[TextSplitter.ts:31](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L31)
 
 ___
 
@@ -77,7 +79,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:27](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L27)
+[TextSplitter.ts:29](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L29)
 
 ___
 
@@ -87,7 +89,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:28](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L28)
+[TextSplitter.ts:30](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L30)
 
 ## Methods
 
@@ -108,7 +110,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:153](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L153)
+[TextSplitter.ts:155](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L155)
 
 ___
 
@@ -128,7 +130,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:72](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L72)
+[TextSplitter.ts:74](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L74)
 
 ___
 
@@ -149,7 +151,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:89](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L89)
+[TextSplitter.ts:91](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L91)
 
 ___
 
@@ -170,7 +172,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:115](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L115)
+[TextSplitter.ts:117](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L117)
 
 ___
 
@@ -191,7 +193,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:128](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L128)
+[TextSplitter.ts:130](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L130)
 
 ___
 
@@ -212,7 +214,7 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:233](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L233)
+[TextSplitter.ts:247](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L247)
 
 ___
 
@@ -233,4 +235,4 @@ ___
 
 #### Defined in
 
-[TextSplitter.ts:205](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L205)
+[TextSplitter.ts:219](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/TextSplitter.ts#L219)
diff --git a/apps/simple/split.ts b/apps/simple/split.ts
new file mode 100644
index 000000000..c3cedb071
--- /dev/null
+++ b/apps/simple/split.ts
@@ -0,0 +1,17 @@
+import fs from "fs/promises";
+import { SentenceSplitter } from "llamaindex";
+
+async function main() {
+  const essay = await fs.readFile(
+    "node_modules/llamaindex/examples/abramov.txt",
+    "utf-8"
+  );
+
+  const textSplitter = new SentenceSplitter();
+
+  const chunks = textSplitter.splitTextWithOverlaps(essay);
+
+  console.log(chunks);
+}
+
+main();
diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts
index b6719c266..afa9d9d56 100644
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -16,10 +16,12 @@ class TextSplit {
   }
 }
 
-type SplitRep = [text: string, numTokens: number];
+type SplitRep = { text: string; numTokens: number };
 
 /**
  * SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap.
+ *
+ * One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together.
  */
 export class SentenceSplitter {
   private chunkSize: number;
@@ -129,21 +131,21 @@ export class SentenceSplitter {
     sentenceSplits: string[],
     effectiveChunkSize: number
   ): SplitRep[] {
-    // Process entence splits
+    // Process sentence splits
     // Primarily check if any sentences exceed the chunk size. If they don't,
     // force split by tokenizer
     let newSplits: SplitRep[] = [];
     for (const split of sentenceSplits) {
       let splitTokens = this.tokenizer(split);
-      const split_len = splitTokens.length;
-      if (split_len <= effectiveChunkSize) {
-        newSplits.push([split, split_len]);
+      const splitLen = splitTokens.length;
+      if (splitLen <= effectiveChunkSize) {
+        newSplits.push({ text: split, numTokens: splitLen });
       } else {
-        for (let i = 0; i < split_len; i += effectiveChunkSize) {
+        for (let i = 0; i < splitLen; i += effectiveChunkSize) {
           const cur_split = this.tokenizerDecoder(
             splitTokens.slice(i, i + effectiveChunkSize)
           );
-          newSplits.push([cur_split, effectiveChunkSize]);
+          newSplits.push({ text: cur_split, numTokens: effectiveChunkSize });
         }
       }
     }
@@ -158,47 +160,59 @@ export class SentenceSplitter {
 
     // docs represents final list of text chunks
     let docs: TextSplit[] = [];
-    // curDocList represents the current list of sentence splits (that)
+    // curChunkSentences represents the current list of sentence splits (that)
     // will be merged into a chunk
-    let curDocList: string[] = [];
-    let bufferTokens = 0;
-    let curDocTokens = 0;
-    // curDocBuffer represents the current document buffer
-    let curDocBuffer: SplitRep[] = [];
+    let curChunkSentences: SplitRep[] = [];
+    let curChunkTokens = 0;
 
     for (let i = 0; i < newSentenceSplits.length; i++) {
-      // update buffer
-      curDocBuffer.push(newSentenceSplits[i]);
-      bufferTokens += newSentenceSplits[i][1] + 1;
-
-      while (bufferTokens > this.chunkOverlap) {
-        // remove first element from curDocBuffer
-        let first_element = curDocBuffer.shift();
-        if (first_element == undefined) {
-          throw new Error("first_element is undefined");
-        }
-        bufferTokens -= first_element[1];
-        bufferTokens -= 1;
-      }
-
       // if adding newSentenceSplits[i] to curDocBuffer would exceed effectiveChunkSize,
       // then we need to add the current curDocBuffer to docs
-      if (curDocTokens + newSentenceSplits[i][1] > effectiveChunkSize) {
+      if (
+        curChunkTokens + newSentenceSplits[i].numTokens >
+        effectiveChunkSize
+      ) {
         // push curent doc list to docs
-        docs.push(new TextSplit(curDocList.join(" ").trim()));
-        // reset docs list with buffer
-        curDocTokens = 0;
-        curDocList = [];
-        for (let j = 0; j < curDocBuffer.length; j++) {
-          curDocList.push(curDocBuffer[j][0]);
-          curDocTokens += curDocBuffer[j][1] + 1;
+        docs.push(
+          new TextSplit(
+            curChunkSentences
+              .map((sentence) => sentence.text)
+              .join(" ")
+              .trim()
+          )
+        );
+
+        const lastChunkSentences = curChunkSentences;
+
+        // reset docs list
+        curChunkTokens = 0;
+        curChunkSentences = [];
+
+        // add the last sentences from the last chunk until we've hit the overlap
+        // do it in reverse order
+        for (let j = lastChunkSentences.length - 1; j >= 0; j--) {
+          if (
+            curChunkTokens + lastChunkSentences[j].numTokens >
+            this.chunkOverlap
+          ) {
+            break;
+          }
+          curChunkSentences.unshift(lastChunkSentences[j]);
+          curChunkTokens += lastChunkSentences[j].numTokens + 1;
         }
       }
 
-      curDocList.push(newSentenceSplits[i][0]);
-      curDocTokens += newSentenceSplits[i][1] + 1;
+      curChunkSentences.push(newSentenceSplits[i]);
+      curChunkTokens += newSentenceSplits[i].numTokens + 1;
     }
-    docs.push(new TextSplit(curDocList.join(" ").trim()));
+    docs.push(
+      new TextSplit(
+        curChunkSentences
+          .map((sentence) => sentence.text)
+          .join(" ")
+          .trim()
+      )
+    );
     return docs;
   }
 
-- 
GitLab