From 90027a7b4412314853a1f9561fcadedbe8e9a996 Mon Sep 17 00:00:00 2001
From: Thuc Pham <51660321+thucpn@users.noreply.github.com>
Date: Tue, 27 Feb 2024 13:44:04 +0700
Subject: [PATCH] fix: enable split long sentence by default (#568)

Co-authored-by: Marcus Schiesser <mail@marcusschiesser.de>
---
 .changeset/afraid-toys-fetch.md               |  5 ++++
 examples/longText.ts                          | 26 +++++++++++++++++++
 .../core/src/nodeParsers/SimpleNodeParser.ts  |  2 ++
 3 files changed, 33 insertions(+)
 create mode 100644 .changeset/afraid-toys-fetch.md
 create mode 100644 examples/longText.ts

diff --git a/.changeset/afraid-toys-fetch.md b/.changeset/afraid-toys-fetch.md
new file mode 100644
index 000000000..45208f82c
--- /dev/null
+++ b/.changeset/afraid-toys-fetch.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Add splitLongSentences option to SimpleNodeParser
diff --git a/examples/longText.ts b/examples/longText.ts
new file mode 100644
index 000000000..9766d96b5
--- /dev/null
+++ b/examples/longText.ts
@@ -0,0 +1,26 @@
+import {
+  Document,
+  SimpleNodeParser,
+  VectorStoreIndex,
+  serviceContextFromDefaults,
+} from "llamaindex";
+
+export const STORAGE_DIR = "./data";
+
+(async () => {
+  // create a service context whose node parser splits sentences longer than chunkSize
+  const serviceContext = serviceContextFromDefaults({
+    nodeParser: new SimpleNodeParser({
+      chunkSize: 512,
+      chunkOverlap: 20,
+      splitLongSentences: true,
+    }),
+  });
+
+  // generate a document with a very long sentence (9000 words long)
+  const longSentence = "is ".repeat(9000) + ".";
+  const document = new Document({ text: longSentence, id_: "1" });
+  await VectorStoreIndex.fromDocuments([document], {
+    serviceContext,
+  });
+})();
diff --git a/packages/core/src/nodeParsers/SimpleNodeParser.ts b/packages/core/src/nodeParsers/SimpleNodeParser.ts
index 4fd7240e5..6a554219e 100644
--- a/packages/core/src/nodeParsers/SimpleNodeParser.ts
+++ b/packages/core/src/nodeParsers/SimpleNodeParser.ts
@@ -27,12 +27,14 @@ export class SimpleNodeParser implements NodeParser {
     includePrevNextRel?: boolean;
     chunkSize?: number;
     chunkOverlap?: number;
+    splitLongSentences?: boolean;
   }) {
     this.textSplitter =
       init?.textSplitter ??
       new SentenceSplitter({
         chunkSize: init?.chunkSize ?? DEFAULT_CHUNK_SIZE,
         chunkOverlap: init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP,
+        splitLongSentences: init?.splitLongSentences ?? false,
       });
     this.includeMetadata = init?.includeMetadata ?? true;
     this.includePrevNextRel = init?.includePrevNextRel ?? true;
-- 
GitLab