From 0452af91cce7de015de75c3fb0b6cc0bd7901f02 Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Tue, 30 Jul 2024 18:36:58 +0200
Subject: [PATCH] fix: handling errors in splitBySentenceTokenizer (#1087)

Co-authored-by: Alex Yang <himself65@outlook.com>
---
 .changeset/thin-pens-deliver.md                    |  6 ++++++
 packages/core/src/node-parser/utils.ts             |  6 +++++-
 .../core/tests/node-parser/text-splitter.test.ts   | 14 ++++++++++++--
 3 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 .changeset/thin-pens-deliver.md

diff --git a/.changeset/thin-pens-deliver.md b/.changeset/thin-pens-deliver.md
new file mode 100644
index 000000000..3b1270d61
--- /dev/null
+++ b/.changeset/thin-pens-deliver.md
@@ -0,0 +1,6 @@
+---
+"@llamaindex/core": patch
+"@llamaindex/core-tests": patch
+---
+
+fix: splitBySentenceTokenizer now returns the input text as a single chunk when the sentence tokenizer throws
diff --git a/packages/core/src/node-parser/utils.ts b/packages/core/src/node-parser/utils.ts
index 1b9410c2d..74351b6e7 100644
--- a/packages/core/src/node-parser/utils.ts
+++ b/packages/core/src/node-parser/utils.ts
@@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => {
   }
   const tokenizer = sentenceTokenizer;
   return (text: string) => {
-    return tokenizer.tokenize(text);
+    try {
+      return tokenizer.tokenize(text);
+    } catch {
+      return [text];
+    }
   };
 };
 
diff --git a/packages/core/tests/node-parser/text-splitter.test.ts b/packages/core/tests/node-parser/text-splitter.test.ts
index 7618e781a..1ccf9a93a 100644
--- a/packages/core/tests/node-parser/text-splitter.test.ts
+++ b/packages/core/tests/node-parser/text-splitter.test.ts
@@ -1,7 +1,10 @@
-import { SentenceSplitter } from "@llamaindex/core/node-parser";
+import {
+  SentenceSplitter,
+  splitBySentenceTokenizer,
+} from "@llamaindex/core/node-parser";
 import { describe, expect, test } from "vitest";
 
-describe("SentenceSplitter", () => {
+describe("sentence splitter", () => {
   test("initializes", () => {
     const sentenceSplitter = new SentenceSplitter();
     expect(sentenceSplitter).toBeDefined();
@@ -105,4 +108,11 @@ describe("SentenceSplitter", () => {
       "因为他照了人类,连我都在内。",
     ]);
   });
+
+  test("issue 1087 - edge case when input with brackets", () => {
+    const text =
+      "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";
+    const split = splitBySentenceTokenizer();
+    expect(split(text)).toEqual([text]);
+  });
 });
-- 
GitLab