From 0452af91cce7de015de75c3fb0b6cc0bd7901f02 Mon Sep 17 00:00:00 2001 From: Marcus Schiesser <mail@marcusschiesser.de> Date: Tue, 30 Jul 2024 18:36:58 +0200 Subject: [PATCH] fix: handling errors in splitBySentenceTokenizer (#1087) Co-authored-by: Alex Yang <himself65@outlook.com> --- .changeset/thin-pens-deliver.md | 6 ++++++ packages/core/src/node-parser/utils.ts | 6 +++++- .../core/tests/node-parser/text-splitter.test.ts | 14 ++++++++++++-- 3 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 .changeset/thin-pens-deliver.md diff --git a/.changeset/thin-pens-deliver.md b/.changeset/thin-pens-deliver.md new file mode 100644 index 000000000..3b1270d61 --- /dev/null +++ b/.changeset/thin-pens-deliver.md @@ -0,0 +1,6 @@ +--- +"@llamaindex/core": patch +"@llamaindex/core-tests": patch +--- + +fix: handling errors in splitBySentenceTokenizer diff --git a/packages/core/src/node-parser/utils.ts b/packages/core/src/node-parser/utils.ts index 1b9410c2d..74351b6e7 100644 --- a/packages/core/src/node-parser/utils.ts +++ b/packages/core/src/node-parser/utils.ts @@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => { } const tokenizer = sentenceTokenizer; return (text: string) => { - return tokenizer.tokenize(text); + try { + return tokenizer.tokenize(text); + } catch { + return [text]; + } }; }; diff --git a/packages/core/tests/node-parser/text-splitter.test.ts b/packages/core/tests/node-parser/text-splitter.test.ts index 7618e781a..1ccf9a93a 100644 --- a/packages/core/tests/node-parser/text-splitter.test.ts +++ b/packages/core/tests/node-parser/text-splitter.test.ts @@ -1,7 +1,10 @@ -import { SentenceSplitter } from "@llamaindex/core/node-parser"; +import { + SentenceSplitter, + splitBySentenceTokenizer, +} from "@llamaindex/core/node-parser"; import { describe, expect, test } from "vitest"; -describe("SentenceSplitter", () => { +describe("sentence splitter", () => { test("initializes", () => { const sentenceSplitter = new 
SentenceSplitter(); expect(sentenceSplitter).toBeDefined(); @@ -105,4 +108,11 @@ describe("SentenceSplitter", () => { "因为他照了人类，连我都在内。", ]); }); + + test("issue 1087 - edge case when input with brackets", () => { + const text = + "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS)."; + const split = splitBySentenceTokenizer(); + expect(split(text)).toEqual([text]); + }); }); -- GitLab