diff --git a/.changeset/thin-pens-deliver.md b/.changeset/thin-pens-deliver.md new file mode 100644 index 0000000000000000000000000000000000000000..3b1270d61c794f33abd0d0dbf90aef11a46e34b9 --- /dev/null +++ b/.changeset/thin-pens-deliver.md @@ -0,0 +1,6 @@ +--- +"@llamaindex/core": patch +"@llamaindex/core-tests": patch +--- + +fix: handling errors in splitBySentenceTokenizer diff --git a/packages/core/src/node-parser/utils.ts b/packages/core/src/node-parser/utils.ts index 1b9410c2dccaf6673ee5bb96e59d072e4b6c2ee8..74351b6e7c1ed095a5d46dce74a8bd2b1842f40a 100644 --- a/packages/core/src/node-parser/utils.ts +++ b/packages/core/src/node-parser/utils.ts @@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => { } const tokenizer = sentenceTokenizer; return (text: string) => { - return tokenizer.tokenize(text); + try { + return tokenizer.tokenize(text); + } catch { + return [text]; + } }; }; diff --git a/packages/core/tests/node-parser/text-splitter.test.ts b/packages/core/tests/node-parser/text-splitter.test.ts index 7618e781ab10d63b94afa84831ecb24dbf606c97..1ccf9a93ab8772cef1d1550c467254f2291e90ad 100644 --- a/packages/core/tests/node-parser/text-splitter.test.ts +++ b/packages/core/tests/node-parser/text-splitter.test.ts @@ -1,7 +1,10 @@ -import { SentenceSplitter } from "@llamaindex/core/node-parser"; +import { + SentenceSplitter, + splitBySentenceTokenizer, +} from "@llamaindex/core/node-parser"; import { describe, expect, test } from "vitest"; -describe("SentenceSplitter", () => { +describe("sentence splitter", () => { test("initializes", () => { const sentenceSplitter = new SentenceSplitter(); expect(sentenceSplitter).toBeDefined(); @@ -105,4 +108,11 @@ describe("SentenceSplitter", () => { "å› ä¸ºä»–ç…§äº†äººç±»ï¼Œè¿žæˆ‘éƒ½åœ¨å†…ã€‚", ]); }); + + test("issue 1087 - edge case when input with brackets", () => { + const text = + "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS)."; + const split = splitBySentenceTokenizer(); + expect(split(text)).toEqual([text]); + }); });