Skip to content
Snippets Groups Projects
Unverified Commit 0452af91 authored by Marcus Schiesser's avatar Marcus Schiesser Committed by GitHub
Browse files

fix: handling errors in splitBySentenceTokenizer (#1087)


Co-authored-by: default avatarAlex Yang <himself65@outlook.com>
parent da5cfc42
No related branches found
No related tags found
No related merge requests found
---
"@llamaindex/core": patch
"@llamaindex/core-tests": patch
---
fix: handling errors in splitBySentenceTokenizer
......@@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => {
}
const tokenizer = sentenceTokenizer;
return (text: string) => {
return tokenizer.tokenize(text);
try {
return tokenizer.tokenize(text);
} catch {
return [text];
}
};
};
......
import { SentenceSplitter } from "@llamaindex/core/node-parser";
import {
SentenceSplitter,
splitBySentenceTokenizer,
} from "@llamaindex/core/node-parser";
import { describe, expect, test } from "vitest";
describe("SentenceSplitter", () => {
describe("sentence splitter", () => {
test("initializes", () => {
const sentenceSplitter = new SentenceSplitter();
expect(sentenceSplitter).toBeDefined();
......@@ -105,4 +108,11 @@ describe("SentenceSplitter", () => {
"因为他照了人类,连我都在内。",
]);
});
test("issue 1087 - edge case when input with brackets", () => {
const text =
"A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";
const split = splitBySentenceTokenizer();
expect(split(text)).toEqual([text]);
});
});
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment