From 90027a7b4412314853a1f9561fcadedbe8e9a996 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Tue, 27 Feb 2024 13:44:04 +0700 Subject: [PATCH] fix: enable split long sentence by default (#568) Co-authored-by: Marcus Schiesser <mail@marcusschiesser.de> --- .changeset/afraid-toys-fetch.md | 5 ++++ examples/longText.ts | 26 +++++++++++++++++++ .../core/src/nodeParsers/SimpleNodeParser.ts | 2 ++ 3 files changed, 33 insertions(+) create mode 100644 .changeset/afraid-toys-fetch.md create mode 100644 examples/longText.ts diff --git a/.changeset/afraid-toys-fetch.md b/.changeset/afraid-toys-fetch.md new file mode 100644 index 000000000..45208f82c --- /dev/null +++ b/.changeset/afraid-toys-fetch.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Add splitLongSentences option to SimpleNodeParser diff --git a/examples/longText.ts b/examples/longText.ts new file mode 100644 index 000000000..9766d96b5 --- /dev/null +++ b/examples/longText.ts @@ -0,0 +1,26 @@ +import { + Document, + SimpleNodeParser, + VectorStoreIndex, + serviceContextFromDefaults, +} from "llamaindex"; + +export const STORAGE_DIR = "./data"; + +(async () => { + // create a service context whose node parser splits sentences longer than chunkSize + const serviceContext = serviceContextFromDefaults({ + nodeParser: new SimpleNodeParser({ + chunkSize: 512, + chunkOverlap: 20, + splitLongSentences: true, + }), + }); + + // generate a document with a very long sentence (9000 words long) + const longSentence = "is ".repeat(9000) + "."; + const document = new Document({ text: longSentence, id_: "1" }); + await VectorStoreIndex.fromDocuments([document], { + serviceContext, + }); +})(); diff --git a/packages/core/src/nodeParsers/SimpleNodeParser.ts b/packages/core/src/nodeParsers/SimpleNodeParser.ts index 4fd7240e5..6a554219e 100644 --- a/packages/core/src/nodeParsers/SimpleNodeParser.ts +++ b/packages/core/src/nodeParsers/SimpleNodeParser.ts @@ -27,12 +27,14 @@ export class 
SimpleNodeParser implements NodeParser { includePrevNextRel?: boolean; chunkSize?: number; chunkOverlap?: number; + splitLongSentences?: boolean; }) { this.textSplitter = init?.textSplitter ?? new SentenceSplitter({ chunkSize: init?.chunkSize ?? DEFAULT_CHUNK_SIZE, chunkOverlap: init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP, + splitLongSentences: init?.splitLongSentences ?? false, }); this.includeMetadata = init?.includeMetadata ?? true; this.includePrevNextRel = init?.includePrevNextRel ?? true; -- GitLab