diff --git a/.changeset/afraid-toys-fetch.md b/.changeset/afraid-toys-fetch.md new file mode 100644 index 0000000000000000000000000000000000000000..45208f82c681b7b4bdf692dcbfbaad89ef072e74 --- /dev/null +++ b/.changeset/afraid-toys-fetch.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Add splitLongSentences option to SimpleNodeParser diff --git a/examples/longText.ts b/examples/longText.ts new file mode 100644 index 0000000000000000000000000000000000000000..9766d96b5f4767c1e2fb64ab3f7b419378e56ef0 --- /dev/null +++ b/examples/longText.ts @@ -0,0 +1,26 @@ +import { + Document, + SimpleNodeParser, + VectorStoreIndex, + serviceContextFromDefaults, +} from "llamaindex"; + +export const STORAGE_DIR = "./data"; + +(async () => { + // create a service context whose node parser splits sentences longer than chunkSize + const serviceContext = serviceContextFromDefaults({ + nodeParser: new SimpleNodeParser({ + chunkSize: 512, + chunkOverlap: 20, + splitLongSentences: true, + }), + }); + + // generate a document with a very long sentence (9000 words long) + const longSentence = "is ".repeat(9000) + "."; + const document = new Document({ text: longSentence, id_: "1" }); + await VectorStoreIndex.fromDocuments([document], { + serviceContext, + }); +})(); diff --git a/packages/core/src/nodeParsers/SimpleNodeParser.ts b/packages/core/src/nodeParsers/SimpleNodeParser.ts index 4fd7240e51cb28e749c9afb5d4e30f5f6fbe9659..6a554219eb22a72d21da8d57e97842508d7ac1db 100644 --- a/packages/core/src/nodeParsers/SimpleNodeParser.ts +++ b/packages/core/src/nodeParsers/SimpleNodeParser.ts @@ -27,12 +27,14 @@ export class SimpleNodeParser implements NodeParser { includePrevNextRel?: boolean; chunkSize?: number; chunkOverlap?: number; + splitLongSentences?: boolean; }) { this.textSplitter = init?.textSplitter ?? new SentenceSplitter({ chunkSize: init?.chunkSize ?? DEFAULT_CHUNK_SIZE, chunkOverlap: init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP, + splitLongSentences: init?.splitLongSentences ?? 
false, }); this.includeMetadata = init?.includeMetadata ?? true; this.includePrevNextRel = init?.includePrevNextRel ?? true;