From 2d558c3963f7c1eea17890ea7eb05faa16e2da54 Mon Sep 17 00:00:00 2001 From: Yi Ding <yi.s.ding@gmail.com> Date: Wed, 23 Aug 2023 23:05:38 -0700 Subject: [PATCH] stop splitting sentences by default --- packages/core/src/TextSplitter.ts | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index 4dec73b62..9ecbb83a0 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -62,7 +62,7 @@ export class SentenceSplitter { private tokenizerDecoder: any; private paragraphSeparator: string; private chunkingTokenizerFn: (text: string) => RegExpMatchArray | null; - // private _callback_manager: any; + private splitLongSentences: boolean; constructor(options?: { chunkSize?: number; @@ -71,6 +71,7 @@ export class SentenceSplitter { tokenizerDecoder?: any; paragraphSeparator?: string; chunkingTokenizerFn?: (text: string) => RegExpMatchArray | null; + splitLongSentences?: boolean; }) { const { chunkSize = DEFAULT_CHUNK_SIZE, @@ -79,6 +80,7 @@ export class SentenceSplitter { tokenizerDecoder = null, paragraphSeparator = unixParagraphSeparator, chunkingTokenizerFn = undefined, + splitLongSentences = false, } = options ?? {}; if (chunkOverlap > chunkSize) { @@ -96,6 +98,7 @@ export class SentenceSplitter { this.paragraphSeparator = paragraphSeparator; this.chunkingTokenizerFn = chunkingTokenizerFn ?? englishSentenceTokenizer; + this.splitLongSentences = splitLongSentences; } private getEffectiveChunkSize(extraInfoStr?: string): number { @@ -159,13 +162,28 @@ export class SentenceSplitter { return splits; } + /** + * Splits sentences into chunks if necessary. + * + * This isn't great behavior because it can split down the middle of a + * word or in non-English split down the middle of a Unicode codepoint + * so the splitting is turned off by default. If you need it, please + * set the splitLongSentences option to true. 
+ * @param sentenceSplits the sentences to process + * @param effectiveChunkSize the chunk size that sentences are checked against when splitLongSentences is enabled + * @returns one entry per resulting split, holding its text and token count + */ private processSentenceSplits( sentenceSplits: string[], effectiveChunkSize: number, ): SplitRep[] { - // Process sentence splits - // Primarily check if any sentences exceed the chunk size. If they do, - // force split by tokenizer + if (!this.splitLongSentences) { + return sentenceSplits.map((split) => ({ + text: split, + numTokens: this.tokenizer(split).length, + })); + } + let newSplits: SplitRep[] = []; for (const split of sentenceSplits) { let splitTokens = this.tokenizer(split); -- GitLab