diff --git a/packages/core/src/NodeParser.ts b/packages/core/src/NodeParser.ts
index 4860e66258ceafbb17dc263a2ef5faaa61bd78c1..f3d064ba5738702f4a11ba12483d6eb8122735d1 100644
--- a/packages/core/src/NodeParser.ts
+++ b/packages/core/src/NodeParser.ts
@@ -10,7 +10,7 @@ import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants";
  */
 export function getTextSplitsFromDocument(
   document: Document,
-  textSplitter: SentenceSplitter
+  textSplitter: SentenceSplitter,
 ) {
   const text = document.getText();
   const splits = textSplitter.splitText(text);
@@ -30,7 +30,7 @@ export function getNodesFromDocument(
   document: Document,
   textSplitter: SentenceSplitter,
   includeMetadata: boolean = true,
-  includePrevNextRel: boolean = true
+  includePrevNextRel: boolean = true,
 ) {
   let nodes: TextNode[] = [];
 
@@ -100,10 +100,10 @@ export class SimpleNodeParser implements NodeParser {
   }) {
     this.textSplitter =
       init?.textSplitter ??
-      new SentenceSplitter(
-        init?.chunkSize ?? DEFAULT_CHUNK_SIZE,
-        init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP
-      );
+      new SentenceSplitter({
+        chunkSize: init?.chunkSize ?? DEFAULT_CHUNK_SIZE,
+        chunkOverlap: init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP,
+      });
     this.includeMetadata = init?.includeMetadata ?? true;
     this.includePrevNextRel = init?.includePrevNextRel ?? true;
   }
diff --git a/packages/core/src/PromptHelper.ts b/packages/core/src/PromptHelper.ts
index 0f01988134697bf70688efa517e328f006704481..5ea6ab8a378ed48c142748d3537682ec7ac8feaf 100644
--- a/packages/core/src/PromptHelper.ts
+++ b/packages/core/src/PromptHelper.ts
@@ -2,9 +2,9 @@ import { globalsHelper } from "./GlobalsHelper";
 import { SimplePrompt } from "./Prompt";
 import { SentenceSplitter } from "./TextSplitter";
 import {
+  DEFAULT_CHUNK_OVERLAP_RATIO,
   DEFAULT_CONTEXT_WINDOW,
   DEFAULT_NUM_OUTPUTS,
-  DEFAULT_CHUNK_OVERLAP_RATIO,
   DEFAULT_PADDING,
 } from "./constants";
 
@@ -43,7 +43,7 @@ export class PromptHelper {
     chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO,
     chunkSizeLimit?: number,
     tokenizer?: (text: string) => number[],
-    separator = " "
+    separator = " ",
   ) {
     this.contextWindow = contextWindow;
     this.numOutput = numOutput;
@@ -76,7 +76,7 @@ export class PromptHelper {
   private getAvailableChunkSize(
     prompt: SimplePrompt,
     numChunks = 1,
-    padding = 5
+    padding = 5,
   ) {
     const availableContextSize = this.getAvailableContextSize(prompt);
 
@@ -99,14 +99,14 @@ export class PromptHelper {
   getTextSplitterGivenPrompt(
     prompt: SimplePrompt,
     numChunks = 1,
-    padding = DEFAULT_PADDING
+    padding = DEFAULT_PADDING,
   ) {
     const chunkSize = this.getAvailableChunkSize(prompt, numChunks, padding);
     if (chunkSize === 0) {
       throw new Error("Got 0 as available chunk size");
     }
     const chunkOverlap = this.chunkOverlapRatio * chunkSize;
-    const textSplitter = new SentenceSplitter(chunkSize, chunkOverlap);
+    const textSplitter = new SentenceSplitter({ chunkSize, chunkOverlap });
     return textSplitter;
   }
 
@@ -120,7 +120,7 @@ export class PromptHelper {
   repack(
     prompt: SimplePrompt,
     textChunks: string[],
-    padding = DEFAULT_PADDING
+    padding = DEFAULT_PADDING,
   ) {
     const textSplitter = this.getTextSplitterGivenPrompt(prompt, 1, padding);
     const combinedStr = textChunks.join("\n\n");
diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts
index afa9d9d56cdbe74877f97901f2a9f16266da8b74..4dec73b62372098a5ee562cb5d2c5ec5d035ccff 100644
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -1,7 +1,7 @@
 // GitHub translated
 
 import { globalsHelper } from "./GlobalsHelper";
-import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "./constants";
+import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants";
 
 class TextSplit {
   textChunk: string;
@@ -9,7 +9,7 @@ class TextSplit {
 
   constructor(
     textChunk: string,
-    numCharOverlap: number | undefined = undefined
+    numCharOverlap: number | undefined = undefined,
   ) {
     this.textChunk = textChunk;
     this.numCharOverlap = numCharOverlap;
@@ -18,6 +18,38 @@ class TextSplit {
 
 type SplitRep = { text: string; numTokens: number };
 
+/**
+ * Tokenizes sentences. Suitable for English and most European languages.
+ * @param text
+ * @returns
+ */
+export const englishSentenceTokenizer = (text: string) => {
+  // The first part is a lazy match for any character.
+  return text.match(/.+?[.?!]+[\])'"`’”]*(?:\s|$)|.+/g);
+};
+
+/**
+ * Tokenizes sentences. Suitable for Chinese, Japanese, and Korean.
+ * @param text
+ * @returns
+ */
+export const cjkSentenceTokenizer = (text: string) => {
+  // Accepts english style sentence endings with space and
+  // CJK style sentence endings with no space.
+  return text.match(
+    /.+?[.?!]+[\])'"`’”]*(?:\s|$)|.+?[。?!]+[\])'"`’”]*(?:\s|$)?|.+/g,
+  );
+};
+
+export const unixLineSeparator = "\n";
+export const windowsLineSeparator = "\r\n";
+export const unixParagraphSeparator = unixLineSeparator + unixLineSeparator;
+export const windowsParagraphSeparator =
+  windowsLineSeparator + windowsLineSeparator;
+
+// In theory there's also Mac style \r only, but it's pre-OSX and I don't think
+// many documents will use it.
+
 /**
  * SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap.
  *
@@ -29,46 +61,41 @@ export class SentenceSplitter {
   private tokenizer: any;
   private tokenizerDecoder: any;
   private paragraphSeparator: string;
-  private chunkingTokenizerFn: any;
+  private chunkingTokenizerFn: (text: string) => RegExpMatchArray | null;
   // private _callback_manager: any;
 
-  constructor(
-    chunkSize: number = DEFAULT_CHUNK_SIZE,
-    chunkOverlap: number = DEFAULT_CHUNK_OVERLAP,
-    tokenizer: any = null,
-    tokenizerDecoder: any = null,
-    paragraphSeparator: string = "\n\n\n",
-    chunkingTokenizerFn: any = undefined
-    // callback_manager: any = undefined
-  ) {
+  constructor(options?: {
+    chunkSize?: number;
+    chunkOverlap?: number;
+    tokenizer?: any;
+    tokenizerDecoder?: any;
+    paragraphSeparator?: string;
+    chunkingTokenizerFn?: (text: string) => RegExpMatchArray | null;
+  }) {
+    const {
+      chunkSize = DEFAULT_CHUNK_SIZE,
+      chunkOverlap = DEFAULT_CHUNK_OVERLAP,
+      tokenizer = null,
+      tokenizerDecoder = null,
+      paragraphSeparator = unixParagraphSeparator,
+      chunkingTokenizerFn = undefined,
+    } = options ?? {};
+
     if (chunkOverlap > chunkSize) {
       throw new Error(
-        `Got a larger chunk overlap (${chunkOverlap}) than chunk size (${chunkSize}), should be smaller.`
+        `Got a larger chunk overlap (${chunkOverlap}) than chunk size (${chunkSize}), should be smaller.`,
       );
     }
     this.chunkSize = chunkSize;
     this.chunkOverlap = chunkOverlap;
 
     // this._callback_manager = callback_manager || new CallbackManager([]);
-    if (chunkingTokenizerFn == undefined) {
-      // define a callable mapping a string to a list of strings
-      const defaultChunkingTokenizerFn = (text: string) => {
-        var result = text.match(/[^.?!]+[.!?]+[\])'"`’”]*|.+/g);
-        return result;
-      };
-
-      chunkingTokenizerFn = defaultChunkingTokenizerFn;
-    }
-
-    if (tokenizer == undefined || tokenizerDecoder == undefined) {
-      tokenizer = globalsHelper.tokenizer();
-      tokenizerDecoder = globalsHelper.tokenizerDecoder();
-    }
-    this.tokenizer = tokenizer;
-    this.tokenizerDecoder = tokenizerDecoder;
+    this.tokenizer = tokenizer ?? globalsHelper.tokenizer();
+    this.tokenizerDecoder =
+      tokenizerDecoder ?? globalsHelper.tokenizerDecoder();
 
     this.paragraphSeparator = paragraphSeparator;
-    this.chunkingTokenizerFn = chunkingTokenizerFn;
+    this.chunkingTokenizerFn = chunkingTokenizerFn ?? englishSentenceTokenizer;
   }
 
   private getEffectiveChunkSize(extraInfoStr?: string): number {
@@ -79,7 +106,7 @@ export class SentenceSplitter {
       effectiveChunkSize = this.chunkSize - numExtraTokens;
       if (effectiveChunkSize <= 0) {
         throw new Error(
-          "Effective chunk size is non positive after considering extra_info"
+          "Effective chunk size is non positive after considering extra_info",
         );
       }
     } else {
@@ -119,7 +146,12 @@ export class SentenceSplitter {
     // Next we split the text using the chunk tokenizer fn/
     let splits = [];
     for (const parText of paragraphSplits) {
-      let sentenceSplits = this.chunkingTokenizerFn(parText);
+      const sentenceSplits = this.chunkingTokenizerFn(parText);
+
+      if (!sentenceSplits) {
+        continue;
+      }
+
       for (const sentence_split of sentenceSplits) {
         splits.push(sentence_split.trim());
       }
@@ -129,10 +161,10 @@ export class SentenceSplitter {
 
   private processSentenceSplits(
     sentenceSplits: string[],
-    effectiveChunkSize: number
+    effectiveChunkSize: number,
   ): SplitRep[] {
     // Process sentence splits
-    // Primarily check if any sentences exceed the chunk size. If they don't,
+    // Primarily check if any sentences exceed the chunk size. If they do,
     // force split by tokenizer
     let newSplits: SplitRep[] = [];
     for (const split of sentenceSplits) {
@@ -143,7 +175,7 @@ export class SentenceSplitter {
       } else {
         for (let i = 0; i < splitLen; i += effectiveChunkSize) {
           const cur_split = this.tokenizerDecoder(
-            splitTokens.slice(i, i + effectiveChunkSize)
+            splitTokens.slice(i, i + effectiveChunkSize),
           );
           newSplits.push({ text: cur_split, numTokens: effectiveChunkSize });
         }
@@ -154,7 +186,7 @@ export class SentenceSplitter {
 
   combineTextSplits(
     newSentenceSplits: SplitRep[],
-    effectiveChunkSize: number
+    effectiveChunkSize: number,
   ): TextSplit[] {
     // go through sentence splits, combine to chunks that are within the chunk size
 
@@ -178,8 +210,8 @@ export class SentenceSplitter {
           curChunkSentences
             .map((sentence) => sentence.text)
             .join(" ")
-            .trim()
-          )
+            .trim(),
+          ),
         );
 
         const lastChunkSentences = curChunkSentences;
@@ -210,8 +242,8 @@ export class SentenceSplitter {
       curChunkSentences
         .map((sentence) => sentence.text)
         .join(" ")
-        .trim()
-      )
+        .trim(),
+      ),
     );
     return docs;
   }
@@ -232,13 +264,13 @@ export class SentenceSplitter {
     // force split by tokenizer
     let newSentenceSplits = this.processSentenceSplits(
      sentenceSplits,
-      effectiveChunkSize
+      effectiveChunkSize,
    );
 
     // combine sentence splits into chunks of text that can then be returned
     let combinedTextSplits = this.combineTextSplits(
       newSentenceSplits,
-      effectiveChunkSize
+      effectiveChunkSize,
     );
 
     return combinedTextSplits;
diff --git a/packages/core/src/tests/TextSplitter.test.ts b/packages/core/src/tests/TextSplitter.test.ts
index 3f577b543d8a741016e3fa97724164a107c689f9..bfaa5381dfed21851029f938b48e54eb070ba379 100644
--- a/packages/core/src/tests/TextSplitter.test.ts
+++ b/packages/core/src/tests/TextSplitter.test.ts
@@ -1,4 +1,4 @@
-import { SentenceSplitter } from "../TextSplitter";
+import { SentenceSplitter, cjkSentenceTokenizer } from "../TextSplitter";
 
 describe("SentenceSplitter", () => {
   test("initializes", () => {
@@ -7,17 +7,11 @@ describe("SentenceSplitter", () => {
   });
 
   test("splits paragraphs w/o effective chunk size", () => {
-    const sentenceSplitter = new SentenceSplitter(
-      undefined,
-      undefined,
-      undefined,
-      undefined,
-      "\n"
-    );
+    const sentenceSplitter = new SentenceSplitter({});
     // generate the same line as above but correct syntax errors
     let splits = sentenceSplitter.getParagraphSplits(
-      "This is a paragraph.\nThis is another paragraph.",
-      undefined
+      "This is a paragraph.\n\nThis is another paragraph.",
+      undefined,
     );
     expect(splits).toEqual([
       "This is a paragraph.",
@@ -26,17 +20,13 @@ describe("SentenceSplitter", () => {
   });
 
   test("splits paragraphs with effective chunk size", () => {
-    const sentenceSplitter = new SentenceSplitter(
-      undefined,
-      undefined,
-      undefined,
-      undefined,
-      "\n"
-    );
+    const sentenceSplitter = new SentenceSplitter({
+      paragraphSeparator: "\n",
+    });
     // generate the same line as above but correct syntax errors
     let splits = sentenceSplitter.getParagraphSplits(
       "This is a paragraph.\nThis is another paragraph.",
-      1000
+      1000,
     );
     expect(splits).toEqual([
       "This is a paragraph.\nThis is another paragraph.",
@@ -47,7 +37,7 @@ describe("SentenceSplitter", () => {
     const sentenceSplitter = new SentenceSplitter();
     let splits = sentenceSplitter.getSentenceSplits(
       "This is a sentence. This is another sentence.",
-      undefined
+      undefined,
     );
     expect(splits).toEqual([
       "This is a sentence.",
@@ -56,19 +46,48 @@ describe("SentenceSplitter", () => {
   });
 
   test("overall split text", () => {
-    let sentenceSplitter = new SentenceSplitter(5, 0);
+    let sentenceSplitter = new SentenceSplitter({
+      chunkSize: 5,
+      chunkOverlap: 0,
+    });
     let splits = sentenceSplitter.splitText(
-      "This is a sentence. This is another sentence."
+      "This is a sentence. This is another sentence.",
     );
     expect(splits).toEqual([
       "This is a sentence.",
       "This is another sentence.",
     ]);
 
-    sentenceSplitter = new SentenceSplitter(1000);
+    sentenceSplitter = new SentenceSplitter({ chunkSize: 1000 });
     splits = sentenceSplitter.splitText(
-      "This is a sentence. This is another sentence."
+      "This is a sentence. This is another sentence.",
    );
     expect(splits).toEqual(["This is a sentence. This is another sentence."]);
   });
+
+  test("doesn't split decimals", () => {
+    let sentenceSplitter = new SentenceSplitter({
+      chunkSize: 5,
+      chunkOverlap: 0,
+    });
+    let splits = sentenceSplitter.splitText(
+      "This is a sentence. This is another sentence. 1.0",
+    );
+    expect(splits).toEqual([
+      "This is a sentence.",
+      "This is another sentence.",
+      "1.0",
+    ]);
+  });
+
+  test("splits cjk", () => {
+    let sentenceSplitter = new SentenceSplitter({
+      chunkSize: 12,
+      chunkOverlap: 0,
+      chunkingTokenizerFn: cjkSentenceTokenizer,
+    });
+
+    const splits = sentenceSplitter.splitText("这是一个句子!这是另一个句子。");
+    expect(splits).toEqual(["这是一个句子!", "这是另一个句子。"]);
+  });
 });
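
Usage note (not part of the patch): a minimal sketch of the options-object constructor and the CJK tokenizer introduced above, importing from the same module the tests use; the chunk sizes below are illustrative values, not the package defaults.

import { SentenceSplitter, cjkSentenceTokenizer } from "./TextSplitter";

// English/European text: the default chunkingTokenizerFn (englishSentenceTokenizer)
// ends sentences on ". ? !" followed by optional closing quotes/brackets and whitespace.
const splitter = new SentenceSplitter({ chunkSize: 512, chunkOverlap: 20 });
const chunks: string[] = splitter.splitText(
  "This is a sentence. This is another sentence.",
);

// Chinese/Japanese/Korean text: opt into full-width sentence endings (。?!)
// by swapping in cjkSentenceTokenizer, as the "splits cjk" test does.
const cjkSplitter = new SentenceSplitter({
  chunkSize: 12,
  chunkOverlap: 0,
  chunkingTokenizerFn: cjkSentenceTokenizer,
});
const cjkChunks = cjkSplitter.splitText("这是一个句子!这是另一个句子。");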