From e62d01e693757864ac30902559a4cde92f5cfed6 Mon Sep 17 00:00:00 2001
From: Jerry Liu <jerryjliu98@gmail.com>
Date: Sun, 18 Jun 2023 01:04:37 -0700
Subject: [PATCH] cr

---
 packages/core/src/TextSplitter.ts            | 282 +++++++++++--------
 packages/core/src/tests/TextSplitter.test.ts |   0
 2 files changed, 169 insertions(+), 113 deletions(-)
 create mode 100644 packages/core/src/tests/TextSplitter.test.ts

diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts
index ee86d1e80..c80e454c8 100644
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -2,21 +2,25 @@
 import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "./constants";
 
-class TokenTextSplitter {
+
+class SentenceSplitter {
   private _separator: string;
   private _chunk_size: number;
   private _chunk_overlap: number;
   private tokenizer: any;
   private _backup_separators: string[];
-  private callback_manager: any;
+  // private _callback_manager: any;
 
   constructor(
     separator: string = " ",
     chunk_size: number = DEFAULT_CHUNK_SIZE,
     chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
     tokenizer: any = null,
-    backup_separators: string[] = ["\n"]
-    // callback_manager: any = null
+    backup_separators: string[] = ["\n"],
+    paragraph_separator: string | undefined = "\n\n\n",
+    chunking_tokenizer_fn: any = undefined,
+    secondary_chunking_regex: string = "[^,.;。]+[,.;。]?",
+    // callback_manager: any = undefined
   ) {
     if (chunk_overlap > chunk_size) {
       throw new Error(
@@ -26,121 +30,173 @@ class TokenTextSplitter {
     this._separator = separator;
     this._chunk_size = chunk_size;
     this._chunk_overlap = chunk_overlap;
-    this.tokenizer = tokenizer || globals_helper.tokenizer;
+    this.tokenizer = tokenizer;
     this._backup_separators = backup_separators;
-    // this.callback_manager = callback_manager || new CallbackManager([]);
-  }
+    // this._callback_manager = callback_manager || new CallbackManager([]);
 
-  private _reduceChunkSize(
-    start_idx: number,
-    cur_idx: number,
-    splits: string[]
-  ): number {
-    let current_doc_total = this.tokenizer(
-      splits.slice(start_idx, cur_idx).join(this._separator)
-    ).length;
-    while (current_doc_total > this._chunk_size) {
-      const percent_to_reduce =
-        (current_doc_total - this._chunk_size) / current_doc_total;
-      const num_to_reduce =
-        parseInt(percent_to_reduce.toString()) * (cur_idx - start_idx) + 1;
-      cur_idx -= num_to_reduce;
-      current_doc_total = this.tokenizer(
-        splits.slice(start_idx, cur_idx).join(this._separator)
-      ).length;
-    }
-    return cur_idx;
-  }
+    if (chunking_tokenizer_fn == undefined) {
+      // use default tokenizer that splits by sentences
+      const winkNLP = require("wink-nlp");
+      // Load "its" helper to extract item properties.
+      const its = require("wink-nlp/src/its.js");
+      // Load English language model (light version).
+      const model = require("wink-eng-lite-model");
+      // Instantiate winkNLP.
+      const nlp = winkNLP(model);
 
-  _preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
-    const new_splits: Array<string> = [];
-    for (const split of splits) {
-      const num_cur_tokens = tokenizer(split).length;
-      if (num_cur_tokens <= chunk_size) {
-        new_splits.push(split);
-      } else {
-        let cur_splits: Array<string> = [split];
-        if (backup_separators) {
-          for (const sep of backup_separators) {
-            if (split.includes(sep)) {
-              cur_splits = split.split(sep);
-              break;
-            }
-          }
-        } else {
-          cur_splits = [split];
-        }
-
-        const cur_splits2: Array<string> = [];
-        for (const cur_split of cur_splits) {
-          const num_cur_tokens = tokenizer(cur_split).length;
-          if (num_cur_tokens <= chunk_size) {
-            cur_splits2.push(cur_split);
-          } else {
-            // split cur_split according to chunk size of the token numbers
-            const cur_split_chunks: Array<string> = [];
-            let end_idx = cur_split.length;
-            while (tokenizer(cur_split.slice(0, end_idx)).length > chunk_size) {
-              for (let i = 1; i < end_idx; i++) {
-                const tmp_split = cur_split.slice(0, end_idx - i);
-                if (tokenizer(tmp_split).length <= chunk_size) {
-                  cur_split_chunks.push(tmp_split);
-                  cur_splits2.push(cur_split.slice(end_idx - i, end_idx));
-                  end_idx = cur_split.length;
-                  break;
-                }
-              }
-            }
-            cur_split_chunks.push(cur_split);
-            cur_splits2.push(...cur_split_chunks);
-          }
-        }
-        new_splits.push(...cur_splits2);
-      }
+      // Input text
+      const text =
+        "AI Inc. is focussing on AI. It is based in the U.S.A. It was started on 06.12.2007.";
+      // Read text
+      const doc = nlp.readDoc(text);
+      // Extract sentences from the data
+      const sentences = doc.sentences().out();
     }
-    return new_splits;
-  }
+  }
 
-  _postprocessSplits(docs: TextSplit[]): TextSplit[] {
-    const new_docs: TextSplit[] = [];
-    for (const doc of docs) {
-      if (doc.text_chunk.replace(" ", "") == "") {
-        continue;
-      }
-      new_docs.push(doc);
-    }
-    return new_docs;
-  }
+}
 
-  splitText(text: string, extra_info_str?: string): string[] {
-    const text_splits = this.splitTextWithOverlaps(text);
-    const chunks = text_splits.map((text_split) => text_split.text_chunk);
-    return chunks;
-  }
-
-  splitTextWithOverlaps(text: string) {}
-
-  truncateText(text: string, separator: string, chunk_size: number): string {
-    if (text == "") {
-      return "";
-    }
-    // First we naively split the large input into a bunch of smaller ones.
-    let splits: string[] = text.split(separator);
-    splits = preprocessSplits(splits, chunk_size);
-
-    let start_idx = 0;
-    let cur_idx = 0;
-    let cur_total = 0;
-    while (cur_idx < splits.length) {
-      let cur_token = splits[cur_idx];
-      let num_cur_tokens = Math.max(tokenizer(cur_token).length, 1);
-      if (cur_total + num_cur_tokens > chunk_size) {
-        cur_idx = reduce_chunk_size(start_idx, cur_idx, splits);
-        break;
-      }
-      cur_total += num_cur_tokens;
-      cur_idx += 1;
-    }
-    return splits.slice(start_idx, cur_idx).join(separator);
-  }
-}
+
+// class TokenTextSplitter {
+//   private _separator: string;
+//   private _chunk_size: number;
+//   private _chunk_overlap: number;
+//   private tokenizer: any;
+//   private _backup_separators: string[];
+//   private callback_manager: any;
+
+//   constructor(
+//     separator: string = " ",
+//     chunk_size: number = DEFAULT_CHUNK_SIZE,
+//     chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
+//     tokenizer: any = null,
+//     backup_separators: string[] = ["\n"]
+//     // callback_manager: any = null
+//   ) {
+//     if (chunk_overlap > chunk_size) {
+//       throw new Error(
+//         `Got a larger chunk overlap (${chunk_overlap}) than chunk size (${chunk_size}), should be smaller.`
+//       );
+//     }
+//     this._separator = separator;
+//     this._chunk_size = chunk_size;
+//     this._chunk_overlap = chunk_overlap;
+//     this.tokenizer = tokenizer || globals_helper.tokenizer;
+//     this._backup_separators = backup_separators;
+//     // this.callback_manager = callback_manager || new CallbackManager([]);
+//   }
+
+//   private _reduceChunkSize(
+//     start_idx: number,
+//     cur_idx: number,
+//     splits: string[]
+//   ): number {
+//     let current_doc_total = this.tokenizer(
+//       splits.slice(start_idx, cur_idx).join(this._separator)
+//     ).length;
+//     while (current_doc_total > this._chunk_size) {
+//       const percent_to_reduce =
+//         (current_doc_total - this._chunk_size) / current_doc_total;
+//       const num_to_reduce =
+//         parseInt(percent_to_reduce.toString()) * (cur_idx - start_idx) + 1;
+//       cur_idx -= num_to_reduce;
+//       current_doc_total = this.tokenizer(
+//         splits.slice(start_idx, cur_idx).join(this._separator)
+//       ).length;
+//     }
+//     return cur_idx;
+//   }
+
+//   _preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
+//     const new_splits: Array<string> = [];
+//     for (const split of splits) {
+//       const num_cur_tokens = tokenizer(split).length;
+//       if (num_cur_tokens <= chunk_size) {
+//         new_splits.push(split);
+//       } else {
+//         let cur_splits: Array<string> = [split];
+//         if (backup_separators) {
+//           for (const sep of backup_separators) {
+//             if (split.includes(sep)) {
+//               cur_splits = split.split(sep);
+//               break;
+//             }
+//           }
+//         } else {
+//           cur_splits = [split];
+//         }
+
+//         const cur_splits2: Array<string> = [];
+//         for (const cur_split of cur_splits) {
+//           const num_cur_tokens = tokenizer(cur_split).length;
+//           if (num_cur_tokens <= chunk_size) {
+//             cur_splits2.push(cur_split);
+//           } else {
+//             // split cur_split according to chunk size of the token numbers
+//             const cur_split_chunks: Array<string> = [];
+//             let end_idx = cur_split.length;
+//             while (tokenizer(cur_split.slice(0, end_idx)).length > chunk_size) {
+//               for (let i = 1; i < end_idx; i++) {
+//                 const tmp_split = cur_split.slice(0, end_idx - i);
+//                 if (tokenizer(tmp_split).length <= chunk_size) {
+//                   cur_split_chunks.push(tmp_split);
+//                   cur_splits2.push(cur_split.slice(end_idx - i, end_idx));
+//                   end_idx = cur_split.length;
+//                   break;
+//                 }
+//               }
+//             }
+//             cur_split_chunks.push(cur_split);
+//             cur_splits2.push(...cur_split_chunks);
+//           }
+//         }
+//         new_splits.push(...cur_splits2);
+//       }
+//     }
+//     return new_splits;
+//   }
+
+//   _postprocessSplits(docs: TextSplit[]): TextSplit[] {
+//     const new_docs: TextSplit[] = [];
+//     for (const doc of docs) {
+//       if (doc.text_chunk.replace(" ", "") == "") {
+//         continue;
+//       }
+//       new_docs.push(doc);
+//     }
+//     return new_docs;
+//   }
+
+//   splitText(text: string, extra_info_str?: string): string[] {
+//     const text_splits = this.splitTextWithOverlaps(text);
+//     const chunks = text_splits.map((text_split) => text_split.text_chunk);
+//     return chunks;
+//   }
+
+//   splitTextWithOverlaps(text: string) {}
+
+//   truncateText(text: string, separator: string, chunk_size: number): string {
+//     if (text == "") {
+//       return "";
+//     }
+//     // First we naively split the large input into a bunch of smaller ones.
+//     let splits: string[] = text.split(separator);
+//     splits = preprocessSplits(splits, chunk_size);
+
+//     let start_idx = 0;
+//     let cur_idx = 0;
+//     let cur_total = 0;
+//     while (cur_idx < splits.length) {
+//       let cur_token = splits[cur_idx];
+//       let num_cur_tokens = Math.max(tokenizer(cur_token).length, 1);
+//       if (cur_total + num_cur_tokens > chunk_size) {
+//         cur_idx = reduce_chunk_size(start_idx, cur_idx, splits);
+//         break;
+//       }
+//       cur_total += num_cur_tokens;
+//       cur_idx += 1;
+//     }
+//     return splits.slice(start_idx, cur_idx).join(separator);
+//   }
+// }
diff --git a/packages/core/src/tests/TextSplitter.test.ts b/packages/core/src/tests/TextSplitter.test.ts
new file mode 100644
index 000000000..e69de29bb
-- 
GitLab
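
Review note on the new constructor: as committed, the `if (chunking_tokenizer_fn == undefined)` branch builds the wink-nlp pipeline but only runs it on a hard-coded sample string and discards the resulting `sentences`; nothing is ever assigned to `chunking_tokenizer_fn`, and the `its` helper is required but never used. A minimal sketch of what this branch is presumably building toward, using only the winkNLP/readDoc/sentences/out calls the patch itself exercises; the name `defaultChunkingTokenizerFn` and the final assignment are assumptions, not part of this commit:

    // Sketch only, not part of the patch.
    const winkNLP = require("wink-nlp");
    // Same light English model the constructor loads.
    const model = require("wink-eng-lite-model");
    const nlp = winkNLP(model);

    // Default chunking tokenizer: map a string to its sentences.
    const defaultChunkingTokenizerFn = (text: string): string[] =>
      nlp.readDoc(text).sentences().out();

    // The constructor branch would then presumably end with:
    //   chunking_tokenizer_fn = defaultChunkingTokenizerFn;

Run against the sample string from the patch, this should yield three sentences, keeping "AI Inc.", "U.S.A.", and "06.12.2007" intact rather than breaking on their periods, which is the motivation for a sentence model over a naive split on ".". The empty packages/core/src/tests/TextSplitter.test.ts added by this commit is a natural place to assert that behavior once the default tokenizer is wired up.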