Skip to content
Snippets Groups Projects
Commit e62d01e6 authored by Jerry Liu's avatar Jerry Liu
Browse files

cr

parent 1b941d2c
No related branches found
No related tags found
No related merge requests found
......@@ -2,21 +2,25 @@
import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "./constants";
class TokenTextSplitter {
class SentenceSplitter {
private _separator: string;
private _chunk_size: number;
private _chunk_overlap: number;
private tokenizer: any;
private _backup_separators: string[];
private callback_manager: any;
// private _callback_manager: any;
constructor(
separator: string = " ",
chunk_size: number = DEFAULT_CHUNK_SIZE,
chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
tokenizer: any = null,
backup_separators: string[] = ["\n"]
// callback_manager: any = null
backup_separators: string[] = ["\n"],
paragraph_separator: string | undefined = "\n\n\n",
chunking_tokenizer_fn: any = undefined,
secondary_chunking_regex: string = "[^,.;。]+[,.;。]?",
// callback_manager: any = undefined
) {
if (chunk_overlap > chunk_size) {
throw new Error(
......@@ -26,121 +30,173 @@ class TokenTextSplitter {
this._separator = separator;
this._chunk_size = chunk_size;
this._chunk_overlap = chunk_overlap;
this.tokenizer = tokenizer || globals_helper.tokenizer;
this.tokenizer = tokenizer;
this._backup_separators = backup_separators;
// this.callback_manager = callback_manager || new CallbackManager([]);
}
// this._callback_manager = callback_manager || new CallbackManager([]);
/**
 * Pull the end of a candidate chunk backwards until the chunk fits.
 *
 * The chunk is `splits[start_idx, cur_idx)` joined with `this._separator`;
 * its size is the `length` of what `this.tokenizer` returns for that string.
 * While that size exceeds `this._chunk_size`, the end index is moved back by
 * a step proportional to the overshoot (always at least one split).
 *
 * @param start_idx Index of the first split in the chunk (inclusive).
 * @param cur_idx Index one past the last split in the chunk (exclusive).
 * @param splits All splits of the text being chunked.
 * @returns The reduced exclusive end index. When `cur_idx >= start_idx` on
 *   entry, the result is never smaller than `start_idx`.
 */
private _reduceChunkSize(
  start_idx: number,
  cur_idx: number,
  splits: string[]
): number {
  const countTokens = (end_idx: number): number =>
    this.tokenizer(splits.slice(start_idx, end_idx).join(this._separator))
      .length;

  let end_idx = cur_idx;
  let current_doc_total = countTokens(end_idx);
  // The `end_idx > start_idx` guard stops the loop once the chunk is empty,
  // so a tokenizer that reports tokens for "" cannot spin forever.
  while (current_doc_total > this._chunk_size && end_idx > start_idx) {
    const percent_to_reduce =
      (current_doc_total - this._chunk_size) / current_doc_total;
    // Floor the product, not the fraction. Truncating the fraction first
    // (as `parseInt(fraction.toString())` did) always yields 0 for ordinary
    // values, and yields the leading digit for values printed in exponent
    // form such as "5e-7", which over-shrinks the chunk.
    const num_to_reduce =
      Math.floor(percent_to_reduce * (end_idx - start_idx)) + 1;
    end_idx -= num_to_reduce;
    current_doc_total = countTokens(end_idx);
  }
  return end_idx;
}
if (chunking_tokenizer_fn == undefined) {
// use default tokenizer that splits by sentences
const winkNLP = require("wink-nlp");
// Load "its" helper to extract item properties.
const its = require( 'wink-nlp/src/its.js' );
// Load english language model — light version.
const model = require( 'wink-eng-lite-model' );
// Instantiate winkNLP.
const nlp = winkNLP( model );
/**
 * Break up any split whose token count exceeds `chunk_size`.
 *
 * Splits that already fit are copied through unchanged. An oversized split
 * is first divided on the first backup separator it contains; any piece that
 * is still oversized is then cut by character position.
 *
 * NOTE(review): `tokenizer` and `backup_separators` are referenced as bare
 * identifiers here, not as `this.tokenizer` / `this._backup_separators`.
 * Unless they are declared somewhere outside this view, they will not
 * resolve — confirm.
 *
 * @param splits Pieces of text to check.
 * @param chunk_size Maximum number of tokens allowed per piece.
 * @returns A new array of pieces; the input array is not modified.
 */
_preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
const new_splits: Array<string> = [];
for (const split of splits) {
const num_cur_tokens = tokenizer(split).length;
if (num_cur_tokens <= chunk_size) {
// Already small enough: keep as is.
new_splits.push(split);
} else {
let cur_splits: Array<string> = [split];
if (backup_separators) {
// Use the first backup separator that actually occurs in this split.
for (const sep of backup_separators) {
if (split.includes(sep)) {
cur_splits = split.split(sep);
break;
}
}
} else {
cur_splits = [split];
}
const cur_splits2: Array<string> = [];
for (const cur_split of cur_splits) {
const num_cur_tokens = tokenizer(cur_split).length;
if (num_cur_tokens <= chunk_size) {
cur_splits2.push(cur_split);
} else {
// split cur_split according to chunk size of the token numbers
const cur_split_chunks: Array<string> = [];
let end_idx = cur_split.length;
// NOTE(review): after a fitting prefix is found, end_idx is reset to
// cur_split.length, so the loop condition is evaluated on the same string
// again. With a deterministic tokenizer this loop appears not to
// terminate — confirm and fix.
while (tokenizer(cur_split.slice(0, end_idx)).length > chunk_size) {
// Shorten the prefix one character at a time until it fits.
for (let i = 1; i < end_idx; i++) {
const tmp_split = cur_split.slice(0, end_idx - i);
if (tokenizer(tmp_split).length <= chunk_size) {
cur_split_chunks.push(tmp_split);
// The remainder after the fitting prefix is pushed without a size check.
cur_splits2.push(cur_split.slice(end_idx - i, end_idx));
end_idx = cur_split.length;
break;
}
}
}
// NOTE(review): this pushes the whole oversized cur_split in addition to
// any prefixes collected above — likely unintended; confirm.
cur_split_chunks.push(cur_split);
cur_splits2.push(...cur_split_chunks);
}
}
new_splits.push(...cur_splits2);
}
// NOTE(review): the statements below read a hard-coded sample sentence
// through `nlp`, which is not declared in this method, and `sentences` is
// not used afterwards in this block. They look like lines from another
// part of the diff interleaved here — confirm before relying on them.
// Input text
const text = 'AI Inc. is focussing on AI. It is based in the U.S.A. It was started on 06.12.2007.';
// Read text
const doc = nlp.readDoc( text );
// Extract sentences from the data
const sentences = doc.sentences().out();
}
return new_splits;
}
/**
 * Drop splits whose text is empty or consists only of space characters.
 *
 * @param docs Splits to filter; each is read through its `text_chunk` field.
 * @returns A new array holding the remaining splits in their original order.
 */
_postprocessSplits(docs: TextSplit[]): TextSplit[] {
  // A string pattern in String.prototype.replace removes only the FIRST
  // match, so "  " (two spaces) used to survive the filter. The global
  // regex strips every space before the emptiness check.
  return docs.filter((doc) => doc.text_chunk.replace(/ /g, "") !== "");
}
/**
 * Split `text` and return only the text of each resulting chunk.
 *
 * Delegates to `splitTextWithOverlaps` and reads `text_chunk` from every
 * element it returns.
 *
 * @param text The text to split.
 * @param extra_info_str Accepted but not used by this method.
 * @returns The `text_chunk` of each split, in order.
 */
splitText(text: string, extra_info_str?: string): string[] {
  return this.splitTextWithOverlaps(text).map((piece) => piece.text_chunk);
}
}
// Stub: the body is empty, so a call returns undefined.
// NOTE(review): splitText calls `.map` on this method's result and reads
// `text_chunk` from each element, so this presumably should return an array
// of objects with a `text_chunk` field — TODO implement and confirm.
splitTextWithOverlaps(text: string) {}
/**
 * Truncate `text` to the leading run of splits that fits in `chunk_size`
 * tokens.
 *
 * The text is split on `separator`, oversized pieces are broken up by
 * `_preprocessSplits`, and pieces are then accumulated from the start until
 * adding the next one would exceed `chunk_size`. Each piece counts as at
 * least one token.
 *
 * NOTE(review): `_reduceChunkSize` measures against `this._separator` and
 * `this._chunk_size`, not the `separator` / `chunk_size` arguments passed
 * here. The two agree only when callers pass the instance values — confirm.
 *
 * @param text The text to truncate.
 * @param separator String used both to split `text` and to re-join the kept
 *   pieces.
 * @param chunk_size Maximum number of tokens to keep.
 * @returns The kept pieces joined with `separator`, or "" for empty input.
 */
truncateText(text: string, separator: string, chunk_size: number): string {
  if (text === "") {
    return "";
  }
  // First we naively split the large input into a bunch of smaller ones.
  // The helpers are members of this class, so they must be reached through
  // `this`; the bare names used previously do not match any sibling member.
  const splits: string[] = this._preprocessSplits(
    text.split(separator),
    chunk_size
  );

  const start_idx = 0;
  let cur_idx = 0;
  let cur_total = 0;
  while (cur_idx < splits.length) {
    // Count every piece as at least one token so empty pieces still advance
    // the running total.
    const num_cur_tokens = Math.max(this.tokenizer(splits[cur_idx]).length, 1);
    if (cur_total + num_cur_tokens > chunk_size) {
      // Per-piece counts can differ from the count of the joined text, so
      // re-check the joined chunk and shrink it if needed.
      cur_idx = this._reduceChunkSize(start_idx, cur_idx, splits);
      break;
    }
    cur_total += num_cur_tokens;
    cur_idx += 1;
  }
  return splits.slice(start_idx, cur_idx).join(separator);
}
}
// class TokenTextSplitter {
// private _separator: string;
// private _chunk_size: number;
// private _chunk_overlap: number;
// private tokenizer: any;
// private _backup_separators: string[];
// private callback_manager: any;
// constructor(
// separator: string = " ",
// chunk_size: number = DEFAULT_CHUNK_SIZE,
// chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
// tokenizer: any = null,
// backup_separators: string[] = ["\n"]
// // callback_manager: any = null
// ) {
// if (chunk_overlap > chunk_size) {
// throw new Error(
// `Got a larger chunk overlap (${chunk_overlap}) than chunk size (${chunk_size}), should be smaller.`
// );
// }
// this._separator = separator;
// this._chunk_size = chunk_size;
// this._chunk_overlap = chunk_overlap;
// this.tokenizer = tokenizer || globals_helper.tokenizer;
// this._backup_separators = backup_separators;
// // this.callback_manager = callback_manager || new CallbackManager([]);
// }
// private _reduceChunkSize(
// start_idx: number,
// cur_idx: number,
// splits: string[]
// ): number {
// let current_doc_total = this.tokenizer(
// splits.slice(start_idx, cur_idx).join(this._separator)
// ).length;
// while (current_doc_total > this._chunk_size) {
// const percent_to_reduce =
// (current_doc_total - this._chunk_size) / current_doc_total;
// const num_to_reduce =
// parseInt(percent_to_reduce.toString()) * (cur_idx - start_idx) + 1;
// cur_idx -= num_to_reduce;
// current_doc_total = this.tokenizer(
// splits.slice(start_idx, cur_idx).join(this._separator)
// ).length;
// }
// return cur_idx;
// }
// _preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
// const new_splits: Array<string> = [];
// for (const split of splits) {
// const num_cur_tokens = tokenizer(split).length;
// if (num_cur_tokens <= chunk_size) {
// new_splits.push(split);
// } else {
// let cur_splits: Array<string> = [split];
// if (backup_separators) {
// for (const sep of backup_separators) {
// if (split.includes(sep)) {
// cur_splits = split.split(sep);
// break;
// }
// }
// } else {
// cur_splits = [split];
// }
// const cur_splits2: Array<string> = [];
// for (const cur_split of cur_splits) {
// const num_cur_tokens = tokenizer(cur_split).length;
// if (num_cur_tokens <= chunk_size) {
// cur_splits2.push(cur_split);
// } else {
// // split cur_split according to chunk size of the token numbers
// const cur_split_chunks: Array<string> = [];
// let end_idx = cur_split.length;
// while (tokenizer(cur_split.slice(0, end_idx)).length > chunk_size) {
// for (let i = 1; i < end_idx; i++) {
// const tmp_split = cur_split.slice(0, end_idx - i);
// if (tokenizer(tmp_split).length <= chunk_size) {
// cur_split_chunks.push(tmp_split);
// cur_splits2.push(cur_split.slice(end_idx - i, end_idx));
// end_idx = cur_split.length;
// break;
// }
// }
// }
// cur_split_chunks.push(cur_split);
// cur_splits2.push(...cur_split_chunks);
// }
// }
// new_splits.push(...cur_splits2);
// }
// }
// return new_splits;
// }
// _postprocessSplits(docs: TextSplit[]): TextSplit[] {
// const new_docs: TextSplit[] = [];
// for (const doc of docs) {
// if (doc.text_chunk.replace(" ", "") == "") {
// continue;
// }
// new_docs.push(doc);
// }
// return new_docs;
// }
// splitText(text: string, extra_info_str?: string): string[] {
// const text_splits = this.splitTextWithOverlaps(text);
// const chunks = text_splits.map((text_split) => text_split.text_chunk);
// return chunks;
// }
// splitTextWithOverlaps(text: string) {}
// truncateText(text: string, separator: string, chunk_size: number): string {
// if (text == "") {
// return "";
// }
// // First we naively split the large input into a bunch of smaller ones.
// let splits: string[] = text.split(separator);
// splits = preprocessSplits(splits, chunk_size);
// let start_idx = 0;
// let cur_idx = 0;
// let cur_total = 0;
// while (cur_idx < splits.length) {
// let cur_token = splits[cur_idx];
// let num_cur_tokens = Math.max(tokenizer(cur_token).length, 1);
// if (cur_total + num_cur_tokens > chunk_size) {
// cur_idx = reduce_chunk_size(start_idx, cur_idx, splits);
// break;
// }
// cur_total += num_cur_tokens;
// cur_idx += 1;
// }
// return splits.slice(start_idx, cur_idx).join(separator);
// }
// }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment