diff --git a/package.json b/package.json
index 5bf808928028a6b3065e1ed206c16e13b796e777..e177cc22af4e303b477911fd422ae619ee7a2135 100644
--- a/package.json
+++ b/package.json
@@ -16,7 +16,9 @@
     "prettier": "^2.5.1",
     "prettier-plugin-tailwindcss": "^0.3.0",
     "ts-jest": "^29.1.0",
-    "turbo": "latest"
+    "turbo": "latest",
+    "wink-nlp": "latest",
+    "tiktoken-node": "latest"
   },
   "packageManager": "pnpm@7.15.0",
   "name": "llamascript"
diff --git a/packages/core/package.json b/packages/core/package.json
index fe9910ac054a566b0d6ddffaf372ec97495539ad..6df77fdcd9a96187d27c0baffa920f2181072dbe 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -5,6 +5,9 @@
     "js-tiktoken": "^1.0.7",
     "openai": "^3.3.0"
   },
+  "devDependencies": {
+    "@types/node": "^18"
+  },
   "main": "src/index.ts",
   "types": "src/index.ts",
   "scripts": {
diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts
index ee86d1e80cb76ac693404d446eebbc2a90133c1d..ba188df5b11a353aa2d127361e0e40fe8557d7e0 100644
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -2,145 +2,241 @@
 import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "./constants";
 
-class TokenTextSplitter {
-  private _separator: string;
-  private _chunk_size: number;
-  private _chunk_overlap: number;
+class TextSplit {
+  textChunk: string;
+  numCharOverlap: number | undefined;
+
+  constructor(
+    textChunk: string,
+    numCharOverlap: number | undefined = undefined
+  ) {
+    this.textChunk = textChunk;
+    this.numCharOverlap = numCharOverlap;
+  }
+}
+
+type SplitRep = [text: string, numTokens: number];
+
+export class SentenceSplitter {
+  private chunkSize: number;
+  private chunkOverlap: number;
   private tokenizer: any;
-  private _backup_separators: string[];
-  private callback_manager: any;
+  private tokenizerDecoder: any;
+  private paragraphSeparator: string;
+  private chunkingTokenizerFn: any;
+  // private _callback_manager: any;
 
   constructor(
-    separator: string = " ",
-    chunk_size: number = DEFAULT_CHUNK_SIZE,
-    chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
+    chunkSize: number = DEFAULT_CHUNK_SIZE,
+    chunkOverlap: number = DEFAULT_CHUNK_OVERLAP,
     tokenizer: any = null,
-    backup_separators: string[] = ["\n"]
-    // callback_manager: any = null
+    tokenizerDecoder: any = null,
+    paragraphSeparator: string = "\n\n\n",
+    chunkingTokenizerFn: any = undefined
+    // callback_manager: any = undefined
   ) {
-    if (chunk_overlap > chunk_size) {
+    if (chunkOverlap > chunkSize) {
       throw new Error(
-        `Got a larger chunk overlap (${chunk_overlap}) than chunk size (${chunk_size}), should be smaller.`
+        `Got a larger chunk overlap (${chunkOverlap}) than chunk size (${chunkSize}), should be smaller.`
       );
     }
-    this._separator = separator;
-    this._chunk_size = chunk_size;
-    this._chunk_overlap = chunk_overlap;
-    this.tokenizer = tokenizer || globals_helper.tokenizer;
-    this._backup_separators = backup_separators;
-    // this.callback_manager = callback_manager || new CallbackManager([]);
+    this.chunkSize = chunkSize;
+    this.chunkOverlap = chunkOverlap;
+    // this._callback_manager = callback_manager || new CallbackManager([]);
+
+    if (chunkingTokenizerFn == undefined) {
+      // define a callable mapping a string to a list of strings
+      const defaultChunkingTokenizerFn = (text: string) => {
+        var result = text.match(/[^.?!]+[.!?]+[\])'"`’”]*|.+/g);
+        return result;
+      };
+
+      chunkingTokenizerFn = defaultChunkingTokenizerFn;
+    }
+
+    if (tokenizer == undefined || tokenizerDecoder == undefined) {
+      const tiktoken = require("tiktoken-node");
+      let enc = tiktoken.getEncoding("gpt2");
+      const defaultTokenizer = (text: string) => {
+        return enc.encode(text);
+      };
+      const defaultTokenizerDecoder = (text: string) => {
+        return enc.decode(text);
+      };
+      tokenizer = defaultTokenizer;
+      tokenizerDecoder = defaultTokenizerDecoder;
+    }
+    this.tokenizer = tokenizer;
+    this.tokenizerDecoder = tokenizerDecoder;
+
+    this.paragraphSeparator = paragraphSeparator;
+    this.chunkingTokenizerFn = chunkingTokenizerFn;
   }
 
-  private _reduceChunkSize(
-    start_idx: number,
-    cur_idx: number,
-    splits: string[]
-  ): number {
-    let current_doc_total = this.tokenizer(
-      splits.slice(start_idx, cur_idx).join(this._separator)
-    ).length;
-    while (current_doc_total > this._chunk_size) {
-      const percent_to_reduce =
-        (current_doc_total - this._chunk_size) / current_doc_total;
-      const num_to_reduce =
-        parseInt(percent_to_reduce.toString()) * (cur_idx - start_idx) + 1;
-      cur_idx -= num_to_reduce;
-      current_doc_total = this.tokenizer(
-        splits.slice(start_idx, cur_idx).join(this._separator)
-      ).length;
+  private getEffectiveChunkSize(extraInfoStr?: string): number {
+    // get "effective" chunk size by removing the metadata
+    let effectiveChunkSize;
+    if (extraInfoStr != undefined) {
+      const numExtraTokens = this.tokenizer(`${extraInfoStr}\n\n`).length + 1;
+      effectiveChunkSize = this.chunkSize - numExtraTokens;
+      if (effectiveChunkSize <= 0) {
+        throw new Error(
+          "Effective chunk size is non-positive after considering extra_info"
+        );
+      }
+    } else {
+      effectiveChunkSize = this.chunkSize;
     }
-    return cur_idx;
+    return effectiveChunkSize;
   }
 
-  _preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
-    const new_splits: Array<string> = [];
-    for (const split of splits) {
-      const num_cur_tokens = tokenizer(split).length;
-      if (num_cur_tokens <= chunk_size) {
-        new_splits.push(split);
+  getParagraphSplits(text: string, effectiveChunkSize?: number): string[] {
+    // get paragraph splits
+    let paragraphSplits: string[] = text.split(this.paragraphSeparator);
+    let idx = 0;
+    if (effectiveChunkSize == undefined) {
+      return paragraphSplits;
+    }
+
+    // merge paragraphs that are too small
+    while (idx < paragraphSplits.length) {
+      if (
+        idx < paragraphSplits.length - 1 &&
+        paragraphSplits[idx].length < effectiveChunkSize
+      ) {
+        paragraphSplits[idx] = [
+          paragraphSplits[idx],
+          paragraphSplits[idx + 1],
+        ].join(this.paragraphSeparator);
+        paragraphSplits.splice(idx + 1, 1);
       } else {
-        let cur_splits: Array<string> = [split];
-        if (backup_separators) {
-          for (const sep of backup_separators) {
-            if (split.includes(sep)) {
-              cur_splits = split.split(sep);
-              break;
-            }
-          }
-        } else {
-          cur_splits = [split];
-        }
+        idx += 1;
+      }
+    }
+    return paragraphSplits;
+  }
 
-        const cur_splits2: Array<string> = [];
-        for (const cur_split of cur_splits) {
-          const num_cur_tokens = tokenizer(cur_split).length;
-          if (num_cur_tokens <= chunk_size) {
-            cur_splits2.push(cur_split);
-          } else {
-            // split cur_split according to chunk size of the token numbers
-            const cur_split_chunks: Array<string> = [];
-            let end_idx = cur_split.length;
-            while (tokenizer(cur_split.slice(0, end_idx)).length > chunk_size) {
-              for (let i = 1; i < end_idx; i++) {
-                const tmp_split = cur_split.slice(0, end_idx - i);
-                if (tokenizer(tmp_split).length <= chunk_size) {
-                  cur_split_chunks.push(tmp_split);
-                  cur_splits2.push(cur_split.slice(end_idx - i, end_idx));
-                  end_idx = cur_split.length;
-                  break;
-                }
-              }
-            }
-            cur_split_chunks.push(cur_split);
-            cur_splits2.push(...cur_split_chunks);
-          }
-        }
-        new_splits.push(...cur_splits2);
+  getSentenceSplits(text: string, effectiveChunkSize?: number): string[] {
+    let paragraphSplits = this.getParagraphSplits(text, effectiveChunkSize);
+    // Next we split the text using the chunking tokenizer fn
+    let splits = [];
+    for (const parText of paragraphSplits) {
+      let sentenceSplits = this.chunkingTokenizerFn(parText);
+      for (const sentence_split of sentenceSplits) {
+        splits.push(sentence_split.trim());
       }
     }
-    return new_splits;
+    return splits;
   }
 
-  _postprocessSplits(docs: TextSplit[]): TextSplit[] {
-    const new_docs: TextSplit[] = [];
-    for (const doc of docs) {
-      if (doc.text_chunk.replace(" ", "") == "") {
-        continue;
+  private processSentenceSplits(
+    sentenceSplits: string[],
+    effectiveChunkSize: number
+  ): SplitRep[] {
+    // Process sentence splits.
+    // Primarily check if any sentences exceed the chunk size. If they do,
+    // force split by tokenizer
+    let newSplits: SplitRep[] = [];
+    for (const split of sentenceSplits) {
+      let splitTokens = this.tokenizer(split);
+      const split_len = splitTokens.length;
+      if (split_len <= effectiveChunkSize) {
+        newSplits.push([split, split_len]);
+      } else {
+        for (let i = 0; i < split_len; i += effectiveChunkSize) {
+          const cur_split = this.tokenizerDecoder(
+            splitTokens.slice(i, i + effectiveChunkSize)
+          );
+          newSplits.push([cur_split, effectiveChunkSize]);
+        }
       }
-      new_docs.push(doc);
     }
-    return new_docs;
+    return newSplits;
   }
 
-  splitText(text: string, extra_info_str?: string): string[] {
-    const text_splits = this.splitTextWithOverlaps(text);
-    const chunks = text_splits.map((text_split) => text_split.text_chunk);
-    return chunks;
+  combineTextSplits(
+    newSentenceSplits: SplitRep[],
+    effectiveChunkSize: number
+  ): TextSplit[] {
+    // go through sentence splits, combine them into chunks that are within the chunk size
+
+    // docs represents the final list of text chunks
+    let docs: TextSplit[] = [];
+    // curDocList represents the current list of sentence splits that
+    // will be merged into a chunk
+    let curDocList: string[] = [];
+    let bufferTokens = 0;
+    let curDocTokens = 0;
+    // curDocBuffer represents the current document buffer
+    let curDocBuffer: SplitRep[] = [];
+
+    for (let i = 0; i < newSentenceSplits.length; i++) {
+      // update buffer
+      curDocBuffer.push(newSentenceSplits[i]);
+      bufferTokens += newSentenceSplits[i][1] + 1;
+
+      while (bufferTokens > this.chunkOverlap) {
+        // remove first element from curDocBuffer
+        let first_element = curDocBuffer.shift();
+        if (first_element == undefined) {
+          throw new Error("first_element is undefined");
+        }
+        bufferTokens -= first_element[1];
+        bufferTokens -= 1;
+      }
+
+      // if adding newSentenceSplits[i] to curDocBuffer would exceed effectiveChunkSize,
+      // then we need to add the current curDocBuffer to docs
+      if (curDocTokens + newSentenceSplits[i][1] > effectiveChunkSize) {
+        // push current doc list to docs
+        docs.push(new TextSplit(curDocList.join(" ").trim()));
+        // reset the doc list using the overlap buffer
+        curDocTokens = 0;
+        curDocList = [];
+        for (let j = 0; j < curDocBuffer.length; j++) {
+          curDocList.push(curDocBuffer[j][0]);
+          curDocTokens += curDocBuffer[j][1] + 1;
+        }
+      }
+
+      curDocList.push(newSentenceSplits[i][0]);
+      curDocTokens += newSentenceSplits[i][1] + 1;
+    }
+    docs.push(new TextSplit(curDocList.join(" ").trim()));
+    return docs;
   }
 
-  splitTextWithOverlaps(text: string) {}
+  splitTextWithOverlaps(text: string, extraInfoStr?: string): TextSplit[] {
+    // Split incoming text and return chunks with overlap size.
+    // Has a preference for complete sentences, phrases, and minimal overlap.
 
-  truncateText(text: string, separator: string, chunk_size: number): string {
+    // (callback manager is omitted here)
     if (text == "") {
-      return "";
-    }
-    // First we naively split the large input into a bunch of smaller ones.
-    let splits: string[] = text.split(separator);
-    splits = preprocessSplits(splits, chunk_size);
-
-    let start_idx = 0;
-    let cur_idx = 0;
-    let cur_total = 0;
-    while (cur_idx < splits.length) {
-      let cur_token = splits[cur_idx];
-      let num_cur_tokens = Math.max(tokenizer(cur_token).length, 1);
-      if (cur_total + num_cur_tokens > chunk_size) {
-        cur_idx = reduce_chunk_size(start_idx, cur_idx, splits);
-        break;
-      }
-      cur_total += num_cur_tokens;
-      cur_idx += 1;
+      return [];
     }
-    return splits.slice(start_idx, cur_idx).join(separator);
+
+    let effectiveChunkSize = this.getEffectiveChunkSize(extraInfoStr);
+    let sentenceSplits = this.getSentenceSplits(text, effectiveChunkSize);
+
+    // Check if any sentences exceed the chunk size. If they do,
+    // force split by tokenizer
+    let newSentenceSplits = this.processSentenceSplits(
+      sentenceSplits,
+      effectiveChunkSize
+    );
+
+    // combine sentence splits into chunks of text that can then be returned
+    let combinedTextSplits = this.combineTextSplits(
+      newSentenceSplits,
+      effectiveChunkSize
+    );
+
+    return combinedTextSplits;
+  }
+
+  splitText(text: string, extraInfoStr?: string): string[] {
+    const text_splits = this.splitTextWithOverlaps(text);
+    const chunks = text_splits.map((text_split) => text_split.textChunk);
+    return chunks;
   }
 }
diff --git a/packages/core/src/tests/TextSplitter.test.ts b/packages/core/src/tests/TextSplitter.test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..1cfd7abf614ffc4db55507b4052e677893dc7166
--- /dev/null
+++ b/packages/core/src/tests/TextSplitter.test.ts
@@ -0,0 +1,41 @@
+import { SentenceSplitter } from "../TextSplitter";
+
+describe("SentenceSplitter", () => {
+  test("initializes", () => {
+    const sentenceSplitter = new SentenceSplitter();
+    expect(sentenceSplitter).toBeDefined();
+  });
+
+  test("splits paragraphs w/o effective chunk size", () => {
+    const sentenceSplitter = new SentenceSplitter(
+      undefined, undefined, undefined, undefined, "\n"
+    );
+    let splits = sentenceSplitter.getParagraphSplits("This is a paragraph.\nThis is another paragraph.", undefined);
+    expect(splits).toEqual(["This is a paragraph.", "This is another paragraph."]);
+  });
+
+  test("splits paragraphs with effective chunk size", () => {
+    const sentenceSplitter = new SentenceSplitter(
+      undefined, undefined, undefined, undefined, "\n"
+    );
+    let splits = sentenceSplitter.getParagraphSplits("This is a paragraph.\nThis is another paragraph.", 1000);
+    expect(splits).toEqual(["This is a paragraph.\nThis is another paragraph."]);
+  });
+
+  test("splits sentences", () => {
+    const sentenceSplitter = new SentenceSplitter();
+    let splits = sentenceSplitter.getSentenceSplits("This is a sentence. This is another sentence.", undefined);
+    expect(splits).toEqual(["This is a sentence.", "This is another sentence."]);
+  });
+
+  test("overall split text", () => {
+    let sentenceSplitter = new SentenceSplitter(5, 0);
+    let splits = sentenceSplitter.splitText("This is a sentence. This is another sentence.");
+    expect(splits).toEqual(["This is a sentence.", "This is another sentence."]);
+
+    sentenceSplitter = new SentenceSplitter(1000);
+    splits = sentenceSplitter.splitText("This is a sentence. This is another sentence.");
+    expect(splits).toEqual(["This is a sentence. This is another sentence."]);
+  });
+
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 0dd717a424f7be485e041adf75188102f36f7b99..af9f950080dba1b67693cb936e6dd25694066d5b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -29,12 +29,18 @@ importers:
       prettier-plugin-tailwindcss:
         specifier: ^0.3.0
         version: 0.3.0(prettier@2.8.8)
+      tiktoken-node:
+        specifier: latest
+        version: 0.0.6
       ts-jest:
         specifier: ^29.1.0
         version: 29.1.0(@babel/core@7.22.5)(jest@29.5.0)(typescript@4.9.5)
       turbo:
         specifier: latest
         version: 1.10.3
+      wink-nlp:
+        specifier: latest
+        version: 1.14.1
 
   apps/docs:
     dependencies:
@@ -116,9 +122,6 @@ importers:
 
   packages/core:
     dependencies:
-      '@vespaiach/axios-fetch-adapter':
-        specifier: ^0.3.1
-        version: 0.3.1(axios@0.26.1)
       axios:
         specifier: ^0.26.1
         version: 0.26.1
@@ -128,6 +131,10 @@ importers:
       openai:
         specifier: ^3.3.0
         version: 3.3.0
+    devDependencies:
+      '@types/node':
+        specifier: ^18
+        version: 18.6.0
 
   packages/eslint-config-custom:
     dependencies:
@@ -1207,14 +1214,6 @@
       eslint-visitor-keys: 3.4.0
     dev: false
 
-  /@vespaiach/axios-fetch-adapter@0.3.1(axios@0.26.1):
-    resolution: {integrity: sha512-+1F52VWXmQHSRFSv4/H0wtnxfvjRMPK5531e880MIjypPdUSX6QZuoDgEVeCE1vjhzDdxCVX7rOqkub7StEUwQ==}
-    peerDependencies:
-      axios: '>=0.26.0'
-    dependencies:
-      axios: 0.26.1
-    dev: false
-
  /acorn-jsx@5.3.2(acorn@7.4.1):
    resolution: {integrity: sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==}
    peerDependencies:
@@ -4763,6 +4762,11 @@
     resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
     dev: true
 
+  /tiktoken-node@0.0.6:
+    resolution: {integrity: sha512-MiprfzPhoKhCflzl0Jyds0VKibAgUGHfJLvBCAXPpum6Lru6ZoKQGsl8lJP0B94LPpby2B2WveOB2tZVfEZQOQ==}
+    engines: {node: '>= 14'}
+    dev: true
+
   /title-case@2.1.1:
     resolution: {integrity: sha512-EkJoZ2O3zdCz3zJsYCsxyq2OC5hrxR9mfdd5I+w8h/tmFfeOxJ+vvkxsKxdmN0WtS9zLdHEgfgVOiMVgv+Po4Q==}
     dependencies:
@@ -5114,6 +5118,10 @@
     dependencies:
       isexe: 2.0.0
 
+  /wink-nlp@1.14.1:
+    resolution: {integrity: sha512-RIdUZI3ei3OB6OY5f3jNo74fmsfPV7cfwiJ2fvBM1xzGnnl2CjRJmwGwsO04n0xl28vDTtxj6AlhIb74XQLoqQ==}
+    dev: true
+
   /word-wrap@1.2.3:
     resolution: {integrity: sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==}
     engines: {node: '>=0.10.0'}
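
Below is a minimal usage sketch of the new SentenceSplitter for reviewers; it is illustrative only and not part of the patch. The relative import assumes a caller sitting next to packages/core/src/TextSplitter.ts, and the 1024/200 chunk values are example numbers counted in gpt2 tokens (the tiktoken-node default wired up above).

  import { SentenceSplitter } from "./TextSplitter";

  // chunkSize = 1024 tokens per chunk, chunkOverlap = 200 tokens carried between chunks (example values)
  const splitter = new SentenceSplitter(1024, 200);

  const text =
    "Sentence splitting prefers whole sentences. Oversized sentences are force-split by the tokenizer.";

  // plain string chunks
  const chunks: string[] = splitter.splitText(text);

  // TextSplit objects; extraInfoStr only shrinks the effective chunk size to
  // leave room for metadata, it is not prepended to the returned chunks
  const splits = splitter.splitTextWithOverlaps(text, "title: example");

  console.log(chunks);
  console.log(splits.map((s) => s.textChunk));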