// -----------------------------------------------------------------------------
// Sources introduced by this patch for `@llamaindex/core`, reformatted.
//
// Dependency changes carried by the same patch:
//   * packages/core/package.json: adds "js-tiktoken": "^1.0.7" and pins
//     "openai" to "^3.3.0" (previously "latest").
//   * pnpm-lock.yaml: records js-tiktoken 1.0.7 (which depends on
//     base64-js 1.5.1) and drops the `dev: true` flag from base64-js 1.5.1.
// -----------------------------------------------------------------------------

// ----------------------- packages/core/src/constants.ts ----------------------

export const DEFAULT_CONTEXT_WINDOW = 3900;
export const DEFAULT_NUM_OUTPUTS = 256;

export const DEFAULT_CHUNK_SIZE = 1024;
export const DEFAULT_CHUNK_OVERLAP = 20;
export const DEFAULT_SIMILARITY_TOP_K = 2;

// NOTE: for text-embedding-ada-002
export const DEFAULT_EMBEDDING_DIM = 1536;

// ----------------------- packages/core/src/NodeParser.ts ---------------------

/** Marker interface for node parsers; it declares no members yet. */
interface NodeParser {}

/**
 * Placeholder node parser. The constructor accepts its configuration but does
 * not store or use any of it yet.
 */
class SimpleNodeParser implements NodeParser {
  /**
   * @param textSplitter - Splitter to use; currently ignored.
   * @param includeExtraInfo - Currently ignored.
   * @param includePrevNextRel - Currently ignored.
   */
  constructor(
    textSplitter: unknown = null,
    includeExtraInfo: boolean = true,
    includePrevNextRel: boolean = true
  ) {}

  /** Builds a parser using the default value of every constructor argument. */
  static fromDefaults(): SimpleNodeParser {
    return new SimpleNodeParser();
  }
}

// ----------------------- packages/core/src/TextSplitter.ts -------------------
// GitHub translated
//
// NOTE: in its own file this section starts with
//   import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "./constants";
// The constants are declared above in this combined listing.

/** Turns text into a sequence of tokens; only the sequence length is used. */
type Tokenizer = (text: string) => ArrayLike<unknown>;

/** One chunk produced by the splitter. */
interface TextSplit {
  /** Text of the chunk. */
  text_chunk: string;
  /**
   * Number of characters at the start of `text_chunk` that were also part of
   * the previously emitted chunk (0 when there is no overlap).
   */
  num_char_overlap?: number;
}

/**
 * Looks up the process-wide default tokenizer.
 *
 * The original code referenced a free identifier `globals_helper` that is
 * neither imported nor declared, so it can only ever resolve as a global.
 * The lookup is made explicit here and fails with a descriptive error
 * instead of a `ReferenceError`.
 */
function resolveDefaultTokenizer(): Tokenizer {
  const host: unknown = Reflect.get(globalThis, "globals_helper");
  const candidate: unknown =
    typeof host === "object" && host !== null
      ? Reflect.get(host, "tokenizer")
      : undefined;
  if (typeof candidate !== "function") {
    throw new Error(
      "TokenTextSplitter: no tokenizer was supplied and no global " +
        "`globals_helper.tokenizer` is available."
    );
  }
  return candidate as Tokenizer;
}

/**
 * Splits text into chunks whose token count (as measured by `tokenizer`)
 * stays within `chunk_size`, with roughly `chunk_overlap` tokens shared
 * between consecutive chunks.
 */
class TokenTextSplitter {
  private _separator: string;
  private _chunk_size: number;
  private _chunk_overlap: number;
  private tokenizer: Tokenizer;
  private _backup_separators: string[];
  // Reserved for a future callback manager; never assigned or read.
  private callback_manager: unknown;

  /**
   * @param separator - Primary separator used to split and re-join the text.
   * @param chunk_size - Maximum number of tokens per chunk.
   * @param chunk_overlap - Target number of tokens shared by adjacent chunks.
   * @param tokenizer - Tokenizer used for counting; when omitted, the global
   *   `globals_helper.tokenizer` is used.
   * @param backup_separators - Separators tried, in order, on pieces that are
   *   still too large after splitting on `separator`.
   * @throws Error when `chunk_overlap` exceeds `chunk_size`, or when no
   *   tokenizer can be resolved.
   */
  constructor(
    separator: string = " ",
    chunk_size: number = DEFAULT_CHUNK_SIZE,
    chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
    tokenizer: Tokenizer | null = null,
    backup_separators: string[] = ["\n"]
    // callback_manager: any = null
  ) {
    if (chunk_overlap > chunk_size) {
      throw new Error(
        `Got a larger chunk overlap (${chunk_overlap}) than chunk size (${chunk_size}), should be smaller.`
      );
    }
    this._separator = separator;
    this._chunk_size = chunk_size;
    this._chunk_overlap = chunk_overlap;
    this.tokenizer = tokenizer ?? resolveDefaultTokenizer();
    this._backup_separators = backup_separators;
    // this.callback_manager = callback_manager || new CallbackManager([]);
  }

  /** Number of tokens the configured tokenizer produces for `text`. */
  private countTokens(text: string): number {
    return this.tokenizer(text).length;
  }

  /**
   * Moves `cur_idx` backwards until `splits[start_idx, cur_idx)` joined with
   * the separator fits into the chunk size according to the tokenizer.
   *
   * Each step drops a share of the window proportional to the overshoot, plus
   * one split, so the result may be shorter than strictly necessary.
   *
   * @returns The reduced (exclusive) end index, never below `start_idx`.
   */
  private _reduceChunkSize(
    start_idx: number,
    cur_idx: number,
    splits: string[],
    separator: string = this._separator,
    chunk_size: number = this._chunk_size
  ): number {
    const measure = (end: number): number =>
      this.countTokens(splits.slice(start_idx, end).join(separator));

    let current_doc_total = measure(cur_idx);
    while (current_doc_total > chunk_size && cur_idx > start_idx) {
      const percent_to_reduce =
        (current_doc_total - chunk_size) / current_doc_total;
      // Truncate AFTER scaling by the window length; truncating the bare
      // fraction first would always yield 0.
      const num_to_reduce =
        Math.floor(percent_to_reduce * (cur_idx - start_idx)) + 1;
      cur_idx = Math.max(start_idx, cur_idx - num_to_reduce);
      current_doc_total = measure(cur_idx);
    }
    return cur_idx;
  }

  /**
   * Cuts a single oversized piece into consecutive pieces that each fit into
   * `chunk_size` tokens, always taking the longest fitting prefix.
   *
   * Works on code points so a surrogate pair is never split in half. If not
   * even a one-character prefix fits, the remainder is returned as is.
   */
  private splitOversized(text: string, chunk_size: number): string[] {
    const pieces: string[] = [];
    let rest = Array.from(text);
    while (this.countTokens(rest.join("")) > chunk_size) {
      let cut = rest.length - 1;
      while (
        cut >= 1 &&
        this.countTokens(rest.slice(0, cut).join("")) > chunk_size
      ) {
        cut -= 1;
      }
      if (cut < 1) {
        // Nothing fits; stop instead of looping forever.
        break;
      }
      pieces.push(rest.slice(0, cut).join(""));
      rest = rest.slice(cut);
    }
    pieces.push(rest.join(""));
    return pieces;
  }

  /**
   * Breaks up every split that exceeds `chunk_size` tokens: first on the
   * first backup separator it contains, then, for pieces that are still too
   * large, by cutting at token-count boundaries.
   *
   * @returns A new array; the relative order of the text is preserved.
   */
  _preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
    const new_splits: Array<string> = [];
    for (const split of splits) {
      if (this.countTokens(split) <= chunk_size) {
        new_splits.push(split);
        continue;
      }

      let cur_splits: Array<string> = [split];
      for (const sep of this._backup_separators ?? []) {
        if (split.includes(sep)) {
          cur_splits = split.split(sep);
          break;
        }
      }

      for (const cur_split of cur_splits) {
        if (this.countTokens(cur_split) <= chunk_size) {
          new_splits.push(cur_split);
        } else {
          new_splits.push(...this.splitOversized(cur_split, chunk_size));
        }
      }
    }
    return new_splits;
  }

  /** Drops chunks whose text is empty or consists solely of space characters. */
  _postprocessSplits(docs: TextSplit[]): TextSplit[] {
    // A string pattern in `replace` only removes the first space, hence /g.
    return docs.filter((doc) => doc.text_chunk.replace(/ /g, "") !== "");
  }

  /**
   * Splits `text` and returns only the chunk texts.
   *
   * @param extra_info_str - Optional text that will accompany every chunk;
   *   its token count is deducted from the available chunk size.
   */
  splitText(text: string, extra_info_str?: string): string[] {
    return this.splitTextWithOverlaps(text, extra_info_str).map(
      (text_split) => text_split.text_chunk
    );
  }

  /**
   * Splits `text` into chunks and reports, for each chunk, how many leading
   * characters it shares with the previous one.
   *
   * @param extra_info_str - Optional text that will accompany every chunk;
   *   the token count of `extra_info_str + "\n\n"`, plus one, is deducted
   *   from the chunk size.
   * @throws Error when the deduction leaves no room, or when a single split
   *   still exceeds the effective chunk size after preprocessing.
   */
  splitTextWithOverlaps(text: string, extra_info_str?: string): TextSplit[] {
    if (text === "") {
      return [];
    }

    let effective_chunk_size = this._chunk_size;
    if (extra_info_str !== undefined) {
      const num_extra_tokens = this.countTokens(`${extra_info_str}\n\n`) + 1;
      effective_chunk_size = this._chunk_size - num_extra_tokens;
      if (effective_chunk_size <= 0) {
        throw new Error(
          "Effective chunk size is non positive after considering extra_info"
        );
      }
    }

    const separator = this._separator;
    const splits = this._preprocessSplits(
      text.split(separator),
      effective_chunk_size
    );

    const docs: TextSplit[] = [];
    const emit = (from: number, to: number, prev_end: number): void => {
      docs.push({
        text_chunk: splits.slice(from, to).join(separator),
        num_char_overlap:
          prev_end > from
            ? splits.slice(from, prev_end).join(separator).length
            : 0,
      });
    };

    let start_idx = 0; // first split of the chunk being built
    let cur_idx = 0;
    let cur_total = 0; // per-split token estimate for the current window
    let prev_idx = 0; // exclusive end of the previously emitted chunk
    while (cur_idx < splits.length) {
      // Empty splits still count as one token so that they consume budget.
      const num_cur_tokens = Math.max(this.countTokens(splits[cur_idx]), 1);
      if (num_cur_tokens > effective_chunk_size) {
        throw new Error(
          `A single term is larger than the allowed chunk size. ` +
            `Term size: ${num_cur_tokens}, effective chunk size: ${effective_chunk_size}.`
        );
      }

      if (cur_total + num_cur_tokens > effective_chunk_size) {
        // Per-split counts are only an estimate; verify against the joined
        // text and back off if the real count is too high.
        cur_idx = this._reduceChunkSize(
          start_idx,
          cur_idx,
          splits,
          separator,
          effective_chunk_size
        );
        emit(start_idx, cur_idx, prev_idx);
        prev_idx = cur_idx;

        // Drop splits from the front until only the overlap budget remains.
        while (cur_total > this._chunk_overlap && start_idx < cur_idx) {
          cur_total -= Math.max(this.countTokens(splits[start_idx]), 1);
          start_idx += 1;
        }
        if (start_idx === cur_idx) {
          cur_total = 0;
        }
      }

      cur_total += num_cur_tokens;
      cur_idx += 1;
    }
    emit(start_idx, cur_idx, prev_idx);

    return this._postprocessSplits(docs);
  }

  /**
   * Returns the longest leading part of `text` that fits into `chunk_size`
   * tokens, cut on `separator` boundaries.
   *
   * @param separator - Separator to split and re-join on; defaults to the
   *   splitter's own separator.
   * @param chunk_size - Token budget; defaults to the splitter's chunk size.
   */
  truncateText(
    text: string,
    separator: string = this._separator,
    chunk_size: number = this._chunk_size
  ): string {
    if (text === "") {
      return "";
    }
    // First we naively split the large input into a bunch of smaller ones.
    const splits = this._preprocessSplits(text.split(separator), chunk_size);

    const start_idx = 0;
    let cur_idx = 0;
    let cur_total = 0;
    while (cur_idx < splits.length) {
      const num_cur_tokens = Math.max(this.countTokens(splits[cur_idx]), 1);
      if (cur_total + num_cur_tokens > chunk_size) {
        cur_idx = this._reduceChunkSize(
          start_idx,
          cur_idx,
          splits,
          separator,
          chunk_size
        );
        break;
      }
      cur_total += num_cur_tokens;
      cur_idx += 1;
    }
    return splits.slice(start_idx, cur_idx).join(separator);
  }
}