diff --git a/.changeset/silver-ties-walk.md b/.changeset/silver-ties-walk.md new file mode 100644 index 0000000000000000000000000000000000000000..2c140a42dc5be4ee27b846c4c9263638f97b4894 --- /dev/null +++ b/.changeset/silver-ties-walk.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Fixed tokenizer decoder diff --git a/packages/core/src/GlobalsHelper.ts b/packages/core/src/GlobalsHelper.ts index 32b76b643ad1b2268e810fe7240f809d7560405b..f22abe7da315061f6a929f9ba48d6673defd87e7 100644 --- a/packages/core/src/GlobalsHelper.ts +++ b/packages/core/src/GlobalsHelper.ts @@ -5,19 +5,27 @@ import { v4 as uuidv4 } from "uuid"; * Helper class singleton */ class GlobalsHelper { - defaultTokenizer: ((text: string) => string[]) | null = null; + defaultTokenizer: { + encode: (text: string) => number[]; + decode: (tokens: number[]) => string; + } | null = null; tokenizer() { - if (this.defaultTokenizer) { - return this.defaultTokenizer; + if (!this.defaultTokenizer) { + const tiktoken = require("tiktoken-node"); + this.defaultTokenizer = tiktoken.getEncoding("gpt2"); } - const tiktoken = require("tiktoken-node"); - let enc = new tiktoken.getEncoding("gpt2"); - this.defaultTokenizer = (text: string) => { - return enc.encode(text); - }; - return this.defaultTokenizer; + return this.defaultTokenizer!.encode.bind(this.defaultTokenizer); + } + + tokenizerDecoder() { + if (!this.defaultTokenizer) { + const tiktoken = require("tiktoken-node"); + this.defaultTokenizer = tiktoken.getEncoding("gpt2"); + } + + return this.defaultTokenizer!.decode.bind(this.defaultTokenizer); } createEvent({ diff --git a/packages/core/src/PromptHelper.ts b/packages/core/src/PromptHelper.ts index 2d9ae8b3a458b887986e30a6bb7036032f5e5633..0f01988134697bf70688efa517e328f006704481 100644 --- a/packages/core/src/PromptHelper.ts +++ b/packages/core/src/PromptHelper.ts @@ -34,7 +34,7 @@ export class PromptHelper { numOutput = DEFAULT_NUM_OUTPUTS; chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO; chunkSizeLimit?: number; - tokenizer: (text: string) => string[]; + tokenizer: (text: string) => number[]; separator = " "; constructor( @@ -42,7 +42,7 @@ export class PromptHelper { numOutput = DEFAULT_NUM_OUTPUTS, chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO, chunkSizeLimit?: number, - tokenizer?: (text: string) => string[], + tokenizer?: (text: string) => number[], separator = " " ) { this.contextWindow = contextWindow; diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index 706506a10f031afcb5f1f998000463e18556cd72..f30adf3b6205beeb54777231d3010797ef6c8ec0 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -60,7 +60,7 @@ export class SentenceSplitter { if (tokenizer == undefined || tokenizerDecoder == undefined) { tokenizer = globalsHelper.tokenizer(); - tokenizerDecoder = globalsHelper.tokenizer; + tokenizerDecoder = globalsHelper.tokenizerDecoder(); } this.tokenizer = tokenizer; this.tokenizerDecoder = tokenizerDecoder;