From 6f2cb31d4171605b893b7d99d8b2cdd7137c3ad0 Mon Sep 17 00:00:00 2001 From: Yi Ding <yi.s.ding@gmail.com> Date: Mon, 24 Jul 2023 06:02:16 -0700 Subject: [PATCH] fixed tokenizer decoder --- .changeset/silver-ties-walk.md | 5 +++++ packages/core/src/GlobalsHelper.ts | 26 +++++++++++++++++--------- packages/core/src/PromptHelper.ts | 4 ++-- packages/core/src/TextSplitter.ts | 2 +- 4 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 .changeset/silver-ties-walk.md diff --git a/.changeset/silver-ties-walk.md b/.changeset/silver-ties-walk.md new file mode 100644 index 000000000..2c140a42d --- /dev/null +++ b/.changeset/silver-ties-walk.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Fixed tokenizer decoder diff --git a/packages/core/src/GlobalsHelper.ts b/packages/core/src/GlobalsHelper.ts index 32b76b643..f22abe7da 100644 --- a/packages/core/src/GlobalsHelper.ts +++ b/packages/core/src/GlobalsHelper.ts @@ -5,19 +5,27 @@ import { v4 as uuidv4 } from "uuid"; * Helper class singleton */ class GlobalsHelper { - defaultTokenizer: ((text: string) => string[]) | null = null; + defaultTokenizer: { + encode: (text: string) => number[]; + decode: (tokens: number[]) => string; + } | null = null; tokenizer() { - if (this.defaultTokenizer) { - return this.defaultTokenizer; + if (!this.defaultTokenizer) { + const tiktoken = require("tiktoken-node"); + this.defaultTokenizer = tiktoken.getEncoding("gpt2"); } - const tiktoken = require("tiktoken-node"); - let enc = new tiktoken.getEncoding("gpt2"); - this.defaultTokenizer = (text: string) => { - return enc.encode(text); - }; - return this.defaultTokenizer; + return this.defaultTokenizer!.encode.bind(this.defaultTokenizer); + } + + tokenizerDecoder() { + if (!this.defaultTokenizer) { + const tiktoken = require("tiktoken-node"); + this.defaultTokenizer = tiktoken.getEncoding("gpt2"); + } + + return this.defaultTokenizer!.decode.bind(this.defaultTokenizer); } createEvent({ diff --git a/packages/core/src/PromptHelper.ts b/packages/core/src/PromptHelper.ts index 2d9ae8b3a..0f0198813 100644 --- a/packages/core/src/PromptHelper.ts +++ b/packages/core/src/PromptHelper.ts @@ -34,7 +34,7 @@ export class PromptHelper { numOutput = DEFAULT_NUM_OUTPUTS; chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO; chunkSizeLimit?: number; - tokenizer: (text: string) => string[]; + tokenizer: (text: string) => number[]; separator = " "; constructor( @@ -42,7 +42,7 @@ export class PromptHelper { numOutput = DEFAULT_NUM_OUTPUTS, chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO, chunkSizeLimit?: number, - tokenizer?: (text: string) => string[], + tokenizer?: (text: string) => number[], separator = " " ) { this.contextWindow = contextWindow; diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index 706506a10..f30adf3b6 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -60,7 +60,7 @@ export class SentenceSplitter { if (tokenizer == undefined || tokenizerDecoder == undefined) { tokenizer = globalsHelper.tokenizer(); - tokenizerDecoder = globalsHelper.tokenizer; + tokenizerDecoder = globalsHelper.tokenizerDecoder(); } this.tokenizer = tokenizer; this.tokenizerDecoder = tokenizerDecoder; -- GitLab