From 6f2cb31d4171605b893b7d99d8b2cdd7137c3ad0 Mon Sep 17 00:00:00 2001
From: Yi Ding <yi.s.ding@gmail.com>
Date: Mon, 24 Jul 2023 06:02:16 -0700
Subject: [PATCH] fixed tokenizer decoder

---
 .changeset/silver-ties-walk.md     |  5 +++++
 packages/core/src/GlobalsHelper.ts | 26 +++++++++++++++++---------
 packages/core/src/PromptHelper.ts  |  4 ++--
 packages/core/src/TextSplitter.ts  |  2 +-
 4 files changed, 25 insertions(+), 12 deletions(-)
 create mode 100644 .changeset/silver-ties-walk.md

diff --git a/.changeset/silver-ties-walk.md b/.changeset/silver-ties-walk.md
new file mode 100644
index 000000000..2c140a42d
--- /dev/null
+++ b/.changeset/silver-ties-walk.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
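+Fixed tokenizer decoder: `SentenceSplitter` now defaults to `globalsHelper.tokenizerDecoder()` (the tokenizer's `decode`) instead of `globalsHelper.tokenizer`, and tokenizer types now return `number[]`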
diff --git a/packages/core/src/GlobalsHelper.ts b/packages/core/src/GlobalsHelper.ts
index 32b76b643..f22abe7da 100644
--- a/packages/core/src/GlobalsHelper.ts
+++ b/packages/core/src/GlobalsHelper.ts
@@ -5,19 +5,27 @@ import { v4 as uuidv4 } from "uuid";
  * Helper class singleton
  */
 class GlobalsHelper {
-  defaultTokenizer: ((text: string) => string[]) | null = null;
+  defaultTokenizer: {
+    encode: (text: string) => number[];
+    decode: (tokens: number[]) => string;
+  } | null = null;
 
   tokenizer() {
-    if (this.defaultTokenizer) {
-      return this.defaultTokenizer;
+    if (!this.defaultTokenizer) {
+      const tiktoken = require("tiktoken-node");
+      this.defaultTokenizer = tiktoken.getEncoding("gpt2");
     }
 
-    const tiktoken = require("tiktoken-node");
-    let enc = new tiktoken.getEncoding("gpt2");
-    this.defaultTokenizer = (text: string) => {
-      return enc.encode(text);
-    };
-    return this.defaultTokenizer;
+    return this.defaultTokenizer!.encode.bind(this.defaultTokenizer);
+  }
+
+  tokenizerDecoder() {
+    if (!this.defaultTokenizer) {
+      const tiktoken = require("tiktoken-node");
+      this.defaultTokenizer = tiktoken.getEncoding("gpt2");
+    }
+
+    return this.defaultTokenizer!.decode.bind(this.defaultTokenizer);
   }
 
   createEvent({
diff --git a/packages/core/src/PromptHelper.ts b/packages/core/src/PromptHelper.ts
index 2d9ae8b3a..0f0198813 100644
--- a/packages/core/src/PromptHelper.ts
+++ b/packages/core/src/PromptHelper.ts
@@ -34,7 +34,7 @@ export class PromptHelper {
   numOutput = DEFAULT_NUM_OUTPUTS;
   chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO;
   chunkSizeLimit?: number;
-  tokenizer: (text: string) => string[];
+  tokenizer: (text: string) => number[];
   separator = " ";
 
   constructor(
@@ -42,7 +42,7 @@ export class PromptHelper {
     numOutput = DEFAULT_NUM_OUTPUTS,
     chunkOverlapRatio = DEFAULT_CHUNK_OVERLAP_RATIO,
     chunkSizeLimit?: number,
-    tokenizer?: (text: string) => string[],
+    tokenizer?: (text: string) => number[],
     separator = " "
   ) {
     this.contextWindow = contextWindow;
diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts
index 706506a10..f30adf3b6 100644
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -60,7 +60,7 @@ export class SentenceSplitter {
 
     if (tokenizer == undefined || tokenizerDecoder == undefined) {
       tokenizer = globalsHelper.tokenizer();
-      tokenizerDecoder = globalsHelper.tokenizer;
+      tokenizerDecoder = globalsHelper.tokenizerDecoder();
     }
     this.tokenizer = tokenizer;
     this.tokenizerDecoder = tokenizerDecoder;
-- 
GitLab