From 18ec1f2f61c3c346006fb06e43553eaf375b1c76 Mon Sep 17 00:00:00 2001
From: Alex Yang <himself65@outlook.com>
Date: Fri, 8 Nov 2024 18:53:05 -0800
Subject: [PATCH] chore: separate tokenizers (#1454)

---
 .../src/utils/tokenizer.ts                    |  2 +-
 packages/core/src/embeddings/base.ts          |  2 +-
 packages/core/src/embeddings/tokenizer.ts     |  2 +-
 packages/core/src/global/settings.ts          |  3 ++-
 .../core/src/global/settings/tokenizer.ts     |  3 ++-
 packages/core/src/indices/prompt-helper.ts    |  4 +--
 packages/core/src/llms/type.ts                |  4 +--
 packages/core/src/memory/summary-memory.ts    |  2 +-
 .../core/src/node-parser/sentence-splitter.ts |  2 +-
 .../src/node-parser/token-text-splitter.ts    |  2 +-
 packages/core/src/node-parser/type.ts         |  2 +-
 packages/core/tests/embeddings.test.ts        |  2 +-
 .../node-parser/sentence-spiller.test.ts      |  2 +-
 packages/env/package.json                     | 27 +++++++++++++++++++
 packages/env/src/index.browser.ts             |  1 -
 packages/env/src/index.edge-light.ts          |  1 -
 packages/env/src/index.ts                     |  1 -
 packages/env/src/index.workerd.ts             |  1 -
 .../env/src/{ => internal}/tokenizers/js.ts   |  2 +-
 .../env/src/{ => internal}/tokenizers/node.ts |  1 -
 .../src/{ => internal}/tokenizers/types.ts    |  0
 packages/env/src/tokenizers.browser.ts        |  5 ++++
 packages/env/src/tokenizers.edge-light.ts     |  5 ++++
 packages/env/src/tokenizers.ts                |  5 ++++
 packages/env/src/tokenizers.workerd.ts        |  5 ++++
 packages/env/tests/tokenizer.test.ts          | 16 ++++++++---
 packages/providers/openai/src/embedding.ts    |  3 ++-
 packages/providers/openai/src/llm.ts          |  3 ++-
 28 files changed, 82 insertions(+), 26 deletions(-)
 rename packages/env/src/{ => internal}/tokenizers/js.ts (90%)
 rename packages/env/src/{ => internal}/tokenizers/node.ts (91%)
 rename packages/env/src/{ => internal}/tokenizers/types.ts (100%)
 create mode 100644 packages/env/src/tokenizers.browser.ts
 create mode 100644 packages/env/src/tokenizers.edge-light.ts
 create mode 100644 packages/env/src/tokenizers.ts
 create mode 100644 packages/env/src/tokenizers.workerd.ts

diff --git a/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts b/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts
index 414b76a3f..468c74d6f 100644
--- a/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts
+++ b/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts
@@ -1,5 +1,5 @@
 // test runtime
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
 import "llamaindex";
 
 // @ts-expect-error EdgeRuntime is not defined in type
diff --git a/packages/core/src/embeddings/base.ts b/packages/core/src/embeddings/base.ts
index 55678ca8d..d56b769ec 100644
--- a/packages/core/src/embeddings/base.ts
+++ b/packages/core/src/embeddings/base.ts
@@ -1,4 +1,4 @@
-import { type Tokenizers } from "@llamaindex/env";
+import type { Tokenizers } from "@llamaindex/env/tokenizers";
 import type { MessageContentDetail } from "../llms";
 import { BaseNode, MetadataMode, TransformComponent } from "../schema";
 import { extractSingleText } from "../utils";
diff --git a/packages/core/src/embeddings/tokenizer.ts b/packages/core/src/embeddings/tokenizer.ts
index 42fba032a..0a4cc8f7e 100644
--- a/packages/core/src/embeddings/tokenizer.ts
+++ b/packages/core/src/embeddings/tokenizer.ts
@@ -1,4 +1,4 @@
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
 
 export function truncateMaxTokens(
   tokenizer: Tokenizers,
diff --git a/packages/core/src/global/settings.ts b/packages/core/src/global/settings.ts
index 5b49d9d75..3a3af7ccf 100644
--- a/packages/core/src/global/settings.ts
+++ b/packages/core/src/global/settings.ts
@@ -1,4 +1,5 @@
-import { getEnv, type Tokenizer } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import type { LLM } from "../llms";
 import {
   type CallbackManager,
diff --git a/packages/core/src/global/settings/tokenizer.ts b/packages/core/src/global/settings/tokenizer.ts
index bdb6e9943..ca2aada87 100644
--- a/packages/core/src/global/settings/tokenizer.ts
+++ b/packages/core/src/global/settings/tokenizer.ts
@@ -1,4 +1,5 @@
-import { AsyncLocalStorage, type Tokenizer, tokenizers } from "@llamaindex/env";
+import { AsyncLocalStorage } from "@llamaindex/env";
+import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";
 
 const chunkSizeAsyncLocalStorage = new AsyncLocalStorage<Tokenizer>();
 let globalTokenizer: Tokenizer = tokenizers.tokenizer();
diff --git a/packages/core/src/indices/prompt-helper.ts b/packages/core/src/indices/prompt-helper.ts
index 477c5037c..4abf6c75b 100644
--- a/packages/core/src/indices/prompt-helper.ts
+++ b/packages/core/src/indices/prompt-helper.ts
@@ -1,4 +1,4 @@
-import { type Tokenizer, tokenizers } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import {
   DEFAULT_CHUNK_OVERLAP_RATIO,
   DEFAULT_CONTEXT_WINDOW,
@@ -64,7 +64,7 @@ export class PromptHelper {
     this.numOutput = numOutput;
     this.chunkOverlapRatio = chunkOverlapRatio;
     this.chunkSizeLimit = chunkSizeLimit;
-    this.tokenizer = tokenizer ?? tokenizers.tokenizer();
+    this.tokenizer = tokenizer ?? Settings.tokenizer;
     this.separator = separator;
   }
 
diff --git a/packages/core/src/llms/type.ts b/packages/core/src/llms/type.ts
index ea402ec5d..1ea15a211 100644
--- a/packages/core/src/llms/type.ts
+++ b/packages/core/src/llms/type.ts
@@ -1,6 +1,6 @@
-import type { Tokenizers } from "@llamaindex/env";
+import type { Tokenizers } from "@llamaindex/env/tokenizers";
 import type { JSONSchemaType } from "ajv";
-import type { JSONObject, JSONValue } from "../global/type";
+import type { JSONObject, JSONValue } from "../global";
 
 /**
  * @internal
diff --git a/packages/core/src/memory/summary-memory.ts b/packages/core/src/memory/summary-memory.ts
index e750aa367..d0838904b 100644
--- a/packages/core/src/memory/summary-memory.ts
+++ b/packages/core/src/memory/summary-memory.ts
@@ -1,4 +1,4 @@
-import { type Tokenizer, tokenizers } from "@llamaindex/env";
+import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";
 import { Settings } from "../global";
 import type { ChatMessage, LLM, MessageType } from "../llms";
 import { defaultSummaryPrompt, type SummaryPrompt } from "../prompts";
diff --git a/packages/core/src/node-parser/sentence-splitter.ts b/packages/core/src/node-parser/sentence-splitter.ts
index 49c9fe241..67c2d784d 100644
--- a/packages/core/src/node-parser/sentence-splitter.ts
+++ b/packages/core/src/node-parser/sentence-splitter.ts
@@ -1,4 +1,4 @@
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import { z } from "zod";
 import { Settings } from "../global";
 import { sentenceSplitterSchema } from "../schema";
diff --git a/packages/core/src/node-parser/token-text-splitter.ts b/packages/core/src/node-parser/token-text-splitter.ts
index e4f7b8dd8..e0d5daf91 100644
--- a/packages/core/src/node-parser/token-text-splitter.ts
+++ b/packages/core/src/node-parser/token-text-splitter.ts
@@ -1,4 +1,4 @@
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import { z } from "zod";
 import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, Settings } from "../global";
 import { MetadataAwareTextSplitter } from "./base";
diff --git a/packages/core/src/node-parser/type.ts b/packages/core/src/node-parser/type.ts
index 0974f761f..0c3a7160e 100644
--- a/packages/core/src/node-parser/type.ts
+++ b/packages/core/src/node-parser/type.ts
@@ -1,4 +1,4 @@
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 
 export type SplitterParams = {
   tokenizer?: Tokenizer;
diff --git a/packages/core/tests/embeddings.test.ts b/packages/core/tests/embeddings.test.ts
index 3f0a12f8c..a1b79cbe8 100644
--- a/packages/core/tests/embeddings.test.ts
+++ b/packages/core/tests/embeddings.test.ts
@@ -1,5 +1,5 @@
 import { truncateMaxTokens } from "@llamaindex/core/embeddings";
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
 import { describe, expect, test } from "vitest";
 
 describe("truncateMaxTokens", () => {
diff --git a/packages/core/tests/node-parser/sentence-spiller.test.ts b/packages/core/tests/node-parser/sentence-spiller.test.ts
index 60c7931e0..281995f71 100644
--- a/packages/core/tests/node-parser/sentence-spiller.test.ts
+++ b/packages/core/tests/node-parser/sentence-spiller.test.ts
@@ -1,6 +1,6 @@
 import { SentenceSplitter } from "@llamaindex/core/node-parser";
 import { Document } from "@llamaindex/core/schema";
-import { tokenizers } from "@llamaindex/env";
+import { tokenizers } from "@llamaindex/env/tokenizers";
 import { beforeEach, describe, expect, test } from "vitest";
 
 describe("SentenceSplitter", () => {
diff --git a/packages/env/package.json b/packages/env/package.json
index 8446a5af5..b8d028151 100644
--- a/packages/env/package.json
+++ b/packages/env/package.json
@@ -76,9 +76,36 @@
         "types": "./multi-model/dist/index.d.ts",
         "default": "./multi-model/dist/index.js"
       }
+    },
+    "./tokenizers": {
+      "workerd": {
+        "types": "./tokenizers/dist/index.workerd.d.ts",
+        "default": "./tokenizers/dist/index.workerd.js"
+      },
+      "edge-light": {
+        "types": "./tokenizers/dist/index.edge-light.d.ts",
+        "default": "./tokenizers/dist/index.edge-light.js"
+      },
+      "browser": {
+        "types": "./tokenizers/dist/index.browser.d.ts",
+        "default": "./tokenizers/dist/index.browser.js"
+      },
+      "import": {
+        "types": "./tokenizers/dist/index.d.ts",
+        "default": "./tokenizers/dist/index.js"
+      },
+      "require": {
+        "types": "./tokenizers/dist/index.d.cts",
+        "default": "./tokenizers/dist/index.cjs"
+      },
+      "default": {
+        "types": "./tokenizers/dist/index.d.ts",
+        "default": "./tokenizers/dist/index.js"
+      }
     }
   },
   "files": [
+    "tokenizers",
     "multi-model",
     "dist",
     "CHANGELOG.md",
diff --git a/packages/env/src/index.browser.ts b/packages/env/src/index.browser.ts
index 9d55f99ed..a6eeef325 100644
--- a/packages/env/src/index.browser.ts
+++ b/packages/env/src/index.browser.ts
@@ -6,7 +6,6 @@
 import "./global-check.js";
 
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
 export { NotSupportCurrentRuntimeClass } from "./utils/shared.js";
 export * from "./web-polyfill.js";
 // @ts-expect-error no type
diff --git a/packages/env/src/index.edge-light.ts b/packages/env/src/index.edge-light.ts
index f7e49cd31..f6f1cfd1e 100644
--- a/packages/env/src/index.edge-light.ts
+++ b/packages/env/src/index.edge-light.ts
@@ -6,5 +6,4 @@
 import "./global-check.js";
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
 export * from "./node-polyfill.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
 export { NotSupportCurrentRuntimeClass } from "./utils/shared.js";
diff --git a/packages/env/src/index.ts b/packages/env/src/index.ts
index cb7e77e64..446b31c4c 100644
--- a/packages/env/src/index.ts
+++ b/packages/env/src/index.ts
@@ -35,7 +35,6 @@ export function createSHA256(): SHA256 {
 }
 
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/node.js";
 export {
   AsyncLocalStorage,
   CustomEvent,
diff --git a/packages/env/src/index.workerd.ts b/packages/env/src/index.workerd.ts
index 1eacb2860..22f152c16 100644
--- a/packages/env/src/index.workerd.ts
+++ b/packages/env/src/index.workerd.ts
@@ -16,4 +16,3 @@ export function getEnv(name: string): string | undefined {
 }
 
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
diff --git a/packages/env/src/tokenizers/js.ts b/packages/env/src/internal/tokenizers/js.ts
similarity index 90%
rename from packages/env/src/tokenizers/js.ts
rename to packages/env/src/internal/tokenizers/js.ts
index ad2c0fec3..31aed9e03 100644
--- a/packages/env/src/tokenizers/js.ts
+++ b/packages/env/src/internal/tokenizers/js.ts
@@ -1,4 +1,4 @@
-// Note: js-tiktoken it's 60x slower than the WASM implementation - use it only for unsupported environments
+// Note: js-tiktoken is 60x slower than gpt-tokenizer
 import { getEncoding } from "js-tiktoken";
 import type { Tokenizer } from "./types.js";
 import { Tokenizers } from "./types.js";
diff --git a/packages/env/src/tokenizers/node.ts b/packages/env/src/internal/tokenizers/node.ts
similarity index 91%
rename from packages/env/src/tokenizers/node.ts
rename to packages/env/src/internal/tokenizers/node.ts
index 592b53741..0ccd2b02f 100644
--- a/packages/env/src/tokenizers/node.ts
+++ b/packages/env/src/internal/tokenizers/node.ts
@@ -1,4 +1,3 @@
-// Note: This is using th WASM implementation of tiktoken which is 60x faster
 import type { Tokenizer } from "./types.js";
 import { Tokenizers } from "./types.js";
 
diff --git a/packages/env/src/tokenizers/types.ts b/packages/env/src/internal/tokenizers/types.ts
similarity index 100%
rename from packages/env/src/tokenizers/types.ts
rename to packages/env/src/internal/tokenizers/types.ts
diff --git a/packages/env/src/tokenizers.browser.ts b/packages/env/src/tokenizers.browser.ts
new file mode 100644
index 000000000..de80cda92
--- /dev/null
+++ b/packages/env/src/tokenizers.browser.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/js.js";
diff --git a/packages/env/src/tokenizers.edge-light.ts b/packages/env/src/tokenizers.edge-light.ts
new file mode 100644
index 000000000..de80cda92
--- /dev/null
+++ b/packages/env/src/tokenizers.edge-light.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/js.js";
diff --git a/packages/env/src/tokenizers.ts b/packages/env/src/tokenizers.ts
new file mode 100644
index 000000000..a1ac4a758
--- /dev/null
+++ b/packages/env/src/tokenizers.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/node.js";
diff --git a/packages/env/src/tokenizers.workerd.ts b/packages/env/src/tokenizers.workerd.ts
new file mode 100644
index 000000000..de80cda92
--- /dev/null
+++ b/packages/env/src/tokenizers.workerd.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/js.js";
diff --git a/packages/env/tests/tokenizer.test.ts b/packages/env/tests/tokenizer.test.ts
index 530f24009..c6b0329a2 100644
--- a/packages/env/tests/tokenizer.test.ts
+++ b/packages/env/tests/tokenizer.test.ts
@@ -1,11 +1,21 @@
 import { describe, expect, it } from "vitest";
-import { tokenizers } from "../src/tokenizers/node.js";
+import { tokenizers as fallbackTokenizers } from "../src/internal/tokenizers/js.js";
+import { tokenizers as nodeTokenizers } from "../src/internal/tokenizers/node.js";
 
-describe("tokenizer", () => {
+describe("node tokenizer", () => {
   it("should tokenize text", () => {
-    const tokenizer = tokenizers.tokenizer();
+    const tokenizer = nodeTokenizers.tokenizer();
     expect(tokenizer.decode(tokenizer.encode("hello world"))).toBe(
       "hello world",
     );
   });
+
+  it("should have same result as fallback tokenizer", () => {
+    const nodeTokenizer = nodeTokenizers.tokenizer();
+    const fallbackTokenizer = fallbackTokenizers.tokenizer();
+    const text = "hello world";
+    expect(nodeTokenizer.decode(nodeTokenizer.encode(text))).toBe(
+      fallbackTokenizer.decode(fallbackTokenizer.encode(text)),
+    );
+  });
 });
diff --git a/packages/providers/openai/src/embedding.ts b/packages/providers/openai/src/embedding.ts
index 26c40849c..345b76573 100644
--- a/packages/providers/openai/src/embedding.ts
+++ b/packages/providers/openai/src/embedding.ts
@@ -1,5 +1,6 @@
 import { BaseEmbedding } from "@llamaindex/core/embeddings";
-import { getEnv, Tokenizers } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import { Tokenizers } from "@llamaindex/env/tokenizers";
 import type {
   AzureClientOptions,
   AzureOpenAI as AzureOpenAILLM,
diff --git a/packages/providers/openai/src/llm.ts b/packages/providers/openai/src/llm.ts
index ed025a82d..b3e03a94f 100644
--- a/packages/providers/openai/src/llm.ts
+++ b/packages/providers/openai/src/llm.ts
@@ -14,7 +14,8 @@ import {
   type ToolCallLLMMessageOptions,
 } from "@llamaindex/core/llms";
 import { extractText } from "@llamaindex/core/utils";
-import { getEnv, Tokenizers } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import { Tokenizers } from "@llamaindex/env/tokenizers";
 import type {
   AzureClientOptions,
   AzureOpenAI as AzureOpenAILLM,
-- 
GitLab