From 18ec1f2f61c3c346006fb06e43553eaf375b1c76 Mon Sep 17 00:00:00 2001
From: Alex Yang <himself65@outlook.com>
Date: Fri, 8 Nov 2024 18:53:05 -0800
Subject: [PATCH] chore: separate tokenizers (#1454)

---
 .../src/utils/tokenizer.ts                    |  2 +-
 packages/core/src/embeddings/base.ts          |  2 +-
 packages/core/src/embeddings/tokenizer.ts     |  2 +-
 packages/core/src/global/settings.ts          |  3 ++-
 .../core/src/global/settings/tokenizer.ts     |  3 ++-
 packages/core/src/indices/prompt-helper.ts    |  4 +--
 packages/core/src/llms/type.ts                |  4 +--
 packages/core/src/memory/summary-memory.ts    |  2 +-
 .../core/src/node-parser/sentence-splitter.ts |  2 +-
 .../src/node-parser/token-text-splitter.ts    |  2 +-
 packages/core/src/node-parser/type.ts         |  2 +-
 packages/core/tests/embeddings.test.ts        |  2 +-
 .../node-parser/sentence-spiller.test.ts      |  2 +-
 packages/env/package.json                     | 27 +++++++++++++++++++
 packages/env/src/index.browser.ts             |  1 -
 packages/env/src/index.edge-light.ts          |  1 -
 packages/env/src/index.ts                     |  1 -
 packages/env/src/index.workerd.ts             |  1 -
 .../env/src/{ => internal}/tokenizers/js.ts   |  2 +-
 .../env/src/{ => internal}/tokenizers/node.ts |  1 -
 .../src/{ => internal}/tokenizers/types.ts    |  0
 packages/env/src/tokenizers.browser.ts        |  5 ++++
 packages/env/src/tokenizers.edge-light.ts     |  5 ++++
 packages/env/src/tokenizers.ts                |  5 ++++
 packages/env/src/tokenizers.workerd.ts        |  5 ++++
 packages/env/tests/tokenizer.test.ts          | 16 ++++++++---
 packages/providers/openai/src/embedding.ts    |  3 ++-
 packages/providers/openai/src/llm.ts          |  3 ++-
 28 files changed, 82 insertions(+), 26 deletions(-)
 rename packages/env/src/{ => internal}/tokenizers/js.ts (90%)
 rename packages/env/src/{ => internal}/tokenizers/node.ts (91%)
 rename packages/env/src/{ => internal}/tokenizers/types.ts (100%)
 create mode 100644 packages/env/src/tokenizers.browser.ts
 create mode 100644 packages/env/src/tokenizers.edge-light.ts
 create mode 100644 packages/env/src/tokenizers.ts
 create mode 100644 packages/env/src/tokenizers.workerd.ts

diff --git a/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts b/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts
index 414b76a3f..468c74d6f 100644
--- a/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts
+++ b/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts
@@ -1,5 +1,5 @@
 // test runtime
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
 import "llamaindex";
 
 // @ts-expect-error EdgeRuntime is not defined in type
diff --git a/packages/core/src/embeddings/base.ts b/packages/core/src/embeddings/base.ts
index 55678ca8d..d56b769ec 100644
--- a/packages/core/src/embeddings/base.ts
+++ b/packages/core/src/embeddings/base.ts
@@ -1,4 +1,4 @@
-import { type Tokenizers } from "@llamaindex/env";
+import type { Tokenizers } from "@llamaindex/env/tokenizers";
 import type { MessageContentDetail } from "../llms";
 import { BaseNode, MetadataMode, TransformComponent } from "../schema";
 import { extractSingleText } from "../utils";
diff --git a/packages/core/src/embeddings/tokenizer.ts b/packages/core/src/embeddings/tokenizer.ts
index 42fba032a..0a4cc8f7e 100644
--- a/packages/core/src/embeddings/tokenizer.ts
+++ b/packages/core/src/embeddings/tokenizer.ts
@@ -1,4 +1,4 @@
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
 
 export function truncateMaxTokens(
   tokenizer: Tokenizers,
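The hunks so far are mechanical import-path swaps, so a quick sketch of the new entry point in use may help. It sticks to the calls this patch itself exercises (the `tokenizers.tokenizer()` factory and an encode/decode round-trip); the `Tokenizers.CL100K_BASE` member is taken from the existing enum and assumed unchanged by this PR:

```ts
// Tokenizer utilities now come from the dedicated subpath, not the package root.
import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";

// Round-trip, as exercised by the env package tests further down in this patch.
const tokenizer = tokenizers.tokenizer();
const tokens = tokenizer.encode("hello world");
console.log(tokenizer.decode(tokens)); // "hello world"

// The Tokenizers enum still names the supported encodings.
const encoding: Tokenizers = Tokenizers.CL100K_BASE;
```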
diff --git a/packages/core/src/global/settings.ts b/packages/core/src/global/settings.ts
index 5b49d9d75..3a3af7ccf 100644
--- a/packages/core/src/global/settings.ts
+++ b/packages/core/src/global/settings.ts
@@ -1,4 +1,5 @@
-import { getEnv, type Tokenizer } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import type { LLM } from "../llms";
 import {
   type CallbackManager,
diff --git a/packages/core/src/global/settings/tokenizer.ts b/packages/core/src/global/settings/tokenizer.ts
index bdb6e9943..ca2aada87 100644
--- a/packages/core/src/global/settings/tokenizer.ts
+++ b/packages/core/src/global/settings/tokenizer.ts
@@ -1,4 +1,5 @@
-import { AsyncLocalStorage, type Tokenizer, tokenizers } from "@llamaindex/env";
+import { AsyncLocalStorage } from "@llamaindex/env";
+import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";
 
 const chunkSizeAsyncLocalStorage = new AsyncLocalStorage<Tokenizer>();
 let globalTokenizer: Tokenizer = tokenizers.tokenizer();
diff --git a/packages/core/src/indices/prompt-helper.ts b/packages/core/src/indices/prompt-helper.ts
index 477c5037c..4abf6c75b 100644
--- a/packages/core/src/indices/prompt-helper.ts
+++ b/packages/core/src/indices/prompt-helper.ts
@@ -1,4 +1,4 @@
-import { type Tokenizer, tokenizers } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import {
   DEFAULT_CHUNK_OVERLAP_RATIO,
   DEFAULT_CONTEXT_WINDOW,
@@ -64,7 +64,7 @@ export class PromptHelper {
     this.numOutput = numOutput;
     this.chunkOverlapRatio = chunkOverlapRatio;
     this.chunkSizeLimit = chunkSizeLimit;
-    this.tokenizer = tokenizer ?? tokenizers.tokenizer();
+    this.tokenizer = tokenizer ?? Settings.tokenizer;
     this.separator = separator;
   }
 
diff --git a/packages/core/src/llms/type.ts b/packages/core/src/llms/type.ts
index ea402ec5d..1ea15a211 100644
--- a/packages/core/src/llms/type.ts
+++ b/packages/core/src/llms/type.ts
@@ -1,6 +1,6 @@
-import type { Tokenizers } from "@llamaindex/env";
+import type { Tokenizers } from "@llamaindex/env/tokenizers";
 import type { JSONSchemaType } from "ajv";
-import type { JSONObject, JSONValue } from "../global/type";
+import type { JSONObject, JSONValue } from "../global";
 
 /**
  * @internal
diff --git a/packages/core/src/memory/summary-memory.ts b/packages/core/src/memory/summary-memory.ts
index e750aa367..d0838904b 100644
--- a/packages/core/src/memory/summary-memory.ts
+++ b/packages/core/src/memory/summary-memory.ts
@@ -1,4 +1,4 @@
-import { type Tokenizer, tokenizers } from "@llamaindex/env";
+import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";
 import { Settings } from "../global";
 import type { ChatMessage, LLM, MessageType } from "../llms";
 import { defaultSummaryPrompt, type SummaryPrompt } from "../prompts";
diff --git a/packages/core/src/node-parser/sentence-splitter.ts b/packages/core/src/node-parser/sentence-splitter.ts
index 49c9fe241..67c2d784d 100644
--- a/packages/core/src/node-parser/sentence-splitter.ts
+++ b/packages/core/src/node-parser/sentence-splitter.ts
@@ -1,4 +1,4 @@
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
 import { z } from "zod";
 import { Settings } from "../global";
 import { sentenceSplitterSchema } from "../schema";
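The `prompt-helper.ts` hunk above is the one behavioral change in this patch: `PromptHelper` no longer builds its own default tokenizer and instead falls back to the global `Settings.tokenizer`. A minimal sketch of what that enables, assuming `Settings` is imported from the umbrella `llamaindex` package:

```ts
import { Settings } from "llamaindex";
import { tokenizers } from "@llamaindex/env/tokenizers";

// Installing a tokenizer globally now also reaches every PromptHelper
// constructed without an explicit `tokenizer` option.
Settings.tokenizer = tokenizers.tokenizer();
```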
"@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; import { z } from "zod"; import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, Settings } from "../global"; import { MetadataAwareTextSplitter } from "./base"; diff --git a/packages/core/src/node-parser/type.ts b/packages/core/src/node-parser/type.ts index 0974f761f..0c3a7160e 100644 --- a/packages/core/src/node-parser/type.ts +++ b/packages/core/src/node-parser/type.ts @@ -1,4 +1,4 @@ -import type { Tokenizer } from "@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; export type SplitterParams = { tokenizer?: Tokenizer; diff --git a/packages/core/tests/embeddings.test.ts b/packages/core/tests/embeddings.test.ts index 3f0a12f8c..a1b79cbe8 100644 --- a/packages/core/tests/embeddings.test.ts +++ b/packages/core/tests/embeddings.test.ts @@ -1,5 +1,5 @@ import { truncateMaxTokens } from "@llamaindex/core/embeddings"; -import { Tokenizers, tokenizers } from "@llamaindex/env"; +import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers"; import { describe, expect, test } from "vitest"; describe("truncateMaxTokens", () => { diff --git a/packages/core/tests/node-parser/sentence-spiller.test.ts b/packages/core/tests/node-parser/sentence-spiller.test.ts index 60c7931e0..281995f71 100644 --- a/packages/core/tests/node-parser/sentence-spiller.test.ts +++ b/packages/core/tests/node-parser/sentence-spiller.test.ts @@ -1,6 +1,6 @@ import { SentenceSplitter } from "@llamaindex/core/node-parser"; import { Document } from "@llamaindex/core/schema"; -import { tokenizers } from "@llamaindex/env"; +import { tokenizers } from "@llamaindex/env/tokenizers"; import { beforeEach, describe, expect, test } from "vitest"; describe("SentenceSplitter", () => { diff --git a/packages/env/package.json b/packages/env/package.json index 8446a5af5..b8d028151 100644 --- a/packages/env/package.json +++ b/packages/env/package.json @@ -76,9 +76,36 @@ "types": "./multi-model/dist/index.d.ts", "default": "./multi-model/dist/index.js" } + }, + "./tokenizers": { + "workerd": { + "types": "./tokenizers/dist/index.workerd.d.ts", + "default": "./tokenizers/dist/index.workerd.js" + }, + "edge-light": { + "types": "./tokenizers/dist/index.edge-light.d.ts", + "default": "./tokenizers/dist/index.edge-light.js" + }, + "browser": { + "types": "./tokenizers/dist/index.browser.d.ts", + "default": "./tokenizers/dist/index.browser.js" + }, + "import": { + "types": "./tokenizers/dist/index.d.ts", + "default": "./tokenizers/dist/index.js" + }, + "require": { + "types": "./tokenizers/dist/index.d.cts", + "default": "./tokenizers/dist/index.cjs" + }, + "default": { + "types": "./tokenizers/dist/index.d.ts", + "default": "./tokenizers/dist/index.js" + } } }, "files": [ + "tokenizers", "multi-model", "dist", "CHANGELOG.md", diff --git a/packages/env/src/index.browser.ts b/packages/env/src/index.browser.ts index 9d55f99ed..a6eeef325 100644 --- a/packages/env/src/index.browser.ts +++ b/packages/env/src/index.browser.ts @@ -6,7 +6,6 @@ import "./global-check.js"; export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js"; -export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js"; export { NotSupportCurrentRuntimeClass } from "./utils/shared.js"; export * from "./web-polyfill.js"; // @ts-expect-error no type diff --git a/packages/env/src/index.edge-light.ts b/packages/env/src/index.edge-light.ts index f7e49cd31..f6f1cfd1e 100644 --- a/packages/env/src/index.edge-light.ts +++ 
diff --git a/packages/env/src/index.edge-light.ts b/packages/env/src/index.edge-light.ts
index f7e49cd31..f6f1cfd1e 100644
--- a/packages/env/src/index.edge-light.ts
+++ b/packages/env/src/index.edge-light.ts
@@ -6,5 +6,4 @@ import "./global-check.js";
 
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
 export * from "./node-polyfill.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
 export { NotSupportCurrentRuntimeClass } from "./utils/shared.js";
diff --git a/packages/env/src/index.ts b/packages/env/src/index.ts
index cb7e77e64..446b31c4c 100644
--- a/packages/env/src/index.ts
+++ b/packages/env/src/index.ts
@@ -35,7 +35,6 @@ export function createSHA256(): SHA256 {
 }
 
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/node.js";
 export {
   AsyncLocalStorage,
   CustomEvent,
diff --git a/packages/env/src/index.workerd.ts b/packages/env/src/index.workerd.ts
index 1eacb2860..22f152c16 100644
--- a/packages/env/src/index.workerd.ts
+++ b/packages/env/src/index.workerd.ts
@@ -16,4 +16,3 @@ export function getEnv(name: string): string | undefined {
 }
 
 export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
diff --git a/packages/env/src/tokenizers/js.ts b/packages/env/src/internal/tokenizers/js.ts
similarity index 90%
rename from packages/env/src/tokenizers/js.ts
rename to packages/env/src/internal/tokenizers/js.ts
index ad2c0fec3..31aed9e03 100644
--- a/packages/env/src/tokenizers/js.ts
+++ b/packages/env/src/internal/tokenizers/js.ts
@@ -1,4 +1,4 @@
-// Note: js-tiktoken it's 60x slower than the WASM implementation - use it only for unsupported environments
+// Note: js-tiktoken is 60x slower than gpt-tokenizer
 import { getEncoding } from "js-tiktoken";
 import type { Tokenizer } from "./types.js";
 import { Tokenizers } from "./types.js";
diff --git a/packages/env/src/tokenizers/node.ts b/packages/env/src/internal/tokenizers/node.ts
similarity index 91%
rename from packages/env/src/tokenizers/node.ts
rename to packages/env/src/internal/tokenizers/node.ts
index 592b53741..0ccd2b02f 100644
--- a/packages/env/src/tokenizers/node.ts
+++ b/packages/env/src/internal/tokenizers/node.ts
@@ -1,4 +1,3 @@
-// Note: This is using th WASM implementation of tiktoken which is 60x faster
 import type { Tokenizer } from "./types.js";
 import { Tokenizers } from "./types.js";
 
diff --git a/packages/env/src/tokenizers/types.ts b/packages/env/src/internal/tokenizers/types.ts
similarity index 100%
rename from packages/env/src/tokenizers/types.ts
rename to packages/env/src/internal/tokenizers/types.ts
diff --git a/packages/env/src/tokenizers.browser.ts b/packages/env/src/tokenizers.browser.ts
new file mode 100644
index 000000000..de80cda92
--- /dev/null
+++ b/packages/env/src/tokenizers.browser.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/js.js";
diff --git a/packages/env/src/tokenizers.edge-light.ts b/packages/env/src/tokenizers.edge-light.ts
new file mode 100644
index 000000000..de80cda92
--- /dev/null
+++ b/packages/env/src/tokenizers.edge-light.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/js.js";
diff --git a/packages/env/src/tokenizers.ts b/packages/env/src/tokenizers.ts
new file mode 100644
index 000000000..a1ac4a758
--- /dev/null
+++ b/packages/env/src/tokenizers.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/node.js";
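Each new `tokenizers.*.ts` entry module is a thin re-export: the Node entry points at `internal/tokenizers/node.js`, everything else at the js-tiktoken fallback. Because the root `index.*.ts` files above dropped their tokenizer exports, root imports stop compiling; a sketch of the required migration:

```ts
// Before this patch:
// import { Tokenizers, tokenizers, type Tokenizer } from "@llamaindex/env";

// After it, the same symbols live only on the subpath:
import {
  Tokenizers,
  tokenizers,
  type Tokenizer,
} from "@llamaindex/env/tokenizers";

const tokenizer: Tokenizer = tokenizers.tokenizer();
```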
diff --git a/packages/env/src/tokenizers.workerd.ts b/packages/env/src/tokenizers.workerd.ts
new file mode 100644
index 000000000..de80cda92
--- /dev/null
+++ b/packages/env/src/tokenizers.workerd.ts
@@ -0,0 +1,5 @@
+export {
+  Tokenizers,
+  tokenizers,
+  type Tokenizer,
+} from "./internal/tokenizers/js.js";
diff --git a/packages/env/tests/tokenizer.test.ts b/packages/env/tests/tokenizer.test.ts
index 530f24009..c6b0329a2 100644
--- a/packages/env/tests/tokenizer.test.ts
+++ b/packages/env/tests/tokenizer.test.ts
@@ -1,11 +1,21 @@
 import { describe, expect, it } from "vitest";
-import { tokenizers } from "../src/tokenizers/node.js";
+import { tokenizers as fallbackTokenizers } from "../src/internal/tokenizers/js.js";
+import { tokenizers as nodeTokenizers } from "../src/internal/tokenizers/node.js";
 
-describe("tokenizer", () => {
+describe("node tokenizer", () => {
   it("should tokenize text", () => {
-    const tokenizer = tokenizers.tokenizer();
+    const tokenizer = nodeTokenizers.tokenizer();
     expect(tokenizer.decode(tokenizer.encode("hello world"))).toBe(
       "hello world",
     );
   });
+
+  it("should have same result as fallback tokenizer", () => {
+    const nodeTokenizer = nodeTokenizers.tokenizer();
+    const fallbackTokenizer = fallbackTokenizers.tokenizer();
+    const text = "hello world";
+    expect(nodeTokenizer.decode(nodeTokenizer.encode(text))).toBe(
+      fallbackTokenizer.decode(fallbackTokenizer.encode(text)),
+    );
+  });
 });
diff --git a/packages/providers/openai/src/embedding.ts b/packages/providers/openai/src/embedding.ts
index 26c40849c..345b76573 100644
--- a/packages/providers/openai/src/embedding.ts
+++ b/packages/providers/openai/src/embedding.ts
@@ -1,5 +1,6 @@
 import { BaseEmbedding } from "@llamaindex/core/embeddings";
-import { getEnv, Tokenizers } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import { Tokenizers } from "@llamaindex/env/tokenizers";
 import type {
   AzureClientOptions,
   AzureOpenAI as AzureOpenAILLM,
diff --git a/packages/providers/openai/src/llm.ts b/packages/providers/openai/src/llm.ts
index ed025a82d..b3e03a94f 100644
--- a/packages/providers/openai/src/llm.ts
+++ b/packages/providers/openai/src/llm.ts
@@ -14,7 +14,8 @@ import {
   type ToolCallLLMMessageOptions,
 } from "@llamaindex/core/llms";
 import { extractText } from "@llamaindex/core/utils";
-import { getEnv, Tokenizers } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import { Tokenizers } from "@llamaindex/env/tokenizers";
 import type {
   AzureClientOptions,
   AzureOpenAI as AzureOpenAILLM,
-- 
GitLab
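For downstream packages, the openai provider hunks show the migration pattern: runtime helpers stay on the root entry while tokenizer symbols move to the subpath. A minimal consumer-side sketch; the env-var name mirrors the provider's usage and the enum member is assumed unchanged by this PR:

```ts
import { getEnv } from "@llamaindex/env";
import { Tokenizers } from "@llamaindex/env/tokenizers";

// Runtime helper from the root entry...
const apiKey = getEnv("OPENAI_API_KEY");

// ...tokenizer encoding name from the tokenizers subpath.
const encoding = Tokenizers.CL100K_BASE;
console.log(apiKey ? "key set" : "key missing", encoding);
```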