diff --git a/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts b/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts index 414b76a3f51209b91f7cdac36e124694841996e4..468c74d6fed3d69167a2069e745354bf628e623f 100644 --- a/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts +++ b/e2e/examples/nextjs-node-runtime/src/utils/tokenizer.ts @@ -1,5 +1,5 @@ // test runtime -import { Tokenizers, tokenizers } from "@llamaindex/env"; +import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers"; import "llamaindex"; // @ts-expect-error EdgeRuntime is not defined in type diff --git a/packages/core/src/embeddings/base.ts b/packages/core/src/embeddings/base.ts index 55678ca8df707a6aba4c4b3e669344db30e2a8be..d56b769ecbb8bad74c52ae98358401f03dd36d11 100644 --- a/packages/core/src/embeddings/base.ts +++ b/packages/core/src/embeddings/base.ts @@ -1,4 +1,4 @@ -import { type Tokenizers } from "@llamaindex/env"; +import type { Tokenizers } from "@llamaindex/env/tokenizers"; import type { MessageContentDetail } from "../llms"; import { BaseNode, MetadataMode, TransformComponent } from "../schema"; import { extractSingleText } from "../utils"; diff --git a/packages/core/src/embeddings/tokenizer.ts b/packages/core/src/embeddings/tokenizer.ts index 42fba032a3f4b2173c42d28e89c357542b04c61b..0a4cc8f7ee361c8be9a5fd7e098eeba1a255b188 100644 --- a/packages/core/src/embeddings/tokenizer.ts +++ b/packages/core/src/embeddings/tokenizer.ts @@ -1,4 +1,4 @@ -import { Tokenizers, tokenizers } from "@llamaindex/env"; +import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers"; export function truncateMaxTokens( tokenizer: Tokenizers, diff --git a/packages/core/src/global/settings.ts b/packages/core/src/global/settings.ts index 5b49d9d75d8a5f15bef891a2fed8f93927cd1031..3a3af7ccf7a9c6f15a9ba0db5e1ecb3ba37db909 100644 --- a/packages/core/src/global/settings.ts +++ b/packages/core/src/global/settings.ts @@ -1,4 +1,5 @@ -import { getEnv, type Tokenizer } from "@llamaindex/env"; 
+import { getEnv } from "@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; import type { LLM } from "../llms"; import { type CallbackManager, diff --git a/packages/core/src/global/settings/tokenizer.ts b/packages/core/src/global/settings/tokenizer.ts index bdb6e9943e1d286af7bb5576f55fbb3c1dc85c05..ca2aada873744cb51928fb4e38dbc5668154ef4e 100644 --- a/packages/core/src/global/settings/tokenizer.ts +++ b/packages/core/src/global/settings/tokenizer.ts @@ -1,4 +1,5 @@ -import { AsyncLocalStorage, type Tokenizer, tokenizers } from "@llamaindex/env"; +import { AsyncLocalStorage } from "@llamaindex/env"; +import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers"; const chunkSizeAsyncLocalStorage = new AsyncLocalStorage<Tokenizer>(); let globalTokenizer: Tokenizer = tokenizers.tokenizer(); diff --git a/packages/core/src/indices/prompt-helper.ts b/packages/core/src/indices/prompt-helper.ts index 477c5037cb1b06841fa422adfef0fe9392ba31e8..4abf6c75b7059a327bd50cc4be4f9780371f917b 100644 --- a/packages/core/src/indices/prompt-helper.ts +++ b/packages/core/src/indices/prompt-helper.ts @@ -1,4 +1,4 @@ -import { type Tokenizer, tokenizers } from "@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; import { DEFAULT_CHUNK_OVERLAP_RATIO, DEFAULT_CONTEXT_WINDOW, @@ -64,7 +64,7 @@ export class PromptHelper { this.numOutput = numOutput; this.chunkOverlapRatio = chunkOverlapRatio; this.chunkSizeLimit = chunkSizeLimit; - this.tokenizer = tokenizer ?? tokenizers.tokenizer(); + this.tokenizer = tokenizer ?? 
Settings.tokenizer; this.separator = separator; } diff --git a/packages/core/src/llms/type.ts b/packages/core/src/llms/type.ts index ea402ec5db68ef3a33fe3862e3537cd88b991ecf..1ea15a211cb982b319e4e749e0ac152369be7720 100644 --- a/packages/core/src/llms/type.ts +++ b/packages/core/src/llms/type.ts @@ -1,6 +1,6 @@ -import type { Tokenizers } from "@llamaindex/env"; +import type { Tokenizers } from "@llamaindex/env/tokenizers"; import type { JSONSchemaType } from "ajv"; -import type { JSONObject, JSONValue } from "../global/type"; +import type { JSONObject, JSONValue } from "../global"; /** * @internal diff --git a/packages/core/src/memory/summary-memory.ts b/packages/core/src/memory/summary-memory.ts index e750aa3671d3c1e601275c53a292b3be8bd84c2d..d0838904b67a839d72c1084b579028b5585389e7 100644 --- a/packages/core/src/memory/summary-memory.ts +++ b/packages/core/src/memory/summary-memory.ts @@ -1,4 +1,4 @@ -import { type Tokenizer, tokenizers } from "@llamaindex/env"; +import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers"; import { Settings } from "../global"; import type { ChatMessage, LLM, MessageType } from "../llms"; import { defaultSummaryPrompt, type SummaryPrompt } from "../prompts"; diff --git a/packages/core/src/node-parser/sentence-splitter.ts b/packages/core/src/node-parser/sentence-splitter.ts index 49c9fe241f502267be9736cd2541b800554f5011..67c2d784dab2ca0acd7e78c6894748957ec497fc 100644 --- a/packages/core/src/node-parser/sentence-splitter.ts +++ b/packages/core/src/node-parser/sentence-splitter.ts @@ -1,4 +1,4 @@ -import type { Tokenizer } from "@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; import { z } from "zod"; import { Settings } from "../global"; import { sentenceSplitterSchema } from "../schema"; diff --git a/packages/core/src/node-parser/token-text-splitter.ts b/packages/core/src/node-parser/token-text-splitter.ts index 
e4f7b8dd8bbcf24668ac712b56c93df1b7d129c1..e0d5daf9149aa42ec6bed6d2773bc2edbf57f89c 100644 --- a/packages/core/src/node-parser/token-text-splitter.ts +++ b/packages/core/src/node-parser/token-text-splitter.ts @@ -1,4 +1,4 @@ -import type { Tokenizer } from "@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; import { z } from "zod"; import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, Settings } from "../global"; import { MetadataAwareTextSplitter } from "./base"; diff --git a/packages/core/src/node-parser/type.ts b/packages/core/src/node-parser/type.ts index 0974f761f74f1b37213e7115e7437324f701e323..0c3a7160e817fe57fa4793b8929544e05fefefa7 100644 --- a/packages/core/src/node-parser/type.ts +++ b/packages/core/src/node-parser/type.ts @@ -1,4 +1,4 @@ -import type { Tokenizer } from "@llamaindex/env"; +import type { Tokenizer } from "@llamaindex/env/tokenizers"; export type SplitterParams = { tokenizer?: Tokenizer; diff --git a/packages/core/tests/embeddings.test.ts b/packages/core/tests/embeddings.test.ts index 3f0a12f8cd56f63ed84856e9978e14ce5601e4cc..a1b79cbe83138ed6442422dbca941233f9527575 100644 --- a/packages/core/tests/embeddings.test.ts +++ b/packages/core/tests/embeddings.test.ts @@ -1,5 +1,5 @@ import { truncateMaxTokens } from "@llamaindex/core/embeddings"; -import { Tokenizers, tokenizers } from "@llamaindex/env"; +import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers"; import { describe, expect, test } from "vitest"; describe("truncateMaxTokens", () => { diff --git a/packages/core/tests/node-parser/sentence-spiller.test.ts b/packages/core/tests/node-parser/sentence-spiller.test.ts index 60c7931e0c89e5f2434cdd776ff7bb04fa2b0b16..281995f71fc342231d936653de5e017fcbbafaad 100644 --- a/packages/core/tests/node-parser/sentence-spiller.test.ts +++ b/packages/core/tests/node-parser/sentence-spiller.test.ts @@ -1,6 +1,6 @@ import { SentenceSplitter } from "@llamaindex/core/node-parser"; import { Document } from 
"@llamaindex/core/schema"; -import { tokenizers } from "@llamaindex/env"; +import { tokenizers } from "@llamaindex/env/tokenizers"; import { beforeEach, describe, expect, test } from "vitest"; describe("SentenceSplitter", () => { diff --git a/packages/env/package.json b/packages/env/package.json index 8446a5af554bc276a318d3e0d592f253aca9dbe2..b8d02815100ff100b0c0ff7aa6245831827331e4 100644 --- a/packages/env/package.json +++ b/packages/env/package.json @@ -76,9 +76,36 @@ "types": "./multi-model/dist/index.d.ts", "default": "./multi-model/dist/index.js" } + }, + "./tokenizers": { + "workerd": { + "types": "./tokenizers/dist/index.workerd.d.ts", + "default": "./tokenizers/dist/index.workerd.js" + }, + "edge-light": { + "types": "./tokenizers/dist/index.edge-light.d.ts", + "default": "./tokenizers/dist/index.edge-light.js" + }, + "browser": { + "types": "./tokenizers/dist/index.browser.d.ts", + "default": "./tokenizers/dist/index.browser.js" + }, + "import": { + "types": "./tokenizers/dist/index.d.ts", + "default": "./tokenizers/dist/index.js" + }, + "require": { + "types": "./tokenizers/dist/index.d.cts", + "default": "./tokenizers/dist/index.cjs" + }, + "default": { + "types": "./tokenizers/dist/index.d.ts", + "default": "./tokenizers/dist/index.js" + } } }, "files": [ + "tokenizers", "multi-model", "dist", "CHANGELOG.md", diff --git a/packages/env/src/index.browser.ts b/packages/env/src/index.browser.ts index 9d55f99eddb65e412e49f81e9a11ef98c15cab66..a6eeef325c7331e1ed57b79b455fa709960d066d 100644 --- a/packages/env/src/index.browser.ts +++ b/packages/env/src/index.browser.ts @@ -6,7 +6,6 @@ import "./global-check.js"; export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js"; -export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js"; export { NotSupportCurrentRuntimeClass } from "./utils/shared.js"; export * from "./web-polyfill.js"; // @ts-expect-error no type diff --git a/packages/env/src/index.edge-light.ts 
b/packages/env/src/index.edge-light.ts index f7e49cd319412891886d5c1fd489d5bd982e766e..f6f1cfd1ef3aa71ceace9bd2a609b84397aa1970 100644 --- a/packages/env/src/index.edge-light.ts +++ b/packages/env/src/index.edge-light.ts @@ -6,5 +6,4 @@ import "./global-check.js"; export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js"; export * from "./node-polyfill.js"; -export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js"; export { NotSupportCurrentRuntimeClass } from "./utils/shared.js"; diff --git a/packages/env/src/index.ts b/packages/env/src/index.ts index cb7e77e64fd1d8d81ad1552956dee7e50b428f2a..446b31c4c41a3e30d353a5f5c144b02d463c4985 100644 --- a/packages/env/src/index.ts +++ b/packages/env/src/index.ts @@ -35,7 +35,6 @@ export function createSHA256(): SHA256 { } export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js"; -export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/node.js"; export { AsyncLocalStorage, CustomEvent, diff --git a/packages/env/src/index.workerd.ts b/packages/env/src/index.workerd.ts index 1eacb2860c387372e9aabb2ded70d6d6476b02db..22f152c16a79c51b35151aecb681b82a72e5faa4 100644 --- a/packages/env/src/index.workerd.ts +++ b/packages/env/src/index.workerd.ts @@ -16,4 +16,3 @@ export function getEnv(name: string): string | undefined { } export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js"; -export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js"; diff --git a/packages/env/src/tokenizers/js.ts b/packages/env/src/internal/tokenizers/js.ts similarity index 90% rename from packages/env/src/tokenizers/js.ts rename to packages/env/src/internal/tokenizers/js.ts index ad2c0fec371d60630602bd6257f37ef81f3f153f..31aed9e03610dd89332a9bae7aaeb0b8bea764cf 100644 --- a/packages/env/src/tokenizers/js.ts +++ b/packages/env/src/internal/tokenizers/js.ts @@ -1,4 +1,4 @@ -// Note: js-tiktoken it's 60x slower than the WASM implementation - use it only for 
unsupported environments +// Note: js-tiktoken is 60x slower than gpt-tokenizer import { getEncoding } from "js-tiktoken"; import type { Tokenizer } from "./types.js"; import { Tokenizers } from "./types.js"; diff --git a/packages/env/src/tokenizers/node.ts b/packages/env/src/internal/tokenizers/node.ts similarity index 91% rename from packages/env/src/tokenizers/node.ts rename to packages/env/src/internal/tokenizers/node.ts index 592b53741dd6daf3c84bb563dba0c81370bcf5f2..0ccd2b02f7d59b2c39a894fe930628659e40f60f 100644 --- a/packages/env/src/tokenizers/node.ts +++ b/packages/env/src/internal/tokenizers/node.ts @@ -1,4 +1,3 @@ -// Note: This is using th WASM implementation of tiktoken which is 60x faster import type { Tokenizer } from "./types.js"; import { Tokenizers } from "./types.js"; diff --git a/packages/env/src/tokenizers/types.ts b/packages/env/src/internal/tokenizers/types.ts similarity index 100% rename from packages/env/src/tokenizers/types.ts rename to packages/env/src/internal/tokenizers/types.ts diff --git a/packages/env/src/tokenizers.browser.ts b/packages/env/src/tokenizers.browser.ts new file mode 100644 index 0000000000000000000000000000000000000000..de80cda92ae949abcb4cac12632fd3950923aca5 --- /dev/null +++ b/packages/env/src/tokenizers.browser.ts @@ -0,0 +1,5 @@ +export { + Tokenizers, + tokenizers, + type Tokenizer, +} from "./internal/tokenizers/js.js"; diff --git a/packages/env/src/tokenizers.edge-light.ts b/packages/env/src/tokenizers.edge-light.ts new file mode 100644 index 0000000000000000000000000000000000000000..de80cda92ae949abcb4cac12632fd3950923aca5 --- /dev/null +++ b/packages/env/src/tokenizers.edge-light.ts @@ -0,0 +1,5 @@ +export { + Tokenizers, + tokenizers, + type Tokenizer, +} from "./internal/tokenizers/js.js"; diff --git a/packages/env/src/tokenizers.ts b/packages/env/src/tokenizers.ts new file mode 100644 index 0000000000000000000000000000000000000000..a1ac4a758cfae99261f9d153104c5f4b062feb67 --- /dev/null +++ 
b/packages/env/src/tokenizers.ts @@ -0,0 +1,5 @@ +export { + Tokenizers, + tokenizers, + type Tokenizer, +} from "./internal/tokenizers/node.js"; diff --git a/packages/env/src/tokenizers.workerd.ts b/packages/env/src/tokenizers.workerd.ts new file mode 100644 index 0000000000000000000000000000000000000000..de80cda92ae949abcb4cac12632fd3950923aca5 --- /dev/null +++ b/packages/env/src/tokenizers.workerd.ts @@ -0,0 +1,5 @@ +export { + Tokenizers, + tokenizers, + type Tokenizer, +} from "./internal/tokenizers/js.js"; diff --git a/packages/env/tests/tokenizer.test.ts b/packages/env/tests/tokenizer.test.ts index 530f24009173fd7908419b20eb2c8bbda04db888..c6b0329a2d728810b693a627c2fbdcbbf4395e24 100644 --- a/packages/env/tests/tokenizer.test.ts +++ b/packages/env/tests/tokenizer.test.ts @@ -1,11 +1,21 @@ import { describe, expect, it } from "vitest"; -import { tokenizers } from "../src/tokenizers/node.js"; +import { tokenizers as fallbackTokenizers } from "../src/internal/tokenizers/js.js"; +import { tokenizers as nodeTokenizers } from "../src/internal/tokenizers/node.js"; -describe("tokenizer", () => { +describe("node tokenizer", () => { it("should tokenize text", () => { - const tokenizer = tokenizers.tokenizer(); + const tokenizer = nodeTokenizers.tokenizer(); expect(tokenizer.decode(tokenizer.encode("hello world"))).toBe( "hello world", ); }); + + it("should have same result as fallback tokenizer", () => { + const nodeTokenizer = nodeTokenizers.tokenizer(); + const fallbackTokenizer = fallbackTokenizers.tokenizer(); + const text = "hello world"; + expect(nodeTokenizer.decode(nodeTokenizer.encode(text))).toBe( + fallbackTokenizer.decode(fallbackTokenizer.encode(text)), + ); + }); }); diff --git a/packages/providers/openai/src/embedding.ts b/packages/providers/openai/src/embedding.ts index 26c40849cff57d7f9f5836fb58e7bfab7dd849f0..345b76573ab5277f89f7c2df67db6f7dc62cee1a 100644 --- a/packages/providers/openai/src/embedding.ts +++ 
b/packages/providers/openai/src/embedding.ts @@ -1,5 +1,6 @@ import { BaseEmbedding } from "@llamaindex/core/embeddings"; -import { getEnv, Tokenizers } from "@llamaindex/env"; +import { getEnv } from "@llamaindex/env"; +import { Tokenizers } from "@llamaindex/env/tokenizers"; import type { AzureClientOptions, AzureOpenAI as AzureOpenAILLM, diff --git a/packages/providers/openai/src/llm.ts b/packages/providers/openai/src/llm.ts index ed025a82df74838b147329d3b0cc6fc5b877ba44..b3e03a94f97506869f10f1471a3264f425316142 100644 --- a/packages/providers/openai/src/llm.ts +++ b/packages/providers/openai/src/llm.ts @@ -14,7 +14,8 @@ import { type ToolCallLLMMessageOptions, } from "@llamaindex/core/llms"; import { extractText } from "@llamaindex/core/utils"; -import { getEnv, Tokenizers } from "@llamaindex/env"; +import { getEnv } from "@llamaindex/env"; +import { Tokenizers } from "@llamaindex/env/tokenizers"; import type { AzureClientOptions, AzureOpenAI as AzureOpenAILLM,