From 20498a2bce11a447c8b0a7bc2cbc01c11dd87f47 Mon Sep 17 00:00:00 2001 From: Marcus Schiesser <mail@marcusschiesser.de> Date: Tue, 25 Jun 2024 18:51:08 +0200 Subject: [PATCH] fix: use tiktoken instead of tiktoken/lite (#967) --- .changeset/odd-insects-vanish.md | 2 +- packages/env/src/index.workerd.ts | 2 +- packages/env/src/tokenizers/node.ts | 10 +++------- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.changeset/odd-insects-vanish.md b/.changeset/odd-insects-vanish.md index 542af1e97..ac0fc1a82 100644 --- a/.changeset/odd-insects-vanish.md +++ b/.changeset/odd-insects-vanish.md @@ -2,4 +2,4 @@ "@llamaindex/env": patch --- -Fix TypeError: Module "..." needs an import attribute of "type: json" +Use tiktoken instead of tiktoken/lite and disable WASM tiktoken for non-Node environments diff --git a/packages/env/src/index.workerd.ts b/packages/env/src/index.workerd.ts index 1b0d68386..4f4a5a9f4 100644 --- a/packages/env/src/index.workerd.ts +++ b/packages/env/src/index.workerd.ts @@ -13,4 +13,4 @@ export function getEnv(name: string): string | undefined { return INTERNAL_ENV[name]; } -export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/node.js"; +export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js"; diff --git a/packages/env/src/tokenizers/node.ts b/packages/env/src/tokenizers/node.ts index 4a6e6c9f7..0485194aa 100644 --- a/packages/env/src/tokenizers/node.ts +++ b/packages/env/src/tokenizers/node.ts @@ -1,18 +1,14 @@ // Note: This is using th WASM implementation of tiktoken which is 60x faster -import cl100k_base from "tiktoken/encoders/cl100k_base.json" with { type: "json" }; -import { Tiktoken } from "tiktoken/lite"; import type { Tokenizer } from "./types.js"; import { Tokenizers } from "./types.js"; +import { get_encoding } from "tiktoken"; + class TokenizerSingleton { private defaultTokenizer: Tokenizer; constructor() { - const encoding = new Tiktoken( - cl100k_base.bpe_ranks, - cl100k_base.special_tokens, - cl100k_base.pat_str, - ); + const encoding = get_encoding("cl100k_base"); this.defaultTokenizer = { encode: (text: string) => { -- GitLab