diff --git a/.changeset/orange-ducks-pull.md b/.changeset/orange-ducks-pull.md
new file mode 100644
index 0000000000000000000000000000000000000000..93d227508d61472c9bd17751df3523666c82bb2b
--- /dev/null
+++ b/.changeset/orange-ducks-pull.md
@@ -0,0 +1,7 @@
+---
+"@llamaindex/env": patch
+"@llamaindex/core": patch
+"llamaindex": patch
+---
+
+Fix edge runtime builds by adding missing packages to the env package. Make gpt-tokenizer optional for llamaindex to reduce package size.
diff --git a/apps/next/src/content/docs/llamaindex/getting_started/setup/node.mdx b/apps/next/src/content/docs/llamaindex/getting_started/setup/node.mdx
index e49bc79c0992346e75c66451a4748b8ce296816c..de48fa881d62d64ee03ba1a6abcbd6934cb1504f 100644
--- a/apps/next/src/content/docs/llamaindex/getting_started/setup/node.mdx
+++ b/apps/next/src/content/docs/llamaindex/getting_started/setup/node.mdx
@@ -3,6 +3,8 @@ title: With Node.js/Bun/Deno
 description: In this guide, you'll learn how to use LlamaIndex with Node.js, Bun, and Deno.
 ---
 
+import { Tab, Tabs } from "fumadocs-ui/components/tabs";
+
 ## Adding environment variables
 
 By default, LlamaIndex uses OpenAI provider, which requires an API key. You can set the `OPENAI_API_KEY` environment variable to authenticate with OpenAI.
@@ -22,6 +24,26 @@ node --env-file .env your-script.js
 
 For more information, see the [How to read environment variables from Node.js](https://nodejs.org/en/learn/command-line/how-to-read-environment-variables-from-nodejs).
 
+## Performance Optimization
+
+By default, LlamaIndex uses `js-tiktoken` for tokenization. You can install `gpt-tokenizer`, which LlamaIndex then uses automatically, for a roughly 60x tokenization speedup:
+
+<Tabs groupId="install" items={["npm", "yarn", "pnpm"]} persist>
+  ```shell tab="npm"
+  npm install gpt-tokenizer
+  ```
+
+  ```shell tab="yarn"
+  yarn add gpt-tokenizer
+  ```
+
+  ```shell tab="pnpm"
+  pnpm add gpt-tokenizer
+  ```
+</Tabs>
+
+> Note: This optimization only applies to Node.js.
+
 ## TypeScript support
 
 <Card
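As a quick sanity check on the 60x figure above, here is a minimal benchmark sketch. It is not part of the patch; the sample text, repeat count, and the file name `bench-tokenizers.ts` are arbitrary choices:

```ts
// bench-tokenizers.ts (hypothetical file name) — compare cl100k_base
// encoding speed of the two backends on identical input.
import { getEncoding } from "js-tiktoken";
import { encode as gptEncode } from "gpt-tokenizer";

const text = "The quick brown fox jumps over the lazy dog. ".repeat(200);
const runs = 500;

// js-tiktoken: the pure-JS fallback that is always installed.
const encoding = getEncoding("cl100k_base");
console.time("js-tiktoken");
for (let i = 0; i < runs; i++) encoding.encode(text);
console.timeEnd("js-tiktoken");

// gpt-tokenizer: the optional fast path this patch prefers when installed.
console.time("gpt-tokenizer");
for (let i = 0; i < runs; i++) gptEncode(text);
console.timeEnd("gpt-tokenizer");
```

Actual ratios vary by machine and input; the point is only that both loops tokenize the same text, so the two timings are directly comparable.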
diff --git a/examples/package.json b/examples/package.json
index 3b36cf15272193bbf65e29947d7bb842736cd24f..6a30004e4ceef131c1561f7905029ec90dc66609 100644
--- a/examples/package.json
+++ b/examples/package.json
@@ -54,7 +54,6 @@
     "js-tiktoken": "^1.0.14",
     "llamaindex": "^0.9.6",
     "mongodb": "6.7.0",
-    "pathe": "^1.1.2",
     "postgres": "^3.4.4",
     "wikipedia": "^2.1.2",
     "zod": "^3.23.8"
diff --git a/packages/env/package.json b/packages/env/package.json
index 7161475b3bc7773a2804eba40bda7587bfc1c1b8..5a6c057305fe57fe4f679cf5e337a50a7da1f32f 100644
--- a/packages/env/package.json
+++ b/packages/env/package.json
@@ -120,31 +120,22 @@
     "@types/node": "^22.9.0",
     "@types/readable-stream": "^4.0.15",
     "bunchee": "6.3.4",
-    "gpt-tokenizer": "^2.6.2",
-    "pathe": "^1.1.2",
     "vitest": "^2.1.5"
   },
-  "peerDependencies": {
+  "dependencies": {
+    "pathe": "^1.1.2",
     "@aws-crypto/sha256-js": "^5.2.0",
+    "js-tiktoken": "^1.0.12"
+  },
+  "peerDependencies": {
     "@huggingface/transformers": "^3.0.2",
-    "gpt-tokenizer": "^2.5.0",
-    "js-tiktoken": "^1.0.12",
-    "pathe": "^1.1.2"
+    "gpt-tokenizer": "^2.5.0"
   },
   "peerDependenciesMeta": {
-    "@aws-crypto/sha256-js": {
-      "optional": true
-    },
     "@huggingface/transformers": {
       "optional": true
     },
-    "pathe": {
-      "optional": true
-    },
-    "tiktoken": {
-      "optional": true
-    },
-    "js-tiktoken": {
+    "gpt-tokenizer": {
       "optional": true
     }
   }
diff --git a/packages/env/src/internal/tokenizers/node.ts b/packages/env/src/internal/tokenizers/node.ts
index 0ccd2b02f7d59b2c39a894fe930628659e40f60f..0ab5b01c4dc5dc38006c857e8efd28bfd45117b1 100644
--- a/packages/env/src/internal/tokenizers/node.ts
+++ b/packages/env/src/internal/tokenizers/node.ts
@@ -1,20 +1,48 @@
+import { getEncoding } from "js-tiktoken";
 import type { Tokenizer } from "./types.js";
 import { Tokenizers } from "./types.js";
-import cl100kBase from "gpt-tokenizer";
+function tryLoadGptTokenizer() {
+  try {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require("gpt-tokenizer"); // using require for CommonJS compatibility
+  } catch (e) {
+    return null;
+  }
+}
+
+const gptTokenizerModule = tryLoadGptTokenizer();
 
 class TokenizerSingleton {
   #defaultTokenizer: Tokenizer;
 
   constructor() {
-    this.#defaultTokenizer = {
-      encode: (text: string): Uint32Array => {
-        return new Uint32Array(cl100kBase.encode(text));
-      },
-      decode: (tokens: Uint32Array) => {
-        return cl100kBase.decode(tokens);
-      },
-    };
+    // Use gpt-tokenizer if available, otherwise fall back to js-tiktoken
+    if (gptTokenizerModule) {
+      this.#defaultTokenizer = {
+        encode: (text: string): Uint32Array => {
+          return new Uint32Array(gptTokenizerModule.encode(text));
+        },
+        decode: (tokens: Uint32Array): string => {
+          return gptTokenizerModule.decode(Array.from(tokens));
+        },
+      };
+    } else {
+      // Fall back to js-tiktoken, which is always available
+      // Note: js-tiktoken is roughly 60x slower than gpt-tokenizer
+      const encoding = getEncoding("cl100k_base");
+      this.#defaultTokenizer = {
+        encode: (text: string) => {
+          return new Uint32Array(encoding.encode(text));
+        },
+        decode: (tokens: Uint32Array) => {
+          const numberArray = Array.from(tokens);
+          const text = encoding.decode(numberArray);
+          const uint8Array = new TextEncoder().encode(text); // UTF-8 round-trip sanitizes lone surrogates
+          return new TextDecoder().decode(uint8Array);
+        },
+      };
+    }
   }
 
   tokenizer(encoding?: Tokenizers): Tokenizer {
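For context, this is how downstream code consumes the tokenizer regardless of which backend was selected at load time. A minimal sketch, assuming the module's singleton is exported as `tokenizers` and the `Tokenizers` enum exposes a `CL100K_BASE` member (both outside this hunk):

```ts
import { tokenizers, Tokenizers } from "@llamaindex/env";

// The same call works whether the optional gpt-tokenizer dependency is
// installed (fast path) or the js-tiktoken fallback was selected.
const tokenizer = tokenizers.tokenizer(Tokenizers.CL100K_BASE);

const tokens = tokenizer.encode("hello world"); // Uint32Array of token ids
console.log(tokens.length);
console.log(tokenizer.decode(tokens)); // "hello world"
```

Because the `require()` probe runs once at module load, the branch cost is paid a single time; every later `encode`/`decode` call goes straight to whichever backend was captured in the singleton.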
diff --git a/packages/experimental/package.json b/packages/experimental/package.json
index 28841136bcda26b57419ae972f84c00f76ac5b1e..d019f6a90d4789b76ea0a5fcf3f6c9f3fea05777 100644
--- a/packages/experimental/package.json
+++ b/packages/experimental/package.json
@@ -55,12 +55,10 @@
     "dev": "concurrently \"pnpm run build:esm --watch\" \"pnpm run build:cjs --watch\" \"pnpm run build:type --watch\""
   },
   "devDependencies": {
-    "@aws-crypto/sha256-js": "^5.2.0",
     "@swc/cli": "^0.5.0",
     "@swc/core": "^1.9.2",
     "@types/jsonpath": "^0.2.4",
-    "concurrently": "^9.1.0",
-    "pathe": "^1.1.2"
+    "concurrently": "^9.1.0"
   },
   "dependencies": {
     "@types/lodash": "^4.17.7",
diff --git a/packages/llamaindex/package.json b/packages/llamaindex/package.json
index 616a609ab108eff8bcf7fa12f13e8c8057e2e156..09fb7f0edee57de47a75bdde92070e384d51092b 100644
--- a/packages/llamaindex/package.json
+++ b/packages/llamaindex/package.json
@@ -30,8 +30,7 @@
     "@types/node": "^22.9.0",
     "ajv": "^8.17.1",
     "lodash": "^4.17.21",
-    "magic-bytes.js": "^1.10.0",
-    "gpt-tokenizer": "^2.6.2"
+    "magic-bytes.js": "^1.10.0"
   },
   "devDependencies": {
     "@swc/cli": "^0.5.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7dabf5bdd0a1b40bc0b9951679fb3a40340043f1..771b61465b86579d4588c7856e2674c2925fbea4 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -724,9 +724,6 @@ importers:
       mongodb:
         specifier: 6.7.0
         version: 6.7.0(@aws-sdk/credential-providers@3.744.0)(socks@2.8.4)
-      pathe:
-        specifier: ^1.1.2
-        version: 1.1.2
       postgres:
         specifier: ^3.4.4
         version: 3.4.5
@@ -1002,9 +999,15 @@ importers:
       '@aws-crypto/sha256-js':
         specifier: ^5.2.0
         version: 5.2.0
+      gpt-tokenizer:
+        specifier: ^2.5.0
+        version: 2.8.1
       js-tiktoken:
         specifier: ^1.0.12
         version: 1.0.18
+      pathe:
+        specifier: ^1.1.2
+        version: 1.1.2
     devDependencies:
       '@huggingface/transformers':
         specifier: ^3.0.2
@@ -1018,12 +1021,6 @@
       bunchee:
         specifier: 6.3.4
         version: 6.3.4(patch_hash=pavboztthlgni7m5gzw7643oru)(typescript@5.7.3)
-      gpt-tokenizer:
-        specifier: ^2.6.2
-        version: 2.8.1
-      pathe:
-        specifier: ^1.1.2
-        version: 1.1.2
       vitest:
         specifier: ^2.1.5
         version: 2.1.5(@edge-runtime/vm@4.0.4)(@types/node@22.9.0)(happy-dom@15.11.7)(msw@2.7.0(@types/node@22.9.0)(typescript@5.7.3))(terser@5.38.2)
@@ -1046,9 +1043,6 @@ importers:
         specifier: ^4.17.21
         version: 4.17.21
     devDependencies:
-      '@aws-crypto/sha256-js':
-        specifier: ^5.2.0
-        version: 5.2.0
       '@swc/cli':
         specifier: ^0.5.0
         version: 0.5.2(@swc/core@1.10.15(@swc/helpers@0.5.15))(chokidar@3.6.0)
@@ -1061,9 +1055,6 @@
       concurrently:
         specifier: ^9.1.0
         version: 9.1.2
-      pathe:
-        specifier: ^1.1.2
-        version: 1.1.2
 
   packages/llamaindex:
     dependencies:
@@ -1094,9 +1085,6 @@
       ajv:
         specifier: ^8.17.1
         version: 8.17.1
-      gpt-tokenizer:
-        specifier: ^2.6.2
-        version: 2.8.1
       lodash:
         specifier: ^4.17.21
         version: 4.17.21
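Finally, to confirm which backend a given installation will pick up, the optional dependency can be probed the same way the patched module does. A hypothetical snippet, not part of the diff:

```ts
// Node.js only: require.resolve throws when the package is not installed.
function hasGptTokenizer(): boolean {
  try {
    require.resolve("gpt-tokenizer");
    return true;
  } catch {
    return false;
  }
}

console.log(
  hasGptTokenizer()
    ? "fast path: gpt-tokenizer"
    : "fallback: js-tiktoken",
);
```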