Unverified commit 18ec1f2f authored by Alex Yang, committed by GitHub

chore: separate tokenizers (#1454)

parent b0fbd8b5
Showing changed files with 45 additions and 21 deletions
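
This commit moves the tokenizer exports (Tokenizers, tokenizers, and the Tokenizer type) out of the root of @llamaindex/env into a dedicated @llamaindex/env/tokenizers subpath, so each runtime can load only the tokenizer build it needs. A minimal migration sketch for downstream code (the encode/decode round-trip is assumed usage, not part of this diff):

// Before: tokenizers were re-exported from the package root.
// import { Tokenizers, tokenizers, type Tokenizer } from "@llamaindex/env";

// After: import them from the dedicated subpath.
import { tokenizers, type Tokenizer } from "@llamaindex/env/tokenizers";

const tokenizer: Tokenizer = tokenizers.tokenizer();
const ids = tokenizer.encode("hello world");
console.log(tokenizer.decode(ids)); // "hello world"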
// test runtime
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
import "llamaindex";
// @ts-expect-error EdgeRuntime is not defined in type
......
-import { type Tokenizers } from "@llamaindex/env";
+import type { Tokenizers } from "@llamaindex/env/tokenizers";
import type { MessageContentDetail } from "../llms";
import { BaseNode, MetadataMode, TransformComponent } from "../schema";
import { extractSingleText } from "../utils";
......
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
export function truncateMaxTokens(
tokenizer: Tokenizers,
......
-import { getEnv, type Tokenizer } from "@llamaindex/env";
+import { getEnv } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
import type { LLM } from "../llms";
import {
type CallbackManager,
......
-import { AsyncLocalStorage, type Tokenizer, tokenizers } from "@llamaindex/env";
+import { AsyncLocalStorage } from "@llamaindex/env";
+import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";
const chunkSizeAsyncLocalStorage = new AsyncLocalStorage<Tokenizer>();
let globalTokenizer: Tokenizer = tokenizers.tokenizer();
......
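
In the Settings hunk above, the default tokenizer lives in a module-level variable while AsyncLocalStorage carries per-async-scope overrides. A minimal sketch of that pattern, assuming hypothetical getTokenizer/withTokenizer helpers (the real Settings accessors may differ):

import { AsyncLocalStorage } from "@llamaindex/env";
import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";

const tokenizerStorage = new AsyncLocalStorage<Tokenizer>();
let globalTokenizer: Tokenizer = tokenizers.tokenizer();

// Reads prefer the async-scoped tokenizer, then fall back to the global default.
export function getTokenizer(): Tokenizer {
  return tokenizerStorage.getStore() ?? globalTokenizer;
}

// Runs fn with a tokenizer that shadows the global one inside this async scope only.
export function withTokenizer<T>(tokenizer: Tokenizer, fn: () => T): T {
  return tokenizerStorage.run(tokenizer, fn);
}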
-import { type Tokenizer, tokenizers } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
import {
DEFAULT_CHUNK_OVERLAP_RATIO,
DEFAULT_CONTEXT_WINDOW,
......@@ -64,7 +64,7 @@ export class PromptHelper {
this.numOutput = numOutput;
this.chunkOverlapRatio = chunkOverlapRatio;
this.chunkSizeLimit = chunkSizeLimit;
-this.tokenizer = tokenizer ?? tokenizers.tokenizer();
+this.tokenizer = tokenizer ?? Settings.tokenizer;
this.separator = separator;
}
......
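
Note the second change in this hunk: PromptHelper no longer constructs its own default tokenizer but defers to Settings.tokenizer, so a single global override reaches every helper created afterwards. A hedged usage sketch (constructor arguments elided; the exact signature is not shown in this diff):

import { PromptHelper, Settings } from "llamaindex";
import { tokenizers } from "@llamaindex/env/tokenizers";

// Configure the tokenizer once at startup...
Settings.tokenizer = tokenizers.tokenizer();

// ...and any PromptHelper built without an explicit tokenizer now uses it.
const helper = new PromptHelper();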
-import type { Tokenizers } from "@llamaindex/env";
+import type { Tokenizers } from "@llamaindex/env/tokenizers";
import type { JSONSchemaType } from "ajv";
-import type { JSONObject, JSONValue } from "../global/type";
+import type { JSONObject, JSONValue } from "../global";
/**
* @internal
......
-import { type Tokenizer, tokenizers } from "@llamaindex/env";
+import { type Tokenizer, tokenizers } from "@llamaindex/env/tokenizers";
import { Settings } from "../global";
import type { ChatMessage, LLM, MessageType } from "../llms";
import { defaultSummaryPrompt, type SummaryPrompt } from "../prompts";
......
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
import { z } from "zod";
import { Settings } from "../global";
import { sentenceSplitterSchema } from "../schema";
......
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
import { z } from "zod";
import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, Settings } from "../global";
import { MetadataAwareTextSplitter } from "./base";
......
-import type { Tokenizer } from "@llamaindex/env";
+import type { Tokenizer } from "@llamaindex/env/tokenizers";
export type SplitterParams = {
tokenizer?: Tokenizer;
......
import { truncateMaxTokens } from "@llamaindex/core/embeddings";
-import { Tokenizers, tokenizers } from "@llamaindex/env";
+import { Tokenizers, tokenizers } from "@llamaindex/env/tokenizers";
import { describe, expect, test } from "vitest";
describe("truncateMaxTokens", () => {
......
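
For context, a sketch of calling truncateMaxTokens outside the test; the (encoding, text, maxTokens) parameter order is an assumption based on the first parameter shown in the hunk above:

import { truncateMaxTokens } from "@llamaindex/core/embeddings";
import { Tokenizers } from "@llamaindex/env/tokenizers";

// Clamp an input to at most 256 tokens under the cl100k_base encoding.
const text = "some very long text ".repeat(500);
const truncated = truncateMaxTokens(Tokenizers.CL100K_BASE, text, 256);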
import { SentenceSplitter } from "@llamaindex/core/node-parser";
import { Document } from "@llamaindex/core/schema";
-import { tokenizers } from "@llamaindex/env";
+import { tokenizers } from "@llamaindex/env/tokenizers";
import { beforeEach, describe, expect, test } from "vitest";
describe("SentenceSplitter", () => {
......
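
Similarly, a sketch of threading a tokenizer into SentenceSplitter through the optional tokenizer field from SplitterParams shown earlier (the chunkSize/chunkOverlap option names are assumed, not part of this diff):

import { SentenceSplitter } from "@llamaindex/core/node-parser";
import { Document } from "@llamaindex/core/schema";
import { tokenizers } from "@llamaindex/env/tokenizers";

const splitter = new SentenceSplitter({
  chunkSize: 128,
  chunkOverlap: 16,
  tokenizer: tokenizers.tokenizer(), // token-based chunk sizing uses this tokenizer
});

const nodes = splitter.getNodesFromDocuments([
  new Document({ text: "First sentence. Second sentence. Third sentence." }),
]);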
......@@ -76,9 +76,36 @@
        "types": "./multi-model/dist/index.d.ts",
        "default": "./multi-model/dist/index.js"
      }
    },
+    "./tokenizers": {
+      "workerd": {
+        "types": "./tokenizers/dist/index.workerd.d.ts",
+        "default": "./tokenizers/dist/index.workerd.js"
+      },
+      "edge-light": {
+        "types": "./tokenizers/dist/index.edge-light.d.ts",
+        "default": "./tokenizers/dist/index.edge-light.js"
+      },
+      "browser": {
+        "types": "./tokenizers/dist/index.browser.d.ts",
+        "default": "./tokenizers/dist/index.browser.js"
+      },
+      "import": {
+        "types": "./tokenizers/dist/index.d.ts",
+        "default": "./tokenizers/dist/index.js"
+      },
+      "require": {
+        "types": "./tokenizers/dist/index.d.cts",
+        "default": "./tokenizers/dist/index.cjs"
+      },
+      "default": {
+        "types": "./tokenizers/dist/index.d.ts",
+        "default": "./tokenizers/dist/index.js"
+      }
+    }
  },
  "files": [
+    "tokenizers",
    "multi-model",
    "dist",
    "CHANGELOG.md",
......
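
The new "./tokenizers" entry relies on Node-style conditional exports: conditions are matched top to bottom, so Cloudflare Workers (workerd), Vercel Edge (edge-light), and browsers each resolve to their own build, while Node picks the ESM or CJS artifact and "default" catches everything else. Consumers use one specifier and never reference dist paths directly:

// The same specifier everywhere; the host runtime picks the matching condition:
//   workerd     -> ./tokenizers/dist/index.workerd.js
//   edge-light  -> ./tokenizers/dist/index.edge-light.js
//   browser     -> ./tokenizers/dist/index.browser.js
//   Node ESM    -> ./tokenizers/dist/index.js
//   Node CJS    -> ./tokenizers/dist/index.cjs
import { tokenizers } from "@llamaindex/env/tokenizers";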
......@@ -6,7 +6,6 @@
import "./global-check.js";
export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
export { NotSupportCurrentRuntimeClass } from "./utils/shared.js";
export * from "./web-polyfill.js";
// @ts-expect-error no type
......
......@@ -6,5 +6,4 @@
import "./global-check.js";
export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
export * from "./node-polyfill.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
export { NotSupportCurrentRuntimeClass } from "./utils/shared.js";
......@@ -35,7 +35,6 @@ export function createSHA256(): SHA256 {
}
export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/node.js";
export {
AsyncLocalStorage,
CustomEvent,
......
......@@ -16,4 +16,3 @@ export function getEnv(name: string): string | undefined {
}
export { consoleLogger, emptyLogger, type Logger } from "./logger/index.js";
-export { Tokenizers, tokenizers, type Tokenizer } from "./tokenizers/js.js";
-// Note: js-tiktoken is 60x slower than the WASM implementation - use it only for unsupported environments
+// Note: js-tiktoken is 60x slower than gpt-tokenizer
import { getEncoding } from "js-tiktoken";
import type { Tokenizer } from "./types.js";
import { Tokenizers } from "./types.js";
......
-// Note: This is using the WASM implementation of tiktoken which is 60x faster
import type { Tokenizer } from "./types.js";
import { Tokenizers } from "./types.js";
......
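
Both files expose the same Tokenizer surface over different backends: js.js wraps the pure-JS js-tiktoken (portable but slow, per the note above), while node.js binds the faster implementation. A rough sketch of the portable path using js-tiktoken's public API (the wrapper in this repo may be shaped differently):

import { getEncoding } from "js-tiktoken";

const encoding = getEncoding("cl100k_base");

// js-tiktoken encodes to a number[] and decodes back to a string.
const ids = encoding.encode("hello world");
console.log(encoding.decode(ids)); // "hello world"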