Skip to content
Snippets Groups Projects
Commit fba49b80 authored by V4N
Browse files

replace tiktoken with js-tiktoken

parent 6e0ee9ec
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@
"dependencies": {
"@anthropic-ai/sdk": "^0.8.1",
"@notionhq/client": "^2.2.13",
+"js-tiktoken": "^1.0.7",
"lodash": "^4.17.21",
"mammoth": "^1.6.0",
"md-utils-ts": "^2.0.0",
......@@ -17,7 +18,6 @@
"rake-modified": "^1.0.8",
"replicate": "^0.20.1",
"string-strip-html": "^13.4.3",
-"tiktoken": "^1.0.10",
"uuid": "^9.0.1",
"wink-nlp": "^1.14.3"
},
......
-import cl100k_base from "tiktoken/encoders/cl100k_base.json";
-import { Tiktoken } from "tiktoken/lite";
+import { encodingForModel, TiktokenModel } from "js-tiktoken";
import { v4 as uuidv4 } from "uuid";
import { Event, EventTag, EventType } from "./callbacks/CallbackManager";
......@@ -18,18 +17,17 @@ class GlobalsHelper {
} | null = null;
private initDefaultTokenizer() {
-const encoding = new Tiktoken(
-cl100k_base.bpe_ranks,
-cl100k_base.special_tokens,
-cl100k_base.pat_str,
-);
+const encoding = encodingForModel("text-embedding-ada-002"); // cl100k_base
this.defaultTokenizer = {
encode: (text: string) => {
-return encoding.encode(text);
+return new Uint32Array(encoding.encode(text));
},
decode: (tokens: Uint32Array) => {
-return new TextDecoder().decode(encoding.decode(tokens));
+const numberArray = Array.from(tokens);
+const text = encoding.decode(numberArray);
+const uint8Array = new TextEncoder().encode(text);
+return new TextDecoder().decode(uint8Array);
},
};
}
......@@ -41,10 +39,10 @@ class GlobalsHelper {
if (!this.defaultTokenizer) {
this.initDefaultTokenizer();
}
return this.defaultTokenizer!.encode.bind(this.defaultTokenizer);
}
tokenizerDecoder(encoding?: string) {
if (encoding && encoding !== Tokenizers.CL100K_BASE) {
throw new Error(`Tokenizer encoding ${encoding} not yet supported`);
......@@ -52,7 +50,7 @@ class GlobalsHelper {
if (!this.defaultTokenizer) {
this.initDefaultTokenizer();
}
return this.defaultTokenizer!.decode.bind(this.defaultTokenizer);
}
......
......@@ -73,6 +73,7 @@ describe("SentenceSplitter", () => {
let splits = sentenceSplitter.splitText(
"This is a sentence. This is another sentence. 1.0",
);
expect(splits).toEqual([
"This is a sentence.",
"This is another sentence.",
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment