Skip to content
Snippets Groups Projects
Commit fba49b80 authored by V4N
Browse files

replace tiktoken with js-tiktoken

parent 6e0ee9ec
Branches
Tags
No related merge requests found
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
"dependencies": { "dependencies": {
"@anthropic-ai/sdk": "^0.8.1", "@anthropic-ai/sdk": "^0.8.1",
"@notionhq/client": "^2.2.13", "@notionhq/client": "^2.2.13",
"js-tiktoken": "^1.0.7",
"lodash": "^4.17.21", "lodash": "^4.17.21",
"mammoth": "^1.6.0", "mammoth": "^1.6.0",
"md-utils-ts": "^2.0.0", "md-utils-ts": "^2.0.0",
...@@ -17,7 +18,6 @@ ...@@ -17,7 +18,6 @@
"rake-modified": "^1.0.8", "rake-modified": "^1.0.8",
"replicate": "^0.20.1", "replicate": "^0.20.1",
"string-strip-html": "^13.4.3", "string-strip-html": "^13.4.3",
"tiktoken": "^1.0.10",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"wink-nlp": "^1.14.3" "wink-nlp": "^1.14.3"
}, },
......
import cl100k_base from "tiktoken/encoders/cl100k_base.json"; import { encodingForModel, TiktokenModel } from "js-tiktoken";
import { Tiktoken } from "tiktoken/lite";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Event, EventTag, EventType } from "./callbacks/CallbackManager"; import { Event, EventTag, EventType } from "./callbacks/CallbackManager";
...@@ -18,18 +17,17 @@ class GlobalsHelper { ...@@ -18,18 +17,17 @@ class GlobalsHelper {
} | null = null; } | null = null;
private initDefaultTokenizer() { private initDefaultTokenizer() {
const encoding = new Tiktoken( const encoding = encodingForModel("text-embedding-ada-002"); // cl100k_base
cl100k_base.bpe_ranks,
cl100k_base.special_tokens,
cl100k_base.pat_str,
);
this.defaultTokenizer = { this.defaultTokenizer = {
encode: (text: string) => { encode: (text: string) => {
return encoding.encode(text); return new Uint32Array(encoding.encode(text));
}, },
decode: (tokens: Uint32Array) => { decode: (tokens: Uint32Array) => {
return new TextDecoder().decode(encoding.decode(tokens)); const numberArray = Array.from(tokens);
const text = encoding.decode(numberArray);
const uint8Array = new TextEncoder().encode(text);
return new TextDecoder().decode(uint8Array);
}, },
}; };
} }
...@@ -41,10 +39,10 @@ class GlobalsHelper { ...@@ -41,10 +39,10 @@ class GlobalsHelper {
if (!this.defaultTokenizer) { if (!this.defaultTokenizer) {
this.initDefaultTokenizer(); this.initDefaultTokenizer();
} }
return this.defaultTokenizer!.encode.bind(this.defaultTokenizer); return this.defaultTokenizer!.encode.bind(this.defaultTokenizer);
} }
tokenizerDecoder(encoding?: string) { tokenizerDecoder(encoding?: string) {
if (encoding && encoding !== Tokenizers.CL100K_BASE) { if (encoding && encoding !== Tokenizers.CL100K_BASE) {
throw new Error(`Tokenizer encoding ${encoding} not yet supported`); throw new Error(`Tokenizer encoding ${encoding} not yet supported`);
...@@ -52,7 +50,7 @@ class GlobalsHelper { ...@@ -52,7 +50,7 @@ class GlobalsHelper {
if (!this.defaultTokenizer) { if (!this.defaultTokenizer) {
this.initDefaultTokenizer(); this.initDefaultTokenizer();
} }
return this.defaultTokenizer!.decode.bind(this.defaultTokenizer); return this.defaultTokenizer!.decode.bind(this.defaultTokenizer);
} }
......
...@@ -73,6 +73,7 @@ describe("SentenceSplitter", () => { ...@@ -73,6 +73,7 @@ describe("SentenceSplitter", () => {
let splits = sentenceSplitter.splitText( let splits = sentenceSplitter.splitText(
"This is a sentence. This is another sentence. 1.0", "This is a sentence. This is another sentence. 1.0",
); );
expect(splits).toEqual([ expect(splits).toEqual([
"This is a sentence.", "This is a sentence.",
"This is another sentence.", "This is another sentence.",
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment