diff --git a/package.json b/package.json index 5bf808928028a6b3065e1ed206c16e13b796e777..8e9e9f49ac3503cdc6052638e4a1bc05c6037953 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,8 @@ "prettier": "^2.5.1", "prettier-plugin-tailwindcss": "^0.3.0", "ts-jest": "^29.1.0", - "turbo": "latest" + "turbo": "latest", + "wink-nlp": "latest" }, "packageManager": "pnpm@7.15.0", "name": "llamascript" diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index c80e454c87566148d5b3a943c93cd8689a0d443f..cdcf8677b5193b53c436e21910912c5e20a35d07 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -9,6 +9,9 @@ class SentenceSplitter { private _chunk_overlap: number; private tokenizer: any; private _backup_separators: string[]; + private _paragraph_separator: string | undefined; + private _chunking_tokenizer_fn: any; + private _secondary_chunking_regex: string; // private _callback_manager: any; constructor( @@ -35,25 +38,38 @@ class SentenceSplitter { // this._callback_manager = callback_manager || new CallbackManager([]); if (chunking_tokenizer_fn == undefined) { - // use default tokenizer that splits by sentences - const winkNLP = require("wink-nlp"); - // Load "its" helper to extract item properties. - const its = require( 'wink-nlp/src/its.js' ); - // Load english language model — light version. - const model = require( 'wink-eng-lite-model' ); - // Instantiate winkNLP. - const nlp = winkNLP( model ); - - // Input text - const text = 'AI Inc. is focussing on AI. It is based in the U.S.A. It was started on 06.12.2007.'; - // Read text - const doc = nlp.readDoc( text ); - // Extract sentences from the data - const sentences = doc.sentences().out(); + + // define a callable mapping a string to a list of strings + const default_chunking_tokenizer_fn = (text: string) => { + var result = text.match(/[^.?!]+[.!?]+[\])'"`’â€]*|.+/g); + return result + }; + + chunking_tokenizer_fn = default_chunking_tokenizer_fn; + } + + if (tokenizer == undefined) { + const tiktoken = require('tiktoken-node') + let enc = new tiktoken.getEncoding("gpt-2") + const default_tokenizer = (text: string) => { + return enc.encode(text) + } + tokenizer = default_tokenizer } + + this._paragraph_separator = paragraph_separator; + this._chunking_tokenizer_fn = chunking_tokenizer_fn; + this._secondary_chunking_regex = secondary_chunking_regex; } + splitText(text: string, extra_info_str?: string): string[] { + const text_splits = this.splitTextWithOverlaps(text); + const chunks = text_splits.map((text_split) => text_split.text_chunk); + return chunks; + } + + } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0dd717a424f7be485e041adf75188102f36f7b99..159b6f8d3889030ddf6a328b632cb15912946cc5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -35,6 +35,9 @@ importers: turbo: specifier: latest version: 1.10.3 + wink-nlp: + specifier: latest + version: 1.14.1 apps/docs: dependencies: @@ -116,9 +119,6 @@ importers: packages/core: dependencies: - '@vespaiach/axios-fetch-adapter': - specifier: ^0.3.1 - version: 0.3.1(axios@0.26.1) axios: specifier: ^0.26.1 version: 0.26.1 @@ -1207,14 +1207,6 @@ packages: eslint-visitor-keys: 3.4.0 dev: false - /@vespaiach/axios-fetch-adapter@0.3.1(axios@0.26.1): - resolution: {integrity: sha512-+1F52VWXmQHSRFSv4/H0wtnxfvjRMPK5531e880MIjypPdUSX6QZuoDgEVeCE1vjhzDdxCVX7rOqkub7StEUwQ==} - peerDependencies: - axios: '>=0.26.0' - dependencies: - axios: 0.26.1 - dev: false - /acorn-jsx@5.3.2(acorn@7.4.1): resolution: {integrity: sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==} peerDependencies: @@ -5114,6 +5106,10 @@ packages: dependencies: isexe: 2.0.0 + /wink-nlp@1.14.1: + resolution: {integrity: sha512-RIdUZI3ei3OB6OY5f3jNo74fmsfPV7cfwiJ2fvBM1xzGnnl2CjRJmwGwsO04n0xl28vDTtxj6AlhIb74XQLoqQ==} + dev: true + /word-wrap@1.2.3: resolution: {integrity: sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==} engines: {node: '>=0.10.0'}