From fcc06b227a70f9942a1d54a528559011c1b1c8cb Mon Sep 17 00:00:00 2001 From: Alex Yang <himself65@outlook.com> Date: Wed, 10 Jan 2024 17:27:50 -0600 Subject: [PATCH] fix(perf): use regex to spilt texts (#364) --- packages/core/package.json | 1 - packages/core/src/TextSplitter.ts | 15 +++-- pnpm-lock.yaml | 97 ++++++++++++++++++------------- 3 files changed, 65 insertions(+), 48 deletions(-) diff --git a/packages/core/package.json b/packages/core/package.json index 1f259327d..493d7049d 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -10,7 +10,6 @@ "@pinecone-database/pinecone": "^1.1.2", "@xenova/transformers": "^2.10.0", "assemblyai": "^4.0.0", - "compromise": "^14.10.1", "file-type": "^18.7.0", "js-tiktoken": "^1.0.8", "lodash": "^4.17.21", diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index f6d2a831f..e4a349d6b 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -1,4 +1,3 @@ -import nlp from "compromise"; import { EOL } from "node:os"; // GitHub translated import { globalsHelper } from "./GlobalsHelper"; @@ -19,11 +18,17 @@ class TextSplit { type SplitRep = { text: string; numTokens: number }; +const defaultregex = /[.?!][\])'"`’â€]*(?:\s|$)/g; export const defaultSentenceTokenizer = (text: string): string[] => { - return nlp(text) - .sentences() - .json() - .map((sentence: any) => sentence.text); + const slist = []; + const iter = text.matchAll(defaultregex); + let lastIdx = 0; + for (const match of iter) { + slist.push(text.slice(lastIdx, match.index! + 1)); + lastIdx = match.index! + 1; + } + slist.push(text.slice(lastIdx)); + return slist.filter((s) => s.length > 0); }; // Refs: https://github.com/fxsjy/jieba/issues/575#issuecomment-359637511 diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 513f0474c..cb1ee0bed 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,7 +17,7 @@ importers: version: 2.27.1 '@turbo/gen': specifier: ^1.11.2 - version: 1.11.2(@types/node@20.10.6)(typescript@5.3.3) + version: 1.11.2(@types/node@18.19.2)(typescript@5.3.3) '@types/jest': specifier: ^29.5.11 version: 29.5.11 @@ -32,7 +32,7 @@ importers: version: 8.0.3 jest: specifier: ^29.7.0 - version: 29.7.0(@types/node@20.10.6) + version: 29.7.0(@types/node@18.19.2) lint-staged: specifier: ^15.2.0 version: 15.2.0 @@ -158,9 +158,6 @@ importers: assemblyai: specifier: ^4.0.0 version: 4.0.0 - compromise: - specifier: ^14.10.1 - version: 14.10.1 file-type: specifier: ^18.7.0 version: 18.7.0 @@ -4315,7 +4312,7 @@ packages: resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} dev: true - /@turbo/gen@1.11.2(@types/node@20.10.6)(typescript@5.3.3): + /@turbo/gen@1.11.2(@types/node@18.19.2)(typescript@5.3.3): resolution: {integrity: sha512-zV4vwedEujiAcACPnFXnKat8IqDo0EVJpMbS3W5CiokUBv35vw5PjldjqKcdh0GIiUTlriWGwRU6FZ8pzBg+kg==} hasBin: true dependencies: @@ -4327,7 +4324,7 @@ packages: minimatch: 9.0.3 node-plop: 0.26.3 proxy-agent: 6.3.1 - ts-node: 10.9.2(@types/node@20.10.6)(typescript@5.3.3) + ts-node: 10.9.2(@types/node@18.19.2)(typescript@5.3.3) update-check: 1.5.4 validate-npm-package-name: 5.0.0 transitivePeerDependencies: @@ -6491,7 +6488,6 @@ packages: /commander@2.20.0: resolution: {integrity: sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==} - dev: true /commander@2.20.3: resolution: {integrity: sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==} @@ -6546,15 +6542,6 @@ packages: - supports-color dev: false - /compromise@14.10.1: - resolution: {integrity: sha512-GX91lZfJsma34HHifGlmnoWdu45PreuRFjrccCSAZq+r7Jb0wdKxKZWhyi8OSPvZ0+xk7LclDakUnd/Np57ZRQ==} - engines: {node: '>=12.0.0'} - dependencies: - efrt: 2.7.0 - grad-school: 0.0.5 - suffix-thumb: 5.0.2 - dev: false - /concat-map@0.0.1: resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} @@ -6760,7 +6747,7 @@ packages: sha.js: 2.4.11 dev: true - /create-jest@29.7.0(@types/node@20.10.6): + /create-jest@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -6769,7 +6756,7 @@ packages: chalk: 4.1.2 exit: 0.1.2 graceful-fs: 4.2.11 - jest-config: 29.7.0(@types/node@20.10.6) + jest-config: 29.7.0(@types/node@18.19.2) jest-util: 29.7.0 prompts: 2.4.2 transitivePeerDependencies: @@ -7553,11 +7540,6 @@ packages: resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==} dev: false - /efrt@2.7.0: - resolution: {integrity: sha512-/RInbCy1d4P6Zdfa+TMVsf/ufZVotat5hCw3QXmWtjU+3pFEOvOQ7ibo3aIxyCJw2leIeAMjmPj+1SLJiCpdrQ==} - engines: {node: '>=12.0.0'} - dev: false - /electron-to-chromium@1.4.530: resolution: {integrity: sha512-rsJ9O8SCI4etS8TBsXuRfHa2eZReJhnGf5MHZd3Vo05PukWHKXhk3VQGbHHnDLa8nZz9woPCpLCMQpLGgkGNRA==} dev: false @@ -9113,11 +9095,6 @@ packages: /graceful-fs@4.2.11: resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} - /grad-school@0.0.5: - resolution: {integrity: sha512-rXunEHF9M9EkMydTBux7+IryYXEZinRk6g8OBOGDBzo/qWJjhTxy86i5q7lQYpCLHN8Sqv1XX3OIOc7ka2gtvQ==} - engines: {node: '>=8.0.0'} - dev: false - /gradient-string@2.0.2: resolution: {integrity: sha512-rEDCuqUQ4tbD78TpzsMtt5OIf0cBCSDWSJtUDaF6JsAh+k0v9r++NzxNEG87oDZx9ZwGhD8DaezR2L/yrw0Jdw==} engines: {node: '>=10'} @@ -10251,7 +10228,7 @@ packages: - supports-color dev: true - /jest-cli@29.7.0(@types/node@20.10.6): + /jest-cli@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -10265,10 +10242,10 @@ packages: '@jest/test-result': 29.7.0 '@jest/types': 29.6.3 chalk: 4.1.2 - create-jest: 29.7.0(@types/node@20.10.6) + create-jest: 29.7.0(@types/node@18.19.2) exit: 0.1.2 import-local: 3.1.0 - jest-config: 29.7.0(@types/node@20.10.6) + jest-config: 29.7.0(@types/node@18.19.2) jest-util: 29.7.0 jest-validate: 29.7.0 yargs: 17.7.2 @@ -10279,6 +10256,46 @@ packages: - ts-node dev: true + /jest-config@29.7.0(@types/node@18.19.2): + resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} + engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} + peerDependencies: + '@types/node': '*' + ts-node: '>=9.0.0' + peerDependenciesMeta: + '@types/node': + optional: true + ts-node: + optional: true + dependencies: + '@babel/core': 7.23.7 + '@jest/test-sequencer': 29.7.0 + '@jest/types': 29.6.3 + '@types/node': 18.19.2 + babel-jest: 29.7.0(@babel/core@7.23.7) + chalk: 4.1.2 + ci-info: 3.9.0 + deepmerge: 4.3.1 + glob: 7.2.3 + graceful-fs: 4.2.11 + jest-circus: 29.7.0 + jest-environment-node: 29.7.0 + jest-get-type: 29.6.3 + jest-regex-util: 29.6.3 + jest-resolve: 29.7.0 + jest-runner: 29.7.0 + jest-util: 29.7.0 + jest-validate: 29.7.0 + micromatch: 4.0.5 + parse-json: 5.2.0 + pretty-format: 29.7.0 + slash: 3.0.0 + strip-json-comments: 3.1.1 + transitivePeerDependencies: + - babel-plugin-macros + - supports-color + dev: true + /jest-config@29.7.0(@types/node@20.10.6): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} @@ -10608,7 +10625,7 @@ packages: merge-stream: 2.0.0 supports-color: 8.1.1 - /jest@29.7.0(@types/node@20.10.6): + /jest@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -10621,7 +10638,7 @@ packages: '@jest/core': 29.7.0 '@jest/types': 29.6.3 import-local: 3.1.0 - jest-cli: 29.7.0(@types/node@20.10.6) + jest-cli: 29.7.0(@types/node@18.19.2) transitivePeerDependencies: - '@types/node' - babel-plugin-macros @@ -14886,10 +14903,6 @@ packages: ts-interface-checker: 0.1.13 dev: true - /suffix-thumb@5.0.2: - resolution: {integrity: sha512-I5PWXAFKx3FYnI9a+dQMWNqTxoRt6vdBdb0O+BJ1sxXCWtSoQCusc13E58f+9p4MYx/qCnEMkD5jac6K2j3dgA==} - dev: false - /supports-color@5.5.0: resolution: {integrity: sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==} engines: {node: '>=4'} @@ -15101,7 +15114,7 @@ packages: dependencies: '@jridgewell/source-map': 0.3.5 acorn: 8.11.3 - commander: 2.20.3 + commander: 2.20.0 source-map-support: 0.5.21 dev: false @@ -15300,7 +15313,7 @@ packages: '@babel/core': 7.23.7 bs-logger: 0.2.6 fast-json-stable-stringify: 2.1.0 - jest: 29.7.0(@types/node@20.10.6) + jest: 29.7.0(@types/node@18.19.2) jest-util: 29.7.0 json5: 2.2.3 lodash.memoize: 4.1.2 @@ -15341,7 +15354,7 @@ packages: yn: 3.1.1 dev: true - /ts-node@10.9.2(@types/node@20.10.6)(typescript@5.3.3): + /ts-node@10.9.2(@types/node@18.19.2)(typescript@5.3.3): resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} hasBin: true peerDependencies: @@ -15360,7 +15373,7 @@ packages: '@tsconfig/node12': 1.0.11 '@tsconfig/node14': 1.0.3 '@tsconfig/node16': 1.0.4 - '@types/node': 20.10.6 + '@types/node': 18.19.2 acorn: 8.11.3 acorn-walk: 8.3.1 arg: 4.1.3 -- GitLab