Skip to content
Snippets Groups Projects
Unverified commit fcc06b22, authored by Alex Yang, committed by GitHub
Browse files

fix(perf): use regex to split texts (#364)

parent 08a39790
No related branches found
No related tags found
No related merge requests found
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
"@pinecone-database/pinecone": "^1.1.2", "@pinecone-database/pinecone": "^1.1.2",
"@xenova/transformers": "^2.10.0", "@xenova/transformers": "^2.10.0",
"assemblyai": "^4.0.0", "assemblyai": "^4.0.0",
"compromise": "^14.10.1",
"file-type": "^18.7.0", "file-type": "^18.7.0",
"js-tiktoken": "^1.0.8", "js-tiktoken": "^1.0.8",
"lodash": "^4.17.21", "lodash": "^4.17.21",
......
import nlp from "compromise";
import { EOL } from "node:os"; import { EOL } from "node:os";
// GitHub translated // GitHub translated
import { globalsHelper } from "./GlobalsHelper"; import { globalsHelper } from "./GlobalsHelper";
...@@ -19,11 +18,17 @@ class TextSplit { ...@@ -19,11 +18,17 @@ class TextSplit {
type SplitRep = { text: string; numTokens: number }; type SplitRep = { text: string; numTokens: number };
const defaultregex = /[.?!][\])'"`’”]*(?:\s|$)/g;
export const defaultSentenceTokenizer = (text: string): string[] => { export const defaultSentenceTokenizer = (text: string): string[] => {
return nlp(text) const slist = [];
.sentences() const iter = text.matchAll(defaultregex);
.json() let lastIdx = 0;
.map((sentence: any) => sentence.text); for (const match of iter) {
slist.push(text.slice(lastIdx, match.index! + 1));
lastIdx = match.index! + 1;
}
slist.push(text.slice(lastIdx));
return slist.filter((s) => s.length > 0);
}; };
// Refs: https://github.com/fxsjy/jieba/issues/575#issuecomment-359637511 // Refs: https://github.com/fxsjy/jieba/issues/575#issuecomment-359637511
......
...@@ -17,7 +17,7 @@ importers: ...@@ -17,7 +17,7 @@ importers:
version: 2.27.1 version: 2.27.1
'@turbo/gen': '@turbo/gen':
specifier: ^1.11.2 specifier: ^1.11.2
version: 1.11.2(@types/node@20.10.6)(typescript@5.3.3) version: 1.11.2(@types/node@18.19.2)(typescript@5.3.3)
'@types/jest': '@types/jest':
specifier: ^29.5.11 specifier: ^29.5.11
version: 29.5.11 version: 29.5.11
...@@ -32,7 +32,7 @@ importers: ...@@ -32,7 +32,7 @@ importers:
version: 8.0.3 version: 8.0.3
jest: jest:
specifier: ^29.7.0 specifier: ^29.7.0
version: 29.7.0(@types/node@20.10.6) version: 29.7.0(@types/node@18.19.2)
lint-staged: lint-staged:
specifier: ^15.2.0 specifier: ^15.2.0
version: 15.2.0 version: 15.2.0
...@@ -158,9 +158,6 @@ importers: ...@@ -158,9 +158,6 @@ importers:
assemblyai: assemblyai:
specifier: ^4.0.0 specifier: ^4.0.0
version: 4.0.0 version: 4.0.0
compromise:
specifier: ^14.10.1
version: 14.10.1
file-type: file-type:
specifier: ^18.7.0 specifier: ^18.7.0
version: 18.7.0 version: 18.7.0
...@@ -4315,7 +4312,7 @@ packages: ...@@ -4315,7 +4312,7 @@ packages:
resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==}
dev: true dev: true
   
/@turbo/gen@1.11.2(@types/node@20.10.6)(typescript@5.3.3): /@turbo/gen@1.11.2(@types/node@18.19.2)(typescript@5.3.3):
resolution: {integrity: sha512-zV4vwedEujiAcACPnFXnKat8IqDo0EVJpMbS3W5CiokUBv35vw5PjldjqKcdh0GIiUTlriWGwRU6FZ8pzBg+kg==} resolution: {integrity: sha512-zV4vwedEujiAcACPnFXnKat8IqDo0EVJpMbS3W5CiokUBv35vw5PjldjqKcdh0GIiUTlriWGwRU6FZ8pzBg+kg==}
hasBin: true hasBin: true
dependencies: dependencies:
...@@ -4327,7 +4324,7 @@ packages: ...@@ -4327,7 +4324,7 @@ packages:
minimatch: 9.0.3 minimatch: 9.0.3
node-plop: 0.26.3 node-plop: 0.26.3
proxy-agent: 6.3.1 proxy-agent: 6.3.1
ts-node: 10.9.2(@types/node@20.10.6)(typescript@5.3.3) ts-node: 10.9.2(@types/node@18.19.2)(typescript@5.3.3)
update-check: 1.5.4 update-check: 1.5.4
validate-npm-package-name: 5.0.0 validate-npm-package-name: 5.0.0
transitivePeerDependencies: transitivePeerDependencies:
...@@ -6491,7 +6488,6 @@ packages: ...@@ -6491,7 +6488,6 @@ packages:
   
/commander@2.20.0: /commander@2.20.0:
resolution: {integrity: sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==} resolution: {integrity: sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==}
dev: true
   
/commander@2.20.3: /commander@2.20.3:
resolution: {integrity: sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==} resolution: {integrity: sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==}
...@@ -6546,15 +6542,6 @@ packages: ...@@ -6546,15 +6542,6 @@ packages:
- supports-color - supports-color
dev: false dev: false
   
/compromise@14.10.1:
resolution: {integrity: sha512-GX91lZfJsma34HHifGlmnoWdu45PreuRFjrccCSAZq+r7Jb0wdKxKZWhyi8OSPvZ0+xk7LclDakUnd/Np57ZRQ==}
engines: {node: '>=12.0.0'}
dependencies:
efrt: 2.7.0
grad-school: 0.0.5
suffix-thumb: 5.0.2
dev: false
/concat-map@0.0.1: /concat-map@0.0.1:
resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==}
   
...@@ -6760,7 +6747,7 @@ packages: ...@@ -6760,7 +6747,7 @@ packages:
sha.js: 2.4.11 sha.js: 2.4.11
dev: true dev: true
   
/create-jest@29.7.0(@types/node@20.10.6): /create-jest@29.7.0(@types/node@18.19.2):
resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==}
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
hasBin: true hasBin: true
...@@ -6769,7 +6756,7 @@ packages: ...@@ -6769,7 +6756,7 @@ packages:
chalk: 4.1.2 chalk: 4.1.2
exit: 0.1.2 exit: 0.1.2
graceful-fs: 4.2.11 graceful-fs: 4.2.11
jest-config: 29.7.0(@types/node@20.10.6) jest-config: 29.7.0(@types/node@18.19.2)
jest-util: 29.7.0 jest-util: 29.7.0
prompts: 2.4.2 prompts: 2.4.2
transitivePeerDependencies: transitivePeerDependencies:
...@@ -7553,11 +7540,6 @@ packages: ...@@ -7553,11 +7540,6 @@ packages:
resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==} resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==}
dev: false dev: false
   
/efrt@2.7.0:
resolution: {integrity: sha512-/RInbCy1d4P6Zdfa+TMVsf/ufZVotat5hCw3QXmWtjU+3pFEOvOQ7ibo3aIxyCJw2leIeAMjmPj+1SLJiCpdrQ==}
engines: {node: '>=12.0.0'}
dev: false
/electron-to-chromium@1.4.530: /electron-to-chromium@1.4.530:
resolution: {integrity: sha512-rsJ9O8SCI4etS8TBsXuRfHa2eZReJhnGf5MHZd3Vo05PukWHKXhk3VQGbHHnDLa8nZz9woPCpLCMQpLGgkGNRA==} resolution: {integrity: sha512-rsJ9O8SCI4etS8TBsXuRfHa2eZReJhnGf5MHZd3Vo05PukWHKXhk3VQGbHHnDLa8nZz9woPCpLCMQpLGgkGNRA==}
dev: false dev: false
...@@ -9113,11 +9095,6 @@ packages: ...@@ -9113,11 +9095,6 @@ packages:
/graceful-fs@4.2.11: /graceful-fs@4.2.11:
resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
   
/grad-school@0.0.5:
resolution: {integrity: sha512-rXunEHF9M9EkMydTBux7+IryYXEZinRk6g8OBOGDBzo/qWJjhTxy86i5q7lQYpCLHN8Sqv1XX3OIOc7ka2gtvQ==}
engines: {node: '>=8.0.0'}
dev: false
/gradient-string@2.0.2: /gradient-string@2.0.2:
resolution: {integrity: sha512-rEDCuqUQ4tbD78TpzsMtt5OIf0cBCSDWSJtUDaF6JsAh+k0v9r++NzxNEG87oDZx9ZwGhD8DaezR2L/yrw0Jdw==} resolution: {integrity: sha512-rEDCuqUQ4tbD78TpzsMtt5OIf0cBCSDWSJtUDaF6JsAh+k0v9r++NzxNEG87oDZx9ZwGhD8DaezR2L/yrw0Jdw==}
engines: {node: '>=10'} engines: {node: '>=10'}
...@@ -10251,7 +10228,7 @@ packages: ...@@ -10251,7 +10228,7 @@ packages:
- supports-color - supports-color
dev: true dev: true
   
/jest-cli@29.7.0(@types/node@20.10.6): /jest-cli@29.7.0(@types/node@18.19.2):
resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==}
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
hasBin: true hasBin: true
...@@ -10265,10 +10242,10 @@ packages: ...@@ -10265,10 +10242,10 @@ packages:
'@jest/test-result': 29.7.0 '@jest/test-result': 29.7.0
'@jest/types': 29.6.3 '@jest/types': 29.6.3
chalk: 4.1.2 chalk: 4.1.2
create-jest: 29.7.0(@types/node@20.10.6) create-jest: 29.7.0(@types/node@18.19.2)
exit: 0.1.2 exit: 0.1.2
import-local: 3.1.0 import-local: 3.1.0
jest-config: 29.7.0(@types/node@20.10.6) jest-config: 29.7.0(@types/node@18.19.2)
jest-util: 29.7.0 jest-util: 29.7.0
jest-validate: 29.7.0 jest-validate: 29.7.0
yargs: 17.7.2 yargs: 17.7.2
...@@ -10279,6 +10256,46 @@ packages: ...@@ -10279,6 +10256,46 @@ packages:
- ts-node - ts-node
dev: true dev: true
   
/jest-config@29.7.0(@types/node@18.19.2):
resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==}
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
peerDependencies:
'@types/node': '*'
ts-node: '>=9.0.0'
peerDependenciesMeta:
'@types/node':
optional: true
ts-node:
optional: true
dependencies:
'@babel/core': 7.23.7
'@jest/test-sequencer': 29.7.0
'@jest/types': 29.6.3
'@types/node': 18.19.2
babel-jest: 29.7.0(@babel/core@7.23.7)
chalk: 4.1.2
ci-info: 3.9.0
deepmerge: 4.3.1
glob: 7.2.3
graceful-fs: 4.2.11
jest-circus: 29.7.0
jest-environment-node: 29.7.0
jest-get-type: 29.6.3
jest-regex-util: 29.6.3
jest-resolve: 29.7.0
jest-runner: 29.7.0
jest-util: 29.7.0
jest-validate: 29.7.0
micromatch: 4.0.5
parse-json: 5.2.0
pretty-format: 29.7.0
slash: 3.0.0
strip-json-comments: 3.1.1
transitivePeerDependencies:
- babel-plugin-macros
- supports-color
dev: true
/jest-config@29.7.0(@types/node@20.10.6): /jest-config@29.7.0(@types/node@20.10.6):
resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==}
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
...@@ -10608,7 +10625,7 @@ packages: ...@@ -10608,7 +10625,7 @@ packages:
merge-stream: 2.0.0 merge-stream: 2.0.0
supports-color: 8.1.1 supports-color: 8.1.1
   
/jest@29.7.0(@types/node@20.10.6): /jest@29.7.0(@types/node@18.19.2):
resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==}
engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0}
hasBin: true hasBin: true
...@@ -10621,7 +10638,7 @@ packages: ...@@ -10621,7 +10638,7 @@ packages:
'@jest/core': 29.7.0 '@jest/core': 29.7.0
'@jest/types': 29.6.3 '@jest/types': 29.6.3
import-local: 3.1.0 import-local: 3.1.0
jest-cli: 29.7.0(@types/node@20.10.6) jest-cli: 29.7.0(@types/node@18.19.2)
transitivePeerDependencies: transitivePeerDependencies:
- '@types/node' - '@types/node'
- babel-plugin-macros - babel-plugin-macros
...@@ -14886,10 +14903,6 @@ packages: ...@@ -14886,10 +14903,6 @@ packages:
ts-interface-checker: 0.1.13 ts-interface-checker: 0.1.13
dev: true dev: true
   
/suffix-thumb@5.0.2:
resolution: {integrity: sha512-I5PWXAFKx3FYnI9a+dQMWNqTxoRt6vdBdb0O+BJ1sxXCWtSoQCusc13E58f+9p4MYx/qCnEMkD5jac6K2j3dgA==}
dev: false
/supports-color@5.5.0: /supports-color@5.5.0:
resolution: {integrity: sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==} resolution: {integrity: sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==}
engines: {node: '>=4'} engines: {node: '>=4'}
...@@ -15101,7 +15114,7 @@ packages: ...@@ -15101,7 +15114,7 @@ packages:
dependencies: dependencies:
'@jridgewell/source-map': 0.3.5 '@jridgewell/source-map': 0.3.5
acorn: 8.11.3 acorn: 8.11.3
commander: 2.20.3 commander: 2.20.0
source-map-support: 0.5.21 source-map-support: 0.5.21
dev: false dev: false
   
...@@ -15300,7 +15313,7 @@ packages: ...@@ -15300,7 +15313,7 @@ packages:
'@babel/core': 7.23.7 '@babel/core': 7.23.7
bs-logger: 0.2.6 bs-logger: 0.2.6
fast-json-stable-stringify: 2.1.0 fast-json-stable-stringify: 2.1.0
jest: 29.7.0(@types/node@20.10.6) jest: 29.7.0(@types/node@18.19.2)
jest-util: 29.7.0 jest-util: 29.7.0
json5: 2.2.3 json5: 2.2.3
lodash.memoize: 4.1.2 lodash.memoize: 4.1.2
...@@ -15341,7 +15354,7 @@ packages: ...@@ -15341,7 +15354,7 @@ packages:
yn: 3.1.1 yn: 3.1.1
dev: true dev: true
   
/ts-node@10.9.2(@types/node@20.10.6)(typescript@5.3.3): /ts-node@10.9.2(@types/node@18.19.2)(typescript@5.3.3):
resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==}
hasBin: true hasBin: true
peerDependencies: peerDependencies:
...@@ -15360,7 +15373,7 @@ packages: ...@@ -15360,7 +15373,7 @@ packages:
'@tsconfig/node12': 1.0.11 '@tsconfig/node12': 1.0.11
'@tsconfig/node14': 1.0.3 '@tsconfig/node14': 1.0.3
'@tsconfig/node16': 1.0.4 '@tsconfig/node16': 1.0.4
'@types/node': 20.10.6 '@types/node': 18.19.2
acorn: 8.11.3 acorn: 8.11.3
acorn-walk: 8.3.1 acorn-walk: 8.3.1
arg: 4.1.3 arg: 4.1.3
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment