diff --git a/packages/core/package.json b/packages/core/package.json index 5d04303ac8ac5af8065f079c84256e2403facc66..881f4686d800fe5ac72464077730642ee4afc3b1 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -9,6 +9,7 @@ "@notionhq/client": "^2.2.14", "@xenova/transformers": "^2.10.0", "assemblyai": "^4.0.0", + "compromise": "^14.10.1", "crypto-js": "^4.2.0", "file-type": "^18.7.0", "js-tiktoken": "^1.0.8", diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts index 4931bcd0d9c04376ea67dc7f4ab5acf5d5cf4ce0..36708004c59ea5931dfdc17acc99336dee2bcc20 100644 --- a/packages/core/src/TextSplitter.ts +++ b/packages/core/src/TextSplitter.ts @@ -1,3 +1,4 @@ +import nlp from 'compromise' import { EOL } from 'node:os' // GitHub translated import { globalsHelper } from "./GlobalsHelper"; @@ -18,28 +19,32 @@ class TextSplit { type SplitRep = { text: string; numTokens: number }; -/** - * Tokenizes sentences. Suitable for English and most European languages. - * @param text - * @returns - */ -export const englishSentenceTokenizer = (text: string) => { - // The first part is a lazy match for any character. - return text.match(/.+?[.?!]+[\])'"`’”]*(?:\s|$)|.+/g); +export const defaultSentenceTokenizer = (text: string): string[] => { + return nlp(text).sentences().json().map((sentence: any) => sentence.text); }; +// Refs: https://github.com/fxsjy/jieba/issues/575#issuecomment-359637511 +const resentencesp = /([﹒﹔﹖﹗.;。!?]["’”」』]{0,2}|:(?=["‘“「『]{1,2}|$))/; /** - * Tokenizes sentences. Suitable for Chinese, Japanese, and Korean. + * Tokenizes sentences. Suitable for Chinese, Japanese, and Korean. Use instead of `defaultSentenceTokenizer`. * @param text - * @returns + * @returns string[] */ -export const cjkSentenceTokenizer = (text: string) => { - // Accepts english style sentence endings with space and - // CJK style sentence endings with no space. - return text.match( - /.+?[.?!]+[\])'"`’”]*(?:\s|$)|.+?[。?!]+[\])'"`’”]*(?:\s|$)?|.+/g, - ); -}; +export function cjkSentenceTokenizer(sentence: string): string[] { + const slist = []; + const parts = sentence.split(resentencesp); + + for (let i = 0; i < parts.length; i++) { + const part = parts[i]; + if (resentencesp.test(part) && slist.length > 0) { + slist[slist.length - 1] += part; + } else if (part) { + slist.push(part); + } + } + + return slist.filter((s) => s.length > 0); +} export const defaultParagraphSeparator = EOL + EOL + EOL @@ -57,7 +62,7 @@ export class SentenceSplitter { private tokenizer: any; private tokenizerDecoder: any; private paragraphSeparator: string; - private chunkingTokenizerFn: (text: string) => RegExpMatchArray | null; + private chunkingTokenizerFn: (text: string) => string[]; private splitLongSentences: boolean; constructor(options?: { @@ -66,7 +71,7 @@ export class SentenceSplitter { tokenizer?: any; tokenizerDecoder?: any; paragraphSeparator?: string; - chunkingTokenizerFn?: (text: string) => RegExpMatchArray | null; + chunkingTokenizerFn?: (text: string) => string[]; splitLongSentences?: boolean; }) { const { @@ -75,7 +80,7 @@ export class SentenceSplitter { tokenizer = null, tokenizerDecoder = null, paragraphSeparator = defaultParagraphSeparator, - chunkingTokenizerFn = undefined, + chunkingTokenizerFn, splitLongSentences = false, } = options ?? {}; @@ -93,7 +98,7 @@ export class SentenceSplitter { tokenizerDecoder ?? globalsHelper.tokenizerDecoder(); this.paragraphSeparator = paragraphSeparator; - this.chunkingTokenizerFn = chunkingTokenizerFn ?? englishSentenceTokenizer; + this.chunkingTokenizerFn = chunkingTokenizerFn ?? defaultSentenceTokenizer; this.splitLongSentences = splitLongSentences; } @@ -218,15 +223,16 @@ export class SentenceSplitter { curChunkTokens + newSentenceSplits[i].numTokens > effectiveChunkSize ) { - // push curent doc list to docs - docs.push( - new TextSplit( - curChunkSentences - .map((sentence) => sentence.text) - .join(" ") - .trim(), - ), - ); + if (curChunkSentences.length > 0) { + // push curent doc list to docs + docs.push( + new TextSplit( + curChunkSentences.map((sentence) => sentence.text). + join(" "). + trim(), + ), + ); + } const lastChunkSentences = curChunkSentences; diff --git a/packages/core/src/tests/TextSplitter.test.ts b/packages/core/src/tests/TextSplitter.test.ts index 0b176293a363aca9af221ce8f2d9d4f6b07efc29..591fd493c1c96127458443cb4d41f1d02a878872 100644 --- a/packages/core/src/tests/TextSplitter.test.ts +++ b/packages/core/src/tests/TextSplitter.test.ts @@ -1,4 +1,4 @@ -import { SentenceSplitter, cjkSentenceTokenizer } from "../TextSplitter"; +import { cjkSentenceTokenizer, SentenceSplitter } from '../TextSplitter' describe("SentenceSplitter", () => { test("initializes", () => { @@ -88,7 +88,12 @@ describe("SentenceSplitter", () => { chunkingTokenizerFn: cjkSentenceTokenizer, }); - const splits = sentenceSplitter.splitText("这是一个句子!这是另一个句子。"); - expect(splits).toEqual(["这是一个句子!", "这是另一个句子。"]); + const splits = sentenceSplitter.splitText("此后如竟没有炬火:我便是唯一的光。倘若有了炬火,出了太阳,我们自然心悦诚服的消失。不但毫无不平,而且还要随喜赞美这炬火或太阳;因为他照了人类,连我都在内。"); + expect(splits).toEqual([ + "此后如竟没有炬火:我便是唯一的光。", + "倘若有了炬火,出了太阳,我们自然心悦诚服的消失。", + "不但毫无不平,而且还要随喜赞美这炬火或太阳;", + "因为他照了人类,连我都在内。", + ]); }); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f107577c1d269910c891467bc58fec4d602bfbae..c68f43736929dfc09837a5ffaa3da695e24daf5f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,7 +17,7 @@ importers: version: 2.26.2 '@turbo/gen': specifier: ^1.10.16 - version: 1.10.16(@types/node@20.10.5)(typescript@5.3.3) + version: 1.10.16(@types/node@18.19.2)(typescript@5.3.3) '@types/jest': specifier: ^29.5.10 version: 29.5.10 @@ -32,7 +32,7 @@ importers: version: 8.0.3 jest: specifier: ^29.7.0 - version: 29.7.0(@types/node@20.10.5) + version: 29.7.0(@types/node@18.19.2) lint-staged: specifier: ^15.1.0 version: 15.1.0 @@ -155,6 +155,9 @@ importers: assemblyai: specifier: ^4.0.0 version: 4.0.0 + compromise: + specifier: ^14.10.1 + version: 14.10.1 crypto-js: specifier: ^4.2.0 version: 4.2.0 @@ -4289,7 +4292,7 @@ packages: resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} dev: true - /@turbo/gen@1.10.16(@types/node@20.10.5)(typescript@5.3.3): + /@turbo/gen@1.10.16(@types/node@18.19.2)(typescript@5.3.3): resolution: {integrity: sha512-PzyluADjVuy5OcIi+/aRcD70OElQpRVRDdfZ9fH8G5Fv75lQcNrjd1bBGKmhjSw+g+eTEkXMGnY7s6gsCYjYTQ==} hasBin: true dependencies: @@ -4301,7 +4304,7 @@ packages: minimatch: 9.0.3 node-plop: 0.26.3 proxy-agent: 6.3.1 - ts-node: 10.9.1(@types/node@20.10.5)(typescript@5.3.3) + ts-node: 10.9.1(@types/node@18.19.2)(typescript@5.3.3) update-check: 1.5.4 validate-npm-package-name: 5.0.0 transitivePeerDependencies: @@ -4626,12 +4629,6 @@ packages: dependencies: undici-types: 5.26.5 - /@types/node@20.10.5: - resolution: {integrity: sha512-nNPsNE65wjMxEKI93yOP+NPGGBJz/PoN3kZsVLee0XMiJolxSekEVD8wRwBUBqkwc7UWop0edW50yrCQW4CyRw==} - dependencies: - undici-types: 5.26.5 - dev: true - /@types/node@20.9.0: resolution: {integrity: sha512-nekiGu2NDb1BcVofVcEKMIwzlx4NjHlcjhoxxKBNLtz15Y1z7MYf549DFvkHSId02Ax6kGwWntIBPC3l/JZcmw==} dependencies: @@ -6513,6 +6510,15 @@ packages: - supports-color dev: false + /compromise@14.10.1: + resolution: {integrity: sha512-GX91lZfJsma34HHifGlmnoWdu45PreuRFjrccCSAZq+r7Jb0wdKxKZWhyi8OSPvZ0+xk7LclDakUnd/Np57ZRQ==} + engines: {node: '>=12.0.0'} + dependencies: + efrt: 2.7.0 + grad-school: 0.0.5 + suffix-thumb: 5.0.2 + dev: false + /concat-map@0.0.1: resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} @@ -6718,7 +6724,7 @@ packages: sha.js: 2.4.11 dev: true - /create-jest@29.7.0(@types/node@20.10.5): + /create-jest@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-Adz2bdH0Vq3F53KEMJOoftQFutWCukm6J24wbPWRO4k1kMY7gS7ds/uoJkNuV8wDCtWWnuwGcJwpWcih+zEW1Q==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -6727,7 +6733,7 @@ packages: chalk: 4.1.2 exit: 0.1.2 graceful-fs: 4.2.11 - jest-config: 29.7.0(@types/node@20.10.5) + jest-config: 29.7.0(@types/node@18.19.2) jest-util: 29.7.0 prompts: 2.4.2 transitivePeerDependencies: @@ -7514,6 +7520,11 @@ packages: resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==} dev: false + /efrt@2.7.0: + resolution: {integrity: sha512-/RInbCy1d4P6Zdfa+TMVsf/ufZVotat5hCw3QXmWtjU+3pFEOvOQ7ibo3aIxyCJw2leIeAMjmPj+1SLJiCpdrQ==} + engines: {node: '>=12.0.0'} + dev: false + /electron-to-chromium@1.4.530: resolution: {integrity: sha512-rsJ9O8SCI4etS8TBsXuRfHa2eZReJhnGf5MHZd3Vo05PukWHKXhk3VQGbHHnDLa8nZz9woPCpLCMQpLGgkGNRA==} @@ -9094,6 +9105,11 @@ packages: /graceful-fs@4.2.11: resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==} + /grad-school@0.0.5: + resolution: {integrity: sha512-rXunEHF9M9EkMydTBux7+IryYXEZinRk6g8OBOGDBzo/qWJjhTxy86i5q7lQYpCLHN8Sqv1XX3OIOc7ka2gtvQ==} + engines: {node: '>=8.0.0'} + dev: false + /gradient-string@2.0.2: resolution: {integrity: sha512-rEDCuqUQ4tbD78TpzsMtt5OIf0cBCSDWSJtUDaF6JsAh+k0v9r++NzxNEG87oDZx9ZwGhD8DaezR2L/yrw0Jdw==} engines: {node: '>=10'} @@ -10222,7 +10238,7 @@ packages: - supports-color dev: true - /jest-cli@29.7.0(@types/node@20.10.5): + /jest-cli@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-OVVobw2IubN/GSYsxETi+gOe7Ka59EFMR/twOU3Jb2GnKKeMGJB5SGUUrEz3SFVmJASUdZUzy83sLNNQ2gZslg==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -10236,10 +10252,10 @@ packages: '@jest/test-result': 29.7.0 '@jest/types': 29.6.3 chalk: 4.1.2 - create-jest: 29.7.0(@types/node@20.10.5) + create-jest: 29.7.0(@types/node@18.19.2) exit: 0.1.2 import-local: 3.1.0 - jest-config: 29.7.0(@types/node@20.10.5) + jest-config: 29.7.0(@types/node@18.19.2) jest-util: 29.7.0 jest-validate: 29.7.0 yargs: 17.7.2 @@ -10250,7 +10266,7 @@ packages: - ts-node dev: true - /jest-config@29.7.0(@types/node@20.10.3): + /jest-config@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -10265,7 +10281,7 @@ packages: '@babel/core': 7.23.3 '@jest/test-sequencer': 29.7.0 '@jest/types': 29.6.3 - '@types/node': 20.10.3 + '@types/node': 18.19.2 babel-jest: 29.7.0(@babel/core@7.23.3) chalk: 4.1.2 ci-info: 3.9.0 @@ -10290,7 +10306,7 @@ packages: - supports-color dev: true - /jest-config@29.7.0(@types/node@20.10.5): + /jest-config@29.7.0(@types/node@20.10.3): resolution: {integrity: sha512-uXbpfeQ7R6TZBqI3/TxCU4q4ttk3u0PJeC+E0zbfSoSjq6bJ7buBPxzQPL0ifrkY4DNu4JUdk0ImlBUYi840eQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} peerDependencies: @@ -10305,7 +10321,7 @@ packages: '@babel/core': 7.23.3 '@jest/test-sequencer': 29.7.0 '@jest/types': 29.6.3 - '@types/node': 20.10.5 + '@types/node': 20.10.3 babel-jest: 29.7.0(@babel/core@7.23.3) chalk: 4.1.2 ci-info: 3.9.0 @@ -10619,7 +10635,7 @@ packages: merge-stream: 2.0.0 supports-color: 8.1.1 - /jest@29.7.0(@types/node@20.10.5): + /jest@29.7.0(@types/node@18.19.2): resolution: {integrity: sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} hasBin: true @@ -10632,7 +10648,7 @@ packages: '@jest/core': 29.7.0 '@jest/types': 29.6.3 import-local: 3.1.0 - jest-cli: 29.7.0(@types/node@20.10.5) + jest-cli: 29.7.0(@types/node@18.19.2) transitivePeerDependencies: - '@types/node' - babel-plugin-macros @@ -14854,6 +14870,10 @@ packages: ts-interface-checker: 0.1.13 dev: true + /suffix-thumb@5.0.2: + resolution: {integrity: sha512-I5PWXAFKx3FYnI9a+dQMWNqTxoRt6vdBdb0O+BJ1sxXCWtSoQCusc13E58f+9p4MYx/qCnEMkD5jac6K2j3dgA==} + dev: false + /supports-color@5.5.0: resolution: {integrity: sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==} engines: {node: '>=4'} @@ -15253,7 +15273,7 @@ packages: '@babel/core': 7.23.3 bs-logger: 0.2.6 fast-json-stable-stringify: 2.1.0 - jest: 29.7.0(@types/node@20.10.5) + jest: 29.7.0(@types/node@18.19.2) jest-util: 29.7.0 json5: 2.2.3 lodash.memoize: 4.1.2 @@ -15294,7 +15314,7 @@ packages: yn: 3.1.1 dev: true - /ts-node@10.9.1(@types/node@20.10.5)(typescript@5.3.3): + /ts-node@10.9.1(@types/node@18.19.2)(typescript@5.3.3): resolution: {integrity: sha512-NtVysVPkxxrwFGUUxGYhfux8k78pQB3JqYBXlLRZgdGUqTO5wU/UyHop5p70iEbGhB7q5KmiZiU0Y3KlJrScEw==} hasBin: true peerDependencies: @@ -15313,7 +15333,7 @@ packages: '@tsconfig/node12': 1.0.11 '@tsconfig/node14': 1.0.3 '@tsconfig/node16': 1.0.4 - '@types/node': 20.10.5 + '@types/node': 18.19.2 acorn: 8.11.2 acorn-walk: 8.3.0 arg: 4.1.3