diff --git a/.changeset/quick-seas-hug.md b/.changeset/quick-seas-hug.md new file mode 100644 index 0000000000000000000000000000000000000000..683ebc437484037c85d1ec12cab560ce27caee84 --- /dev/null +++ b/.changeset/quick-seas-hug.md @@ -0,0 +1,5 @@ +--- +"@llamaindex/core": patch +--- + +chore: bump `natural` to 8.0.1 diff --git a/packages/core/package.json b/packages/core/package.json index 3a79ef3f272e46869bdc8f9d88850e0ff44a5e29..38e4df3cc0e605b52e0a0c8d5e79ff5986c59ec8 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -132,7 +132,7 @@ "devDependencies": { "ajv": "^8.16.0", "bunchee": "5.3.1", - "natural": "^7.1.0" + "natural": "^8.0.1" }, "dependencies": { "@llamaindex/env": "workspace:*", diff --git a/packages/core/src/node-parser/sentence-tokenizer-parser.js b/packages/core/src/node-parser/sentence-tokenizer-parser.js deleted file mode 100644 index ea052634362b11cd49ff532cc45ebc2b76890e90..0000000000000000000000000000000000000000 --- a/packages/core/src/node-parser/sentence-tokenizer-parser.js +++ /dev/null @@ -1,1571 +0,0 @@ -var __getOwnPropNames = Object.getOwnPropertyNames; -var cjs = (cb, mod) => - function _r() { - return ( - mod || - (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), - mod.exports - ); - }; - -// lib/natural/util/abbreviations_en.js -var require_abbreviations_en = cjs({ - "lib/natural/util/abbreviations_en.js"(exports) { - "use strict"; - var knownAbbreviations = [ - "approx.", - "appt.", - "apt.", - "A.S.A.P.", - "B.Y.O.B.", - "c/o", - "dept.", - "D.I.Y.", - "est.", - "E.T.A.", - "Inc.", - "min.", - "misc.", - "Mr.", - "Mrs.", - "no.", - "R.S.V.P.", - "tel.", - "temp.", - "vet.", - "vs.", - ]; - exports.knownAbbreviations = knownAbbreviations; - }, -}); - -// lib/natural/tokenizers/parser_sentence_tokenizer.js -var require_parser_sentence_tokenizer = cjs({ - "lib/natural/tokenizers/parser_sentence_tokenizer.js"(exports, module) { - "use strict"; - function peg$subclass(child, parent) { - function ctor() { - this.constructor = child; - } - ctor.prototype = parent.prototype; - child.prototype = new ctor(); - } - function peg$SyntaxError(message, expected, found, location) { - this.message = message; - this.expected = expected; - this.found = found; - this.location = location; - this.name = "SyntaxError"; - if (typeof Error.captureStackTrace === "function") { - Error.captureStackTrace(this, peg$SyntaxError); - } - } - peg$subclass(peg$SyntaxError, Error); - peg$SyntaxError.buildMessage = function (expected, found) { - var DESCRIBE_EXPECTATION_FNS = { - literal: function (expectation) { - return '"' + literalEscape(expectation.text) + '"'; - }, - class: function (expectation) { - var escapedParts = "", - i; - for (i = 0; i < expectation.parts.length; i++) { - escapedParts += - expectation.parts[i] instanceof Array - ? classEscape(expectation.parts[i][0]) + - "-" + - classEscape(expectation.parts[i][1]) - : classEscape(expectation.parts[i]); - } - return "[" + (expectation.inverted ? "^" : "") + escapedParts + "]"; - }, - any: function (expectation) { - return "any character"; - }, - end: function (expectation) { - return "end of input"; - }, - other: function (expectation) { - return expectation.description; - }, - }; - function hex(ch) { - return ch.charCodeAt(0).toString(16).toUpperCase(); - } - function literalEscape(s) { - return s - .replace(/\\/g, "\\\\") - .replace(/"/g, '\\"') - .replace(/\0/g, "\\0") - .replace(/\t/g, "\\t") - .replace(/\n/g, "\\n") - .replace(/\r/g, "\\r") - .replace(/[\x00-\x0F]/g, function (ch) { - return "\\x0" + hex(ch); - }) - .replace(/[\x10-\x1F\x7F-\x9F]/g, function (ch) { - return "\\x" + hex(ch); - }); - } - function classEscape(s) { - return s - .replace(/\\/g, "\\\\") - .replace(/\]/g, "\\]") - .replace(/\^/g, "\\^") - .replace(/-/g, "\\-") - .replace(/\0/g, "\\0") - .replace(/\t/g, "\\t") - .replace(/\n/g, "\\n") - .replace(/\r/g, "\\r") - .replace(/[\x00-\x0F]/g, function (ch) { - return "\\x0" + hex(ch); - }) - .replace(/[\x10-\x1F\x7F-\x9F]/g, function (ch) { - return "\\x" + hex(ch); - }); - } - function describeExpectation(expectation) { - return DESCRIBE_EXPECTATION_FNS[expectation.type](expectation); - } - function describeExpected(expected2) { - var descriptions = new Array(expected2.length), - i, - j; - for (i = 0; i < expected2.length; i++) { - descriptions[i] = describeExpectation(expected2[i]); - } - descriptions.sort(); - if (descriptions.length > 0) { - for (i = 1, j = 1; i < descriptions.length; i++) { - if (descriptions[i - 1] !== descriptions[i]) { - descriptions[j] = descriptions[i]; - j++; - } - } - descriptions.length = j; - } - switch (descriptions.length) { - case 1: - return descriptions[0]; - case 2: - return descriptions[0] + " or " + descriptions[1]; - default: - return ( - descriptions.slice(0, -1).join(", ") + - ", or " + - descriptions[descriptions.length - 1] - ); - } - } - function describeFound(found2) { - return found2 ? '"' + literalEscape(found2) + '"' : "end of input"; - } - return ( - "Expected " + - describeExpected(expected) + - " but " + - describeFound(found) + - " found." - ); - }; - function peg$parse(input, options) { - options = options !== void 0 ? options : {}; - var peg$FAILED = {}, - peg$startRuleFunctions = { s: peg$parses }, - peg$startRuleFunction = peg$parses, - peg$c0 = function (sentences) { - const result = []; - sentences.forEach((sent0) => { - sent0[0].forEach((sent1) => { - result.push(sent1); - }); - }); - return result; - }, - peg$c1 = function (sentences) { - return sentences.map((sent) => { - sent[0].push(sent[1]); - return sent[0].reduce((accu, str) => accu + str).trim(); - }); - }, - peg$c2 = function (open, sentences, close) { - const result = sentences.map((sent) => { - sent[0].push(sent[1]); - return sent[0].reduce((accu, str) => accu + str).trim(); - }); - result.unshift(open); - if (close) { - result.push(close); - } - return result; - }, - peg$c3 = function (seqs, end) { - const res = seqs.reduce((accu, seq) => accu.concat(seq)); - res.push(end); - return res; - }, - peg$c4 = function (tokens) { - const result = tokens.map((pair) => pair[0] + pair[1]); - return result; - }, - peg$c5 = function (open, tokens, end, close) { - const result = tokens.map((pair) => pair[0] + pair[1]); - result.unshift(open); - result.push(end); - result.push(close); - return result; - }, - peg$c6 = /^[ \t\n\r.?!]/, - peg$c7 = peg$classExpectation( - [" ", " ", "\n", "\r", ".", "?", "!"], - false, - false, - ), - peg$c8 = function () { - return text(); - }, - peg$c9 = /^[ \t\n\r]/, - peg$c10 = peg$classExpectation([" ", " ", "\n", "\r"], false, false), - peg$c11 = function (t) { - return t; - }, - peg$c12 = /^[^ \t\n\r!?([}"`)\]}"`0-9@]/, - peg$c13 = peg$classExpectation( - [ - " ", - " ", - "\n", - "\r", - "!", - "?", - "(", - "[", - "}", - '"', - "`", - ")", - "]", - "}", - '"', - "`", - ["0", "9"], - "@", - ], - true, - false, - ), - peg$c14 = function (word) { - const tmp = word.reduce((accu, elt) => accu + elt); - return knownAbbreviations.indexOf(tmp) > -1; - }, - peg$c15 = function (word) { - return text(); - }, - peg$c16 = /^[^ \t\n\r!?.([})\]}`"0-9@]/, - peg$c17 = peg$classExpectation( - [ - " ", - " ", - "\n", - "\r", - "!", - "?", - ".", - "(", - "[", - "}", - ")", - "]", - "}", - "`", - '"', - ["0", "9"], - "@", - ], - true, - false, - ), - peg$c18 = function () { - return text(); - }, - peg$c19 = /^[0-9]/, - peg$c20 = peg$classExpectation([["0", "9"]], false, false), - peg$c21 = peg$anyExpectation(), - peg$c22 = /^[a-z]/, - peg$c23 = peg$classExpectation([["a", "z"]], false, false), - peg$c24 = /^[@]/, - peg$c25 = peg$classExpectation(["@"], false, false), - peg$c26 = /^[.]/, - peg$c27 = peg$classExpectation(["."], false, false), - peg$c28 = "http://", - peg$c29 = peg$literalExpectation("http://", false), - peg$c30 = "https://", - peg$c31 = peg$literalExpectation("https://", false), - peg$c32 = /^[a-z0-9]/, - peg$c33 = peg$classExpectation( - [ - ["a", "z"], - ["0", "9"], - ], - false, - false, - ), - peg$c34 = /^[\/]/, - peg$c35 = peg$classExpectation(["/"], false, false), - peg$c36 = function () { - return text(); - }, - peg$c37 = /^[([{"'`\u2018]/, - peg$c38 = peg$classExpectation( - ["(", "[", "{", '"', "'", "`", "\u2018"], - false, - false, - ), - peg$c39 = /^[)\]}"'`\u2019]/, - peg$c40 = peg$classExpectation( - [")", "]", "}", '"', "'", "`", "\u2019"], - false, - false, - ), - peg$currPos = 0, - peg$savedPos = 0, - peg$posDetailsCache = [{ line: 1, column: 1 }], - peg$maxFailPos = 0, - peg$maxFailExpected = [], - peg$silentFails = 0, - peg$result; - if ("startRule" in options) { - if (!(options.startRule in peg$startRuleFunctions)) { - throw new Error( - `Can't start parsing from rule "` + options.startRule + '".', - ); - } - peg$startRuleFunction = peg$startRuleFunctions[options.startRule]; - } - function text() { - return input.substring(peg$savedPos, peg$currPos); - } - function location() { - return peg$computeLocation(peg$savedPos, peg$currPos); - } - function expected(description, location2) { - location2 = - location2 !== void 0 - ? location2 - : peg$computeLocation(peg$savedPos, peg$currPos); - throw peg$buildStructuredError( - [peg$otherExpectation(description)], - input.substring(peg$savedPos, peg$currPos), - location2, - ); - } - function error(message, location2) { - location2 = - location2 !== void 0 - ? location2 - : peg$computeLocation(peg$savedPos, peg$currPos); - throw peg$buildSimpleError(message, location2); - } - function peg$literalExpectation(text2, ignoreCase) { - return { type: "literal", text: text2, ignoreCase }; - } - function peg$classExpectation(parts, inverted, ignoreCase) { - return { type: "class", parts, inverted, ignoreCase }; - } - function peg$anyExpectation() { - return { type: "any" }; - } - function peg$endExpectation() { - return { type: "end" }; - } - function peg$otherExpectation(description) { - return { type: "other", description }; - } - function peg$computePosDetails(pos) { - var details = peg$posDetailsCache[pos], - p; - if (details) { - return details; - } else { - p = pos - 1; - while (!peg$posDetailsCache[p]) { - p--; - } - details = peg$posDetailsCache[p]; - details = { - line: details.line, - column: details.column, - }; - while (p < pos) { - if (input.charCodeAt(p) === 10) { - details.line++; - details.column = 1; - } else { - details.column++; - } - p++; - } - peg$posDetailsCache[pos] = details; - return details; - } - } - function peg$computeLocation(startPos, endPos) { - var startPosDetails = peg$computePosDetails(startPos), - endPosDetails = peg$computePosDetails(endPos); - return { - start: { - offset: startPos, - line: startPosDetails.line, - column: startPosDetails.column, - }, - end: { - offset: endPos, - line: endPosDetails.line, - column: endPosDetails.column, - }, - }; - } - function peg$fail(expected2) { - if (peg$currPos < peg$maxFailPos) { - return; - } - if (peg$currPos > peg$maxFailPos) { - peg$maxFailPos = peg$currPos; - peg$maxFailExpected = []; - } - peg$maxFailExpected.push(expected2); - } - function peg$buildSimpleError(message, location2) { - return new peg$SyntaxError(message, null, null, location2); - } - function peg$buildStructuredError(expected2, found, location2) { - return new peg$SyntaxError( - peg$SyntaxError.buildMessage(expected2, found), - expected2, - found, - location2, - ); - } - function peg$parses() { - var s0, s1, s2, s3, s4; - s0 = peg$currPos; - s1 = []; - s2 = peg$currPos; - s3 = peg$parseSentences(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - if (s2 === peg$FAILED) { - s2 = peg$currPos; - s3 = peg$parseQuotedSentences(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - s2 = peg$currPos; - s3 = peg$parseSentences(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - if (s2 === peg$FAILED) { - s2 = peg$currPos; - s3 = peg$parseQuotedSentences(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c0(s1); - } - s0 = s1; - return s0; - } - function peg$parseSentences() { - var s0, s1, s2, s3, s4; - s0 = peg$currPos; - s1 = []; - s2 = peg$currPos; - s3 = peg$parseSentence(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - s2 = peg$currPos; - s3 = peg$parseSentence(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c1(s1); - } - s0 = s1; - return s0; - } - function peg$parseQuotedSentences() { - var s0, s1, s2, s3, s4, s5; - s0 = peg$currPos; - s1 = peg$parseOpenSymbol(); - if (s1 !== peg$FAILED) { - s2 = []; - s3 = peg$currPos; - s4 = peg$parseSentence(); - if (s4 !== peg$FAILED) { - s5 = peg$parseWhitespace(); - if (s5 !== peg$FAILED) { - s4 = [s4, s5]; - s3 = s4; - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - if (s3 !== peg$FAILED) { - while (s3 !== peg$FAILED) { - s2.push(s3); - s3 = peg$currPos; - s4 = peg$parseSentence(); - if (s4 !== peg$FAILED) { - s5 = peg$parseWhitespace(); - if (s5 !== peg$FAILED) { - s4 = [s4, s5]; - s3 = s4; - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - } - } else { - s2 = peg$FAILED; - } - if (s2 !== peg$FAILED) { - s3 = peg$parseCloseSymbol(); - if (s3 === peg$FAILED) { - s3 = null; - } - if (s3 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c2(s1, s2, s3); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseSentence() { - var s0, s1, s2; - s0 = peg$currPos; - s1 = []; - s2 = peg$parseTokenSeq(); - if (s2 === peg$FAILED) { - s2 = peg$parseQuotedTokenSeq(); - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - s2 = peg$parseTokenSeq(); - if (s2 === peg$FAILED) { - s2 = peg$parseQuotedTokenSeq(); - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - s2 = peg$parseEndOfSentence(); - if (s2 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c3(s1, s2); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseTokenSeq() { - var s0, s1, s2, s3, s4; - s0 = peg$currPos; - s1 = []; - s2 = peg$currPos; - s3 = peg$parseToken(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - s2 = peg$currPos; - s3 = peg$parseToken(); - if (s3 !== peg$FAILED) { - s4 = peg$parseWhitespace(); - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c4(s1); - } - s0 = s1; - return s0; - } - function peg$parseQuotedTokenSeq() { - var s0, s1, s2, s3, s4, s5; - s0 = peg$currPos; - s1 = peg$parseOpenSymbol(); - if (s1 !== peg$FAILED) { - s2 = []; - s3 = peg$currPos; - s4 = peg$parseToken(); - if (s4 !== peg$FAILED) { - s5 = peg$parseWhitespace(); - if (s5 !== peg$FAILED) { - s4 = [s4, s5]; - s3 = s4; - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - if (s3 !== peg$FAILED) { - while (s3 !== peg$FAILED) { - s2.push(s3); - s3 = peg$currPos; - s4 = peg$parseToken(); - if (s4 !== peg$FAILED) { - s5 = peg$parseWhitespace(); - if (s5 !== peg$FAILED) { - s4 = [s4, s5]; - s3 = s4; - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - } else { - peg$currPos = s3; - s3 = peg$FAILED; - } - } - } else { - s2 = peg$FAILED; - } - if (s2 !== peg$FAILED) { - s3 = peg$parseEndOfSentence(); - if (s3 !== peg$FAILED) { - s4 = peg$parseCloseSymbol(); - if (s4 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c5(s1, s2, s3, s4); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseEndOfSentence() { - var s0, s1, s2; - s0 = peg$currPos; - s1 = []; - if (peg$c6.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c7); - } - } - while (s2 !== peg$FAILED) { - s1.push(s2); - if (peg$c6.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c7); - } - } - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c8(); - } - s0 = s1; - return s0; - } - function peg$parseWhitespace() { - var s0, s1, s2; - s0 = peg$currPos; - s1 = []; - if (peg$c9.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c10); - } - } - while (s2 !== peg$FAILED) { - s1.push(s2); - if (peg$c9.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c10); - } - } - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c8(); - } - s0 = s1; - return s0; - } - function peg$parseToken() { - var s0, s1; - s0 = peg$currPos; - s1 = peg$parseURI(); - if (s1 === peg$FAILED) { - s1 = peg$parseEmail(); - if (s1 === peg$FAILED) { - s1 = peg$parseNumber(); - if (s1 === peg$FAILED) { - s1 = peg$parseAbbreviation(); - if (s1 === peg$FAILED) { - s1 = peg$parseWord(); - } - } - } - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c11(s1); - } - s0 = s1; - return s0; - } - function peg$parseAbbreviation() { - var s0, s1, s2; - s0 = peg$currPos; - s1 = []; - if (peg$c12.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c13); - } - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - if (peg$c12.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c13); - } - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - peg$savedPos = peg$currPos; - s2 = peg$c14(s1); - if (s2) { - s2 = void 0; - } else { - s2 = peg$FAILED; - } - if (s2 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c15(s1); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseWord() { - var s0, s1, s2; - s0 = peg$currPos; - s1 = []; - if (peg$c16.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c17); - } - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - if (peg$c16.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c17); - } - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c18(); - } - s0 = s1; - return s0; - } - function peg$parseNumber() { - var s0, s1, s2, s3, s4, s5; - s0 = peg$currPos; - s1 = []; - if (peg$c19.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c20); - } - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - if (peg$c19.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c20); - } - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - s2 = peg$currPos; - if (input.length > peg$currPos) { - s3 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s3 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c21); - } - } - if (s3 !== peg$FAILED) { - s4 = []; - if (peg$c19.test(input.charAt(peg$currPos))) { - s5 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s5 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c20); - } - } - if (s5 !== peg$FAILED) { - while (s5 !== peg$FAILED) { - s4.push(s5); - if (peg$c19.test(input.charAt(peg$currPos))) { - s5 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s5 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c20); - } - } - } - } else { - s4 = peg$FAILED; - } - if (s4 !== peg$FAILED) { - s3 = [s3, s4]; - s2 = s3; - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - } else { - peg$currPos = s2; - s2 = peg$FAILED; - } - if (s2 === peg$FAILED) { - s2 = null; - } - if (s2 !== peg$FAILED) { - s3 = peg$parseCloseSymbol(); - if (s3 === peg$FAILED) { - s3 = null; - } - if (s3 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c8(); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseEmail() { - var s0, s1, s2, s3, s4, s5, s6; - s0 = peg$currPos; - s1 = []; - if (peg$c22.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c23); - } - } - if (s2 !== peg$FAILED) { - while (s2 !== peg$FAILED) { - s1.push(s2); - if (peg$c22.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c23); - } - } - } - } else { - s1 = peg$FAILED; - } - if (s1 !== peg$FAILED) { - if (peg$c24.test(input.charAt(peg$currPos))) { - s2 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s2 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c25); - } - } - if (s2 !== peg$FAILED) { - s3 = []; - if (peg$c22.test(input.charAt(peg$currPos))) { - s4 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s4 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c23); - } - } - if (s4 !== peg$FAILED) { - while (s4 !== peg$FAILED) { - s3.push(s4); - if (peg$c22.test(input.charAt(peg$currPos))) { - s4 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s4 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c23); - } - } - } - } else { - s3 = peg$FAILED; - } - if (s3 !== peg$FAILED) { - if (peg$c26.test(input.charAt(peg$currPos))) { - s4 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s4 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c27); - } - } - if (s4 !== peg$FAILED) { - s5 = []; - if (peg$c22.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c23); - } - } - if (s6 !== peg$FAILED) { - while (s6 !== peg$FAILED) { - s5.push(s6); - if (peg$c22.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c23); - } - } - } - } else { - s5 = peg$FAILED; - } - if (s5 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c8(); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseURI() { - var s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; - s0 = peg$currPos; - if (input.substr(peg$currPos, 7) === peg$c28) { - s1 = peg$c28; - peg$currPos += 7; - } else { - s1 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c29); - } - } - if (s1 === peg$FAILED) { - if (input.substr(peg$currPos, 8) === peg$c30) { - s1 = peg$c30; - peg$currPos += 8; - } else { - s1 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c31); - } - } - } - if (s1 === peg$FAILED) { - s1 = null; - } - if (s1 !== peg$FAILED) { - s2 = []; - if (peg$c32.test(input.charAt(peg$currPos))) { - s3 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s3 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - if (s3 !== peg$FAILED) { - while (s3 !== peg$FAILED) { - s2.push(s3); - if (peg$c32.test(input.charAt(peg$currPos))) { - s3 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s3 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - } - } else { - s2 = peg$FAILED; - } - if (s2 !== peg$FAILED) { - if (peg$c26.test(input.charAt(peg$currPos))) { - s3 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s3 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c27); - } - } - if (s3 !== peg$FAILED) { - s4 = peg$currPos; - s5 = []; - if (peg$c32.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - if (s6 !== peg$FAILED) { - while (s6 !== peg$FAILED) { - s5.push(s6); - if (peg$c32.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - } - } else { - s5 = peg$FAILED; - } - if (s5 !== peg$FAILED) { - if (peg$c26.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c27); - } - } - if (s6 !== peg$FAILED) { - s5 = [s5, s6]; - s4 = s5; - } else { - peg$currPos = s4; - s4 = peg$FAILED; - } - } else { - peg$currPos = s4; - s4 = peg$FAILED; - } - if (s4 === peg$FAILED) { - s4 = null; - } - if (s4 !== peg$FAILED) { - s5 = []; - if (peg$c32.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - if (s6 !== peg$FAILED) { - while (s6 !== peg$FAILED) { - s5.push(s6); - if (peg$c32.test(input.charAt(peg$currPos))) { - s6 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s6 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - } - } else { - s5 = peg$FAILED; - } - if (s5 !== peg$FAILED) { - s6 = []; - s7 = peg$currPos; - s8 = []; - if (peg$c32.test(input.charAt(peg$currPos))) { - s9 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s9 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - if (s9 !== peg$FAILED) { - while (s9 !== peg$FAILED) { - s8.push(s9); - if (peg$c32.test(input.charAt(peg$currPos))) { - s9 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s9 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - } - } else { - s8 = peg$FAILED; - } - if (s8 !== peg$FAILED) { - if (peg$c34.test(input.charAt(peg$currPos))) { - s9 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s9 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c35); - } - } - if (s9 !== peg$FAILED) { - s8 = [s8, s9]; - s7 = s8; - } else { - peg$currPos = s7; - s7 = peg$FAILED; - } - } else { - peg$currPos = s7; - s7 = peg$FAILED; - } - while (s7 !== peg$FAILED) { - s6.push(s7); - s7 = peg$currPos; - s8 = []; - if (peg$c32.test(input.charAt(peg$currPos))) { - s9 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s9 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - if (s9 !== peg$FAILED) { - while (s9 !== peg$FAILED) { - s8.push(s9); - if (peg$c32.test(input.charAt(peg$currPos))) { - s9 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s9 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c33); - } - } - } - } else { - s8 = peg$FAILED; - } - if (s8 !== peg$FAILED) { - if (peg$c34.test(input.charAt(peg$currPos))) { - s9 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s9 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c35); - } - } - if (s9 !== peg$FAILED) { - s8 = [s8, s9]; - s7 = s8; - } else { - peg$currPos = s7; - s7 = peg$FAILED; - } - } else { - peg$currPos = s7; - s7 = peg$FAILED; - } - } - if (s6 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c36(); - s0 = s1; - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - } else { - peg$currPos = s0; - s0 = peg$FAILED; - } - return s0; - } - function peg$parseOpenSymbol() { - var s0, s1; - s0 = peg$currPos; - if (peg$c37.test(input.charAt(peg$currPos))) { - s1 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s1 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c38); - } - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c8(); - } - s0 = s1; - return s0; - } - function peg$parseCloseSymbol() { - var s0, s1; - s0 = peg$currPos; - if (peg$c39.test(input.charAt(peg$currPos))) { - s1 = input.charAt(peg$currPos); - peg$currPos++; - } else { - s1 = peg$FAILED; - if (peg$silentFails === 0) { - peg$fail(peg$c40); - } - } - if (s1 !== peg$FAILED) { - peg$savedPos = s0; - s1 = peg$c8(); - } - s0 = s1; - return s0; - } - const knownAbbreviations = require_abbreviations_en().knownAbbreviations; - peg$result = peg$startRuleFunction(); - if (peg$result !== peg$FAILED && peg$currPos === input.length) { - return peg$result; - } else { - if (peg$result !== peg$FAILED && peg$currPos < input.length) { - peg$fail(peg$endExpectation()); - } - throw peg$buildStructuredError( - peg$maxFailExpected, - peg$maxFailPos < input.length ? input.charAt(peg$maxFailPos) : null, - peg$maxFailPos < input.length - ? peg$computeLocation(peg$maxFailPos, peg$maxFailPos + 1) - : peg$computeLocation(peg$maxFailPos, peg$maxFailPos), - ); - } - } - module.exports = { - SyntaxError: peg$SyntaxError, - parse: peg$parse, - }; - }, -}); - -// lib/natural/tokenizers/tokenizer.js -var require_tokenizer = cjs({ - "lib/natural/tokenizers/tokenizer.js"(exports, module) { - "use strict"; - var Tokenizer = class { - trim(array) { - while (array[array.length - 1] === "") { - array.pop(); - } - while (array[0] === "") { - array.shift(); - } - return array; - } - }; - module.exports = Tokenizer; - }, -}); - -// lib/natural/tokenizers/sentence_tokenizer_parser.js -var require_sentence_tokenizer_parser = cjs({ - "lib/natural/tokenizers/sentence_tokenizer_parser.js"(exports, module) { - var parser = require_parser_sentence_tokenizer(); - var Tokenizer = require_tokenizer(); - var SentenceTokenizer = class extends Tokenizer { - tokenize(text) { - return parser.parse(text); - } - }; - module.exports = SentenceTokenizer; - }, -}); -export default require_sentence_tokenizer_parser(); diff --git a/packages/core/src/node-parser/sentence-tokenizer-parser.d.ts b/packages/core/src/node-parser/sentence_tokenizer.d.ts similarity index 73% rename from packages/core/src/node-parser/sentence-tokenizer-parser.d.ts rename to packages/core/src/node-parser/sentence_tokenizer.d.ts index 87074151446a7a938065e97b3eb2c0320f363ac9..c0c2d416fde9b5b26a3d409d6c8bcafa200eedc2 100644 --- a/packages/core/src/node-parser/sentence-tokenizer-parser.d.ts +++ b/packages/core/src/node-parser/sentence_tokenizer.d.ts @@ -1,4 +1,5 @@ declare class SentenceTokenizer { + constructor(abbreviations?: string[]); tokenize(text: string): string[]; } diff --git a/packages/core/src/node-parser/sentence_tokenizer.js b/packages/core/src/node-parser/sentence_tokenizer.js new file mode 100644 index 0000000000000000000000000000000000000000..08cabd31bc2420bf180d2c5117d2a51df2360f07 --- /dev/null +++ b/packages/core/src/node-parser/sentence_tokenizer.js @@ -0,0 +1,222 @@ +var __getOwnPropNames = Object.getOwnPropertyNames; +var __commonJS = (cb, mod) => + function __require() { + return ( + mod || + (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), + mod.exports + ); + }; + +// lib/natural/tokenizers/tokenizer.js +var require_tokenizer = __commonJS({ + "lib/natural/tokenizers/tokenizer.js"(exports, module) { + "use strict"; + var Tokenizer = class { + trim(array) { + while (array[array.length - 1] === "") { + array.pop(); + } + while (array[0] === "") { + array.shift(); + } + return array; + } + }; + module.exports = Tokenizer; + }, +}); + +// lib/natural/tokenizers/sentence_tokenizer.js +var require_sentence_tokenizer = __commonJS({ + "lib/natural/tokenizers/sentence_tokenizer.js"(exports, module) { + var Tokenizer = require_tokenizer(); + var NUM = "NUMBER"; + var DELIM = "DELIM"; + var URI = "URI"; + var ABBREV = "ABBREV"; + var DEBUG = false; + function generateUniqueCode(base, index) { + return `{{${base}_${index}}}`; + } + function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + } + var SentenceTokenizer = class extends Tokenizer { + constructor(abbreviations) { + super(); + if (abbreviations) { + this.abbreviations = abbreviations; + } else { + this.abbreviations = []; + } + this.replacementMap = null; + this.replacementCounter = 0; + } + replaceUrisWithPlaceholders(text) { + const urlPattern = + /(https?:\/\/\S+|www\.\S+|ftp:\/\/\S+|(mailto:)?[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|file:\/\/\S+)/gi; + const modifiedText = text.replace(urlPattern, (match) => { + const placeholder = generateUniqueCode( + URI, + this.replacementCounter++, + ); + this.replacementMap.set(placeholder, match); + return placeholder; + }); + return modifiedText; + } + replaceAbbreviations(text) { + if (this.abbreviations.length === 0) { + return text; + } + const pattern = new RegExp( + `(${this.abbreviations.map((abbrev) => escapeRegExp(abbrev)).join("|")})`, + "gi", + ); + const replacedText = text.replace(pattern, (match) => { + const code = generateUniqueCode(ABBREV, this.replacementCounter++); + this.replacementMap.set(code, match); + return code; + }); + return replacedText; + } + replaceDelimitersWithPlaceholders(text) { + const delimiterPattern = /([.?!… ]*)([.?!…])(["'â€â€™)}\]]?)/g; + const modifiedText = text.replace( + delimiterPattern, + (match, p1, p2, p3) => { + const placeholder = generateUniqueCode( + DELIM, + this.replacementCounter++, + ); + this.delimiterMap.set(placeholder, p1 + p2 + p3); + return placeholder; + }, + ); + return modifiedText; + } + splitOnPlaceholders(text, placeholders) { + if (this.delimiterMap.size === 0) { + return [text]; + } + const keys = Array.from(this.delimiterMap.keys()); + const pattern = new RegExp(`(${keys.map(escapeRegExp).join("|")})`); + const parts = text.split(pattern); + const sentences = []; + for (let i = 0; i < parts.length; i += 2) { + const sentence = parts[i]; + const placeholder = parts[i + 1] || ""; + sentences.push(sentence + placeholder); + } + return sentences; + } + replaceNumbersWithCode(text) { + const numberPattern = /\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b/g; + const replacedText = text.replace(numberPattern, (match) => { + const code = generateUniqueCode(NUM, this.replacementCounter++); + this.replacementMap.set(code, match); + return code; + }); + return replacedText; + } + revertReplacements(text) { + let originalText = text; + for (const [ + placeholder, + replacement, + ] of this.replacementMap.entries()) { + const pattern = new RegExp(escapeRegExp(placeholder), "g"); + originalText = originalText.replace(pattern, replacement); + } + return originalText; + } + revertDelimiters(text) { + let originalText = text; + for (const [placeholder, replacement] of this.delimiterMap.entries()) { + const pattern = new RegExp(escapeRegExp(placeholder), "g"); + originalText = originalText.replace(pattern, replacement); + } + return originalText; + } + tokenize(text) { + this.replacementCounter = 0; + this.replacementMap = /* @__PURE__ */ new Map(); + this.delimiterMap = /* @__PURE__ */ new Map(); + DEBUG && + console.log( + "---Start of sentence tokenization-----------------------", + ); + DEBUG && console.log("Original input: >>>" + text + "<<<"); + const result1 = this.replaceAbbreviations(text); + DEBUG && + console.log( + "Phase 1: replacing abbreviations: " + JSON.stringify(result1), + ); + const result2 = this.replaceUrisWithPlaceholders(result1); + DEBUG && + console.log("Phase 2: replacing URIs: " + JSON.stringify(result2)); + const result3 = this.replaceNumbersWithCode(result2); + DEBUG && + console.log( + "Phase 3: replacing numbers with placeholders: " + + JSON.stringify(result3), + ); + const result4 = this.replaceDelimitersWithPlaceholders(result3); + DEBUG && + console.log( + "Phase 4: replacing delimiters with placeholders: " + + JSON.stringify(result4), + ); + const sentences = this.splitOnPlaceholders(result4); + DEBUG && + console.log( + "Phase 5: splitting into sentences on placeholders: " + + JSON.stringify(sentences), + ); + const newSentences = sentences.map((s) => { + const s1 = this.revertReplacements(s); + return this.revertDelimiters(s1); + }); + DEBUG && + console.log( + "Phase 6: replacing back abbreviations, URIs, numbers and delimiters: " + + JSON.stringify(newSentences), + ); + const trimmedSentences = this.trim(newSentences); + DEBUG && + console.log( + "Phase 7: trimming array of empty sentences: " + + JSON.stringify(trimmedSentences), + ); + const trimmedSentences2 = trimmedSentences.map((sent) => sent.trim()); + DEBUG && + console.log( + "Phase 8: trimming sentences from surrounding whitespace: " + + JSON.stringify(trimmedSentences2), + ); + DEBUG && + console.log( + "---End of sentence tokenization--------------------------", + ); + DEBUG && + console.log( + "---Replacement map---------------------------------------", + ); + DEBUG && console.log([...this.replacementMap.entries()]); + DEBUG && + console.log( + "---Delimiter map-----------------------------------------", + ); + DEBUG && console.log([...this.delimiterMap.entries()]); + DEBUG && + console.log( + "---------------------------------------------------------", + ); + return trimmedSentences2; + } + }; + module.exports = SentenceTokenizer; + }, +}); +export default require_sentence_tokenizer(); diff --git a/packages/core/src/node-parser/utils.ts b/packages/core/src/node-parser/utils.ts index 74351b6e7c1ed095a5d46dce74a8bd2b1842f40a..5f31b23162c04a22ddf0b7ba1fe304081103f76b 100644 --- a/packages/core/src/node-parser/utils.ts +++ b/packages/core/src/node-parser/utils.ts @@ -1,5 +1,5 @@ import type { TextSplitter } from "./base"; -import SentenceTokenizerNew from "./sentence-tokenizer-parser.js"; +import SentenceTokenizer from "./sentence_tokenizer"; export type TextSplitterFn = (text: string) => string[]; @@ -31,11 +31,17 @@ export const splitByChar = (): TextSplitterFn => { return (text: string) => text.split(""); }; -let sentenceTokenizer: SentenceTokenizerNew | null = null; +let sentenceTokenizer: SentenceTokenizer | null = null; export const splitBySentenceTokenizer = (): TextSplitterFn => { if (!sentenceTokenizer) { - sentenceTokenizer = new SentenceTokenizerNew(); + sentenceTokenizer = new SentenceTokenizer([ + "i.e.", + "etc.", + "vs.", + "Inc.", + "A.S.A.P.", + ]); } const tokenizer = sentenceTokenizer; return (text: string) => { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d89192a65b98aace9f332e19d190323fd937f50c..7ae92bdcce925f9596ac5d5761bda583daed719f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -163,7 +163,7 @@ importers: version: link:../packages/llamaindex mongodb: specifier: ^6.7.0 - version: 6.8.0(@aws-sdk/credential-providers@3.613.0) + version: 6.8.0(@aws-sdk/credential-providers@3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0))) pathe: specifier: ^1.1.2 version: 1.1.2 @@ -382,8 +382,8 @@ importers: specifier: 5.3.1 version: 5.3.1(typescript@5.5.3) natural: - specifier: ^7.1.0 - version: 7.1.0(@aws-sdk/credential-providers@3.613.0) + specifier: ^8.0.1 + version: 8.0.1(@aws-sdk/credential-providers@3.613.0) packages/core/tests: devDependencies: @@ -568,7 +568,7 @@ importers: version: 2.0.0 mongodb: specifier: ^6.7.0 - version: 6.8.0(@aws-sdk/credential-providers@3.613.0) + version: 6.8.0(@aws-sdk/credential-providers@3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0))) notion-md-crawler: specifier: ^1.0.0 version: 1.0.0(encoding@0.1.13) @@ -8099,8 +8099,8 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} - natural@7.1.0: - resolution: {integrity: sha512-GBhiRgF0VUX+zPWahBVir1ajARQDZF1Fe6UpQORNzyQT57JQ2KLKYvubecvjIYh/uDaociusmySeRh+WL5OdxQ==} + natural@8.0.1: + resolution: {integrity: sha512-VVw8O5KrfvwqAFeNZEgBbdgA+AQaBlHcXEootWU7TWDaFWFI0VLfzyKMsRjnfdS3cVCpWmI04xLJonCvEv11VQ==} engines: {node: '>=0.4.10'} negotiator@0.6.3: @@ -12003,7 +12003,7 @@ snapshots: '@babel/core': 7.24.7 '@babel/helper-compilation-targets': 7.24.7 '@babel/helper-plugin-utils': 7.24.7 - debug: 4.3.5 + debug: 4.3.6 lodash.debounce: 4.0.8 resolve: 1.22.8 transitivePeerDependencies: @@ -15604,7 +15604,7 @@ snapshots: dependencies: '@typescript-eslint/types': 5.62.0 '@typescript-eslint/visitor-keys': 5.62.0 - debug: 4.3.5 + debug: 4.3.6 globby: 11.1.0 is-glob: 4.0.3 semver: 7.6.2 @@ -15942,7 +15942,7 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.3.5 + debug: 4.3.6 transitivePeerDependencies: - supports-color optional: true @@ -17742,6 +17742,16 @@ snapshots: - eslint-import-resolver-webpack - supports-color + eslint-module-utils@2.8.1(@typescript-eslint/parser@7.16.0(eslint@8.57.0)(typescript@5.5.3))(eslint-import-resolver-node@0.3.9)(eslint@8.57.0): + dependencies: + debug: 3.2.7 + optionalDependencies: + '@typescript-eslint/parser': 7.16.0(eslint@8.57.0)(typescript@5.5.3) + eslint: 8.57.0 + eslint-import-resolver-node: 0.3.9 + transitivePeerDependencies: + - supports-color + eslint-module-utils@2.8.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.5.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.5.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0): dependencies: debug: 3.2.7 @@ -17763,7 +17773,7 @@ snapshots: doctrine: 2.1.0 eslint: 8.57.0 eslint-import-resolver-node: 0.3.9 - eslint-module-utils: 2.8.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.5.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.5.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0) + eslint-module-utils: 2.8.1(@typescript-eslint/parser@7.16.0(eslint@8.57.0)(typescript@5.5.3))(eslint-import-resolver-node@0.3.9)(eslint@8.57.0) hasown: 2.0.2 is-core-module: 2.14.0 is-glob: 4.0.3 @@ -20463,7 +20473,7 @@ snapshots: optionalDependencies: '@aws-sdk/credential-providers': 3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0)) - mongodb@6.8.0(@aws-sdk/credential-providers@3.613.0): + mongodb@6.8.0(@aws-sdk/credential-providers@3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0))): dependencies: '@mongodb-js/saslprep': 1.1.7 bson: 6.8.0 @@ -20494,7 +20504,7 @@ snapshots: mquery@5.0.0: dependencies: - debug: 4.3.5 + debug: 4.3.6 transitivePeerDependencies: - supports-color @@ -20532,7 +20542,7 @@ snapshots: natural-compare@1.4.0: {} - natural@7.1.0(@aws-sdk/credential-providers@3.613.0): + natural@8.0.1(@aws-sdk/credential-providers@3.613.0): dependencies: afinn-165: 1.0.4 afinn-165-financialmarketnews: 3.0.0 @@ -22645,7 +22655,7 @@ snapshots: spdy-transport@3.0.0: dependencies: - debug: 4.3.5 + debug: 4.3.6 detect-node: 2.1.0 hpack.js: 2.1.6 obuf: 1.1.2