Skip to content
Snippets Groups Projects
Commit 2d558c39 authored by Yi Ding
Browse files

stop splitting sentences by default

parent 055b4993
No related branches found
No related tags found
No related merge requests found
...@@ -62,7 +62,7 @@ export class SentenceSplitter { ...@@ -62,7 +62,7 @@ export class SentenceSplitter {
private tokenizerDecoder: any; private tokenizerDecoder: any;
private paragraphSeparator: string; private paragraphSeparator: string;
private chunkingTokenizerFn: (text: string) => RegExpMatchArray | null; private chunkingTokenizerFn: (text: string) => RegExpMatchArray | null;
// private _callback_manager: any; private splitLongSentences: boolean;
constructor(options?: { constructor(options?: {
chunkSize?: number; chunkSize?: number;
...@@ -71,6 +71,7 @@ export class SentenceSplitter { ...@@ -71,6 +71,7 @@ export class SentenceSplitter {
tokenizerDecoder?: any; tokenizerDecoder?: any;
paragraphSeparator?: string; paragraphSeparator?: string;
chunkingTokenizerFn?: (text: string) => RegExpMatchArray | null; chunkingTokenizerFn?: (text: string) => RegExpMatchArray | null;
splitLongSentences?: boolean;
}) { }) {
const { const {
chunkSize = DEFAULT_CHUNK_SIZE, chunkSize = DEFAULT_CHUNK_SIZE,
...@@ -79,6 +80,7 @@ export class SentenceSplitter { ...@@ -79,6 +80,7 @@ export class SentenceSplitter {
tokenizerDecoder = null, tokenizerDecoder = null,
paragraphSeparator = unixParagraphSeparator, paragraphSeparator = unixParagraphSeparator,
chunkingTokenizerFn = undefined, chunkingTokenizerFn = undefined,
splitLongSentences = false,
} = options ?? {}; } = options ?? {};
if (chunkOverlap > chunkSize) { if (chunkOverlap > chunkSize) {
...@@ -96,6 +98,7 @@ export class SentenceSplitter { ...@@ -96,6 +98,7 @@ export class SentenceSplitter {
this.paragraphSeparator = paragraphSeparator; this.paragraphSeparator = paragraphSeparator;
this.chunkingTokenizerFn = chunkingTokenizerFn ?? englishSentenceTokenizer; this.chunkingTokenizerFn = chunkingTokenizerFn ?? englishSentenceTokenizer;
this.splitLongSentences = splitLongSentences;
} }
private getEffectiveChunkSize(extraInfoStr?: string): number { private getEffectiveChunkSize(extraInfoStr?: string): number {
...@@ -159,13 +162,28 @@ export class SentenceSplitter { ...@@ -159,13 +162,28 @@ export class SentenceSplitter {
return splits; return splits;
} }
/**
* Splits sentences into chunks if necessary.
*
* This isn't great behavior: it can split down the middle of a word or,
* in non-English text, down the middle of a Unicode code point, so the
* splitting is turned off by default. If you need it, set the
* splitLongSentences option to true.
* @param sentenceSplits
* @param effectiveChunkSize
* @returns
*/
private processSentenceSplits( private processSentenceSplits(
sentenceSplits: string[], sentenceSplits: string[],
effectiveChunkSize: number, effectiveChunkSize: number,
): SplitRep[] { ): SplitRep[] {
// Process sentence splits if (!this.splitLongSentences) {
// Primarily check if any sentences exceed the chunk size. If they do, return sentenceSplits.map((split) => ({
// force split by tokenizer text: split,
numTokens: this.tokenizer(split).length,
}));
}
let newSplits: SplitRep[] = []; let newSplits: SplitRep[] = [];
for (const split of sentenceSplits) { for (const split of sentenceSplits) {
let splitTokens = this.tokenizer(split); let splitTokens = this.tokenizer(split);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment