stop splitting sentences by default

2d558c39 · Yi Ding · 055b4993 · 2d558c39
Commit 2d558c39 authored 1 year ago by Yi Ding
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -62,7 +62,7 @@ export class SentenceSplitter {
  private tokenizerDecoder: any;
  private paragraphSeparator: string;
  private chunkingTokenizerFn: (text: string) => RegExpMatchArray | null;
-  // private _callback_manager: any;
+  private splitLongSentences: boolean;
  constructor(options?: {
    chunkSize?: number;
@@ -71,6 +71,7 @@ export class SentenceSplitter {
    tokenizerDecoder?: any;
    paragraphSeparator?: string;
    chunkingTokenizerFn?: (text: string) => RegExpMatchArray | null;
+    splitLongSentences?: boolean;
  }) {
    const {
      chunkSize = DEFAULT_CHUNK_SIZE,
@@ -79,6 +80,7 @@ export class SentenceSplitter {
      tokenizerDecoder = null,
      paragraphSeparator = unixParagraphSeparator,
      chunkingTokenizerFn = undefined,
+      splitLongSentences = false,
    } = options ?? {};
    if (chunkOverlap > chunkSize) {
@@ -96,6 +98,7 @@ export class SentenceSplitter {
    this.paragraphSeparator = paragraphSeparator;
    this.chunkingTokenizerFn = chunkingTokenizerFn ?? englishSentenceTokenizer;
+    this.splitLongSentences = splitLongSentences;
  }
  private getEffectiveChunkSize(extraInfoStr?: string): number {
@@ -159,13 +162,28 @@ export class SentenceSplitter {
    return splits;
  }
+  /**
+   * Splits sentences that exceed the chunk size into smaller chunks.
+   *
+   * Forced splitting can cut through the middle of a word or, in
+   * non-English text, through the middle of a Unicode code point, so it
+   * is turned off by default. Set the splitLongSentences option to true
+   * to enable it; when it is off, every sentence is returned whole.
+   * @param sentenceSplits The sentences to process.
+   * @param effectiveChunkSize The chunk size limit; unused when splitting is off.
+   * @returns SplitRep entries carrying each piece's text and token count.
+   */
  private processSentenceSplits(
    sentenceSplits: string[],
    effectiveChunkSize: number,
  ): SplitRep[] {
-    // Process sentence splits
+    if (!this.splitLongSentences) {
-    // Primarily check if any sentences exceed the chunk size. If they do,
+      return sentenceSplits.map((split) => ({
-    // force split by tokenizer
+        text: split,
+        numTokens: this.tokenizer(split).length,
+      }));
+    }
    let newSplits: SplitRep[] = [];
    for (const split of sentenceSplits) {
      let splitTokens = this.tokenizer(split);