From 15962b36f0058f49b09c1f4c84864d7ebd93a7bc Mon Sep 17 00:00:00 2001 From: Alex Yang <himself65@outlook.com> Date: Wed, 24 Jul 2024 10:46:00 -0700 Subject: [PATCH] feat: node parser refactor (#1065) --- .changeset/young-schools-kneel.md | 11 + examples/longText.ts | 5 +- examples/split.ts | 2 +- packages/core/package.json | 17 +- packages/core/src/embeddings/base.ts | 4 +- packages/core/src/global/settings.ts | 15 + .../src/global/settings/callback-manager.ts | 29 +- .../core/src/global/settings/chunk-size.ts | 4 +- .../core/src/global/settings/tokenizer.ts | 21 + packages/core/src/node-parser/base.ts | 147 ++ packages/core/src/node-parser/index.ts | 24 + packages/core/src/node-parser/markdown.ts | 87 + .../core/src/node-parser/sentence-splitter.ts | 226 +++ .../sentence-tokenizer-parser.d.ts | 5 + .../node-parser/sentence-tokenizer-parser.js | 1571 +++++++++++++++++ .../core/src/node-parser/sentence-window.ts | 85 + packages/core/src/node-parser/type.ts | 5 + packages/core/src/node-parser/utils.ts | 53 + packages/core/src/schema/node.ts | 45 + packages/core/src/schema/type.ts | 7 +- packages/core/src/schema/zod.ts | 60 + packages/core/src/utils/wrap-llm-event.ts | 10 +- .../tests/node-parser/markdown.test.ts} | 20 +- .../node-parser/sentence-spiller.test.ts} | 22 +- .../node-parser/sentence-window.test.ts} | 11 +- .../tests/node-parser/text-splitter.test.ts} | 29 +- packages/llamaindex/e2e/node/openai.e2e.ts | 4 +- .../llamaindex/e2e/node/snapshot/agent.snap | 24 +- .../e2e/node/snapshot/agent_stream.snap | 84 +- .../agent_with_object_function_call.snap | 8 +- .../snapshot/agent_with_object_retriever.snap | 20 +- .../anthropic-agent-multiple-chat.snap | 76 +- .../e2e/node/snapshot/anthropic-agent.snap | 32 +- .../e2e/node/snapshot/gpt-4-turbo.snap | 8 +- .../e2e/node/snapshot/llm-anthropic.snap | 52 +- .../snapshot/openai_agent_system_prompt.snap | 8 +- .../snapshot/queryEngine_subquestion.snap | 10 +- .../e2e/node/snapshot/react-agent-stream.snap | 12 +- packages/llamaindex/e2e/node/utils.ts | 6 +- packages/llamaindex/src/PromptHelper.ts | 7 +- packages/llamaindex/src/ServiceContext.ts | 8 +- packages/llamaindex/src/Settings.ts | 8 +- packages/llamaindex/src/TextSplitter.ts | 309 ---- .../llamaindex/src/cloud/LlamaCloudIndex.ts | 8 +- packages/llamaindex/src/cloud/config.ts | 12 +- packages/llamaindex/src/extractors/types.ts | 2 +- packages/llamaindex/src/index.edge.ts | 1 - .../src/ingestion/IngestionCache.ts | 4 +- .../src/ingestion/IngestionPipeline.ts | 8 +- .../strategies/DuplicatesStrategy.ts | 2 +- .../strategies/UpsertsAndDeleteStrategy.ts | 2 +- .../ingestion/strategies/UpsertsStrategy.ts | 2 +- .../src/ingestion/strategies/index.ts | 4 +- .../src/nodeParsers/MarkdownNodeParser.ts | 109 -- .../nodeParsers/SentenceWindowNodeParser.ts | 89 - .../src/nodeParsers/SimpleNodeParser.ts | 72 - packages/llamaindex/src/nodeParsers/index.ts | 5 +- packages/llamaindex/src/nodeParsers/types.ts | 13 - packages/llamaindex/src/nodeParsers/utils.ts | 79 - .../tests/ingestion/IngestionCache.test.ts | 2 +- pnpm-lock.yaml | 330 ++++ 61 files changed, 2952 insertions(+), 983 deletions(-) create mode 100644 .changeset/young-schools-kneel.md create mode 100644 packages/core/src/global/settings/tokenizer.ts create mode 100644 packages/core/src/node-parser/base.ts create mode 100644 packages/core/src/node-parser/index.ts create mode 100644 packages/core/src/node-parser/markdown.ts create mode 100644 packages/core/src/node-parser/sentence-splitter.ts create mode 100644 
packages/core/src/node-parser/sentence-tokenizer-parser.d.ts create mode 100644 packages/core/src/node-parser/sentence-tokenizer-parser.js create mode 100644 packages/core/src/node-parser/sentence-window.ts create mode 100644 packages/core/src/node-parser/type.ts create mode 100644 packages/core/src/node-parser/utils.ts rename packages/{llamaindex/tests/nodeParsers/MarkdownNodeParser.test.ts => core/tests/node-parser/markdown.test.ts} (78%) rename packages/{llamaindex/tests/nodeParsers/SimpleNodeParser.test.ts => core/tests/node-parser/sentence-spiller.test.ts} (61%) rename packages/{llamaindex/tests/nodeParsers/SentenceWindowNodeParser.test.ts => core/tests/node-parser/sentence-window.test.ts} (77%) rename packages/{llamaindex/tests/TextSplitter.test.ts => core/tests/node-parser/text-splitter.test.ts} (83%) delete mode 100644 packages/llamaindex/src/TextSplitter.ts delete mode 100644 packages/llamaindex/src/nodeParsers/MarkdownNodeParser.ts delete mode 100644 packages/llamaindex/src/nodeParsers/SentenceWindowNodeParser.ts delete mode 100644 packages/llamaindex/src/nodeParsers/SimpleNodeParser.ts delete mode 100644 packages/llamaindex/src/nodeParsers/types.ts delete mode 100644 packages/llamaindex/src/nodeParsers/utils.ts diff --git a/.changeset/young-schools-kneel.md b/.changeset/young-schools-kneel.md new file mode 100644 index 000000000..9a291a0d4 --- /dev/null +++ b/.changeset/young-schools-kneel.md @@ -0,0 +1,11 @@ +--- +"@llamaindex/core": patch +"llamaindex": patch +--- + +feat: node parser refactor + +Aligns the text splitter logic with the Python implementation; the two now behave almost identically. Also adds Zod validation of constructor input, better error messages, and an event system for chunking and node parsing. + +This is not considered a breaking change, since the output does not differ significantly from the previous version, +but some edge cases will behave differently, such as the page separator and the constructor parameters.
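For context, here is a minimal usage sketch of the refactored API as re-exported from `llamaindex` (see the `examples/` diffs below). It is illustrative only: the sample text is made up, and it assumes the `CallbackManager#on` subscription API for the new `chunking-*`/`node-parsing-*` events added in this patch.

import { Document, SentenceSplitter, Settings } from "llamaindex";

// Observe the chunking events dispatched from _splitText().
Settings.callbackManager.on("chunking-end", (event) => {
  console.log("chunks produced:", event.detail.chunks.length);
});

// Constructor input is now validated by sentenceSplitterSchema (Zod),
// so invalid parameters fail fast with a descriptive error.
const splitter = new SentenceSplitter({ chunkSize: 512, chunkOverlap: 20 });

// splitText() replaces the removed splitTextWithOverlaps().
const chunks = splitter.splitText("An example paragraph. A second sentence.");

// Documents parse straight into TextNodes; the deprecated SimpleNodeParser
// export remains as an alias of SentenceSplitter for existing imports.
const nodes = splitter.getNodesFromDocuments([
  new Document({ text: "Hello, node parser." }),
]);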
diff --git a/examples/longText.ts b/examples/longText.ts index eb3c3d845..825446e12 100644 --- a/examples/longText.ts +++ b/examples/longText.ts @@ -1,17 +1,16 @@ import { Document, + SentenceSplitter, Settings, - SimpleNodeParser, VectorStoreIndex, } from "llamaindex"; export const STORAGE_DIR = "./data"; // Update node parser -Settings.nodeParser = new SimpleNodeParser({ +Settings.nodeParser = new SentenceSplitter({ chunkSize: 512, chunkOverlap: 20, - splitLongSentences: true, }); (async () => { // generate a document with a very long sentence (9000 words long) diff --git a/examples/split.ts b/examples/split.ts index ce070c5f3..3797f51c4 100644 --- a/examples/split.ts +++ b/examples/split.ts @@ -8,7 +8,7 @@ async function main() { const textSplitter = new SentenceSplitter(); - const chunks = textSplitter.splitTextWithOverlaps(essay); + const chunks = textSplitter.splitText(essay); console.log(chunks); } diff --git a/packages/core/package.json b/packages/core/package.json index 37622a163..7074c0181 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -4,6 +4,20 @@ "version": "0.1.3", "description": "LlamaIndex Core Module", "exports": { + "./node-parser": { + "require": { + "types": "./dist/node-parser/index.d.cts", + "default": "./dist/node-parser/index.cjs" + }, + "import": { + "types": "./dist/node-parser/index.d.ts", + "default": "./dist/node-parser/index.js" + }, + "default": { + "types": "./dist/node-parser/index.d.ts", + "default": "./dist/node-parser/index.js" + } + }, "./query-engine": { "require": { "types": "./dist/query-engine/index.d.cts", @@ -117,7 +131,8 @@ }, "devDependencies": { "ajv": "^8.16.0", - "bunchee": "5.3.0-beta.0" + "bunchee": "5.3.0-beta.0", + "natural": "^7.1.0" }, "dependencies": { "@llamaindex/env": "workspace:*", diff --git a/packages/core/src/embeddings/base.ts b/packages/core/src/embeddings/base.ts index 5dd74ac66..a27c5a04e 100644 --- a/packages/core/src/embeddings/base.ts +++ b/packages/core/src/embeddings/base.ts @@ -20,9 +20,7 @@ export type BaseEmbeddingOptions = { logProgress?: boolean; }; -export abstract class BaseEmbedding - implements TransformComponent<BaseEmbeddingOptions> -{ +export abstract class BaseEmbedding implements TransformComponent { embedBatchSize = DEFAULT_EMBED_BATCH_SIZE; embedInfo?: EmbeddingInfo; diff --git a/packages/core/src/global/settings.ts b/packages/core/src/global/settings.ts index 4d8b18e80..57ea265a0 100644 --- a/packages/core/src/global/settings.ts +++ b/packages/core/src/global/settings.ts @@ -1,3 +1,4 @@ +import type { Tokenizer } from "@llamaindex/env"; import { type CallbackManager, getCallbackManager, @@ -9,8 +10,22 @@ import { setChunkSize, withChunkSize, } from "./settings/chunk-size"; +import { + getTokenizer, + setTokenizer, + withTokenizer, +} from "./settings/tokenizer"; export const Settings = { + get tokenizer() { + return getTokenizer(); + }, + set tokenizer(tokenizer) { + setTokenizer(tokenizer); + }, + withTokenizer<Result>(tokenizer: Tokenizer, fn: () => Result): Result { + return withTokenizer(tokenizer, fn); + }, get chunkSize(): number | undefined { return getChunkSize(); }, diff --git a/packages/core/src/global/settings/callback-manager.ts b/packages/core/src/global/settings/callback-manager.ts index e8f6750f7..edaf93237 100644 --- a/packages/core/src/global/settings/callback-manager.ts +++ b/packages/core/src/global/settings/callback-manager.ts @@ -6,6 +6,7 @@ import type { ToolCall, ToolOutput, } from "../../llms"; +import { TextNode } from "../../schema"; import { 
EventCaller, getEventCaller } from "../../utils/event-caller"; import type { UUID } from "../type"; @@ -33,12 +34,32 @@ export type LLMStreamEvent = { chunk: ChatResponseChunk; }; +export type ChunkingStartEvent = { + text: string[]; +}; + +export type ChunkingEndEvent = { + chunks: string[]; +}; + +export type NodeParsingStartEvent = { + documents: TextNode[]; +}; + +export type NodeParsingEndEvent = { + nodes: TextNode[]; +}; + export interface LlamaIndexEventMaps { "llm-start": LLMStartEvent; "llm-end": LLMEndEvent; "llm-tool-call": LLMToolCallEvent; "llm-tool-result": LLMToolResultEvent; "llm-stream": LLMStreamEvent; + "chunking-start": ChunkingStartEvent; + "chunking-end": ChunkingEndEvent; + "node-parsing-start": NodeParsingStartEvent; + "node-parsing-end": NodeParsingEndEvent; } export class LlamaIndexCustomEvent<T = any> extends CustomEvent<T> { @@ -116,14 +137,10 @@ export const globalCallbackManager = new CallbackManager(); const callbackManagerAsyncLocalStorage = new AsyncLocalStorage<CallbackManager>(); -let currentCallbackManager: CallbackManager | null = null; +let currentCallbackManager: CallbackManager = globalCallbackManager; export function getCallbackManager(): CallbackManager { - return ( - callbackManagerAsyncLocalStorage.getStore() ?? - currentCallbackManager ?? - globalCallbackManager - ); + return callbackManagerAsyncLocalStorage.getStore() ?? currentCallbackManager; } export function setCallbackManager(callbackManager: CallbackManager) { diff --git a/packages/core/src/global/settings/chunk-size.ts b/packages/core/src/global/settings/chunk-size.ts index e8c7810c3..6d24017f4 100644 --- a/packages/core/src/global/settings/chunk-size.ts +++ b/packages/core/src/global/settings/chunk-size.ts @@ -1,9 +1,9 @@ import { AsyncLocalStorage } from "@llamaindex/env"; const chunkSizeAsyncLocalStorage = new AsyncLocalStorage<number | undefined>(); -let globalChunkSize: number | null = null; +let globalChunkSize: number = 1024; -export function getChunkSize(): number | undefined { +export function getChunkSize(): number { return globalChunkSize ?? chunkSizeAsyncLocalStorage.getStore(); } diff --git a/packages/core/src/global/settings/tokenizer.ts b/packages/core/src/global/settings/tokenizer.ts new file mode 100644 index 000000000..dae154c83 --- /dev/null +++ b/packages/core/src/global/settings/tokenizer.ts @@ -0,0 +1,21 @@ +import { AsyncLocalStorage, type Tokenizer, tokenizers } from "@llamaindex/env"; + +const tokenizerAsyncLocalStorage = new AsyncLocalStorage<Tokenizer>(); +let globalTokenizer: Tokenizer = tokenizers.tokenizer(); + +export function getTokenizer(): Tokenizer { + return tokenizerAsyncLocalStorage.getStore() ?? globalTokenizer; +} + +export function setTokenizer(tokenizer: Tokenizer | undefined) { + if (tokenizer !== undefined) { + globalTokenizer = tokenizer; + } +} + +export function withTokenizer<Result>( + tokenizer: Tokenizer, + fn: () => Result, +): Result { + return tokenizerAsyncLocalStorage.run(tokenizer, fn); +} diff --git a/packages/core/src/node-parser/base.ts b/packages/core/src/node-parser/base.ts new file mode 100644 index 000000000..9aeb61f1a --- /dev/null +++ b/packages/core/src/node-parser/base.ts @@ -0,0 +1,147 @@ +import { Settings } from "../global"; +import { + BaseNode, + buildNodeFromSplits, + MetadataMode, + NodeRelationship, + TextNode, + type TransformComponent, +} from "../schema"; + +export abstract class NodeParser implements TransformComponent { + includeMetadata: boolean = true; + includePrevNextRel: boolean = true; + + protected postProcessParsedNodes( + nodes: TextNode[], + parentDocMap: Map<string, TextNode>, + ): TextNode[] { + nodes.forEach((node, i) => { + const parentDoc = parentDocMap.get(node.sourceNode?.nodeId || ""); + + if (parentDoc) { + const startCharIdx = parentDoc.text.indexOf( + node.getContent(MetadataMode.NONE), + ); + if (startCharIdx >= 0) { + node.startCharIdx = startCharIdx; + node.endCharIdx = + startCharIdx + node.getContent(MetadataMode.NONE).length; + } + if (this.includeMetadata && node.metadata && parentDoc.metadata) { + node.metadata = { ...node.metadata, ...parentDoc.metadata }; + } + } + + if (this.includePrevNextRel && node.sourceNode) { + const previousNode = i > 0 ? nodes[i - 1] : null; + const nextNode = i < nodes.length - 1 ? nodes[i + 1] : null; + + if ( + previousNode && + previousNode.sourceNode && + previousNode.sourceNode.nodeId === node.sourceNode.nodeId + ) { + node.relationships = { + ...node.relationships, + [NodeRelationship.PREVIOUS]: previousNode.asRelatedNodeInfo(), + }; + } + + if ( + nextNode && + nextNode.sourceNode && + nextNode.sourceNode.nodeId === node.sourceNode.nodeId + ) { + node.relationships = { + ...node.relationships, + [NodeRelationship.NEXT]: nextNode.asRelatedNodeInfo(), + }; + } + } + }); + + return nodes; + } + + protected abstract parseNodes( + documents: TextNode[], + showProgress?: boolean, + ): TextNode[]; + + public getNodesFromDocuments(documents: TextNode[]): TextNode[] { + const docsId: Map<string, TextNode> = new Map( + documents.map((doc) => [doc.id_, doc]), + ); + const callbackManager = Settings.callbackManager; + + callbackManager.dispatchEvent("node-parsing-start", { + documents, + }); + + const nodes = this.postProcessParsedNodes( + this.parseNodes(documents), + docsId, + ); + + callbackManager.dispatchEvent("node-parsing-end", { + nodes, + }); + + return nodes; + } + + async transform(nodes: BaseNode[], options?: {}): Promise<BaseNode[]> { + return this.getNodesFromDocuments(nodes as TextNode[]); + } +} + +export abstract class TextSplitter extends NodeParser { + abstract splitText(text: string): string[]; + + public splitTexts(texts: string[]): string[] { + return texts.flatMap((text) => this.splitText(text)); + } + + protected parseNodes(nodes: TextNode[]): TextNode[] { + return nodes.reduce<TextNode[]>((allNodes, node) => { + const splits = this.splitText(node.getContent(MetadataMode.ALL)); + const splitNodes = buildNodeFromSplits(splits, node); + return allNodes.concat(splitNodes); + }, []); + } +} + +export abstract class MetadataAwareTextSplitter extends TextSplitter { + abstract splitTextMetadataAware(text: string, metadata: string): string[]; + + 
splitTextsMetadataAware(texts: string[], metadata: string[]): string[] { + if (texts.length !== metadata.length) { + throw new TypeError("`texts` and `metadata` must have the same length"); + } + return texts.flatMap((text, i) => + this.splitTextMetadataAware(text, metadata[i]), + ); + } + + protected getMetadataString(node: TextNode): string { + const embedStr = node.getMetadataStr(MetadataMode.EMBED); + const llmStr = node.getMetadataStr(MetadataMode.LLM); + if (embedStr.length > llmStr.length) { + return embedStr; + } else { + return llmStr; + } + } + + protected parseNodes(nodes: TextNode[]): TextNode[] { + return nodes.reduce<TextNode[]>((allNodes, node) => { + const metadataStr = this.getMetadataString(node); + const splits = this.splitTextMetadataAware( + node.getContent(MetadataMode.ALL), + metadataStr, + ); + return allNodes.concat(buildNodeFromSplits(splits, node)); + }, []); + } +} diff --git a/packages/core/src/node-parser/index.ts b/packages/core/src/node-parser/index.ts new file mode 100644 index 000000000..6ecc98005 --- /dev/null +++ b/packages/core/src/node-parser/index.ts @@ -0,0 +1,24 @@ +/** + * Current logic is based on the following implementation: + * @link https://github.com/run-llama/llama_index/blob/cc0ea90e7e72b8e4f5069aac981d56bb1d568323/llama-index-core/llama_index/core/node_parser + */ +import { SentenceSplitter } from "./sentence-splitter"; + +/** + * @deprecated Use `SentenceSplitter` instead + */ +export const SimpleNodeParser = SentenceSplitter; + +export { MetadataAwareTextSplitter, NodeParser, TextSplitter } from "./base"; +export { MarkdownNodeParser } from "./markdown"; +export { SentenceSplitter } from "./sentence-splitter"; +export { SentenceWindowNodeParser } from "./sentence-window"; +export type { SplitterParams } from "./type"; +export { + splitByChar, + splitByPhraseRegex, + splitByRegex, + splitBySentenceTokenizer, + splitBySep, +} from "./utils"; +export type { TextSplitterFn } from "./utils"; diff --git a/packages/core/src/node-parser/markdown.ts b/packages/core/src/node-parser/markdown.ts new file mode 100644 index 000000000..c082c6e6c --- /dev/null +++ b/packages/core/src/node-parser/markdown.ts @@ -0,0 +1,87 @@ +import { + buildNodeFromSplits, + type Metadata, + MetadataMode, + TextNode, +} from "../schema"; +import { NodeParser } from "./base"; + +export class MarkdownNodeParser extends NodeParser { + override parseNodes(nodes: TextNode[], showProgress?: boolean): TextNode[] { + return nodes.reduce<TextNode[]>((allNodes, node) => { + const markdownNodes = this.getNodesFromNode(node); + return allNodes.concat(markdownNodes); + }, []); + } + + protected getNodesFromNode(node: TextNode): TextNode[] { + const text = node.getContent(MetadataMode.NONE); + const markdownNodes: TextNode[] = []; + const lines = text.split("\n"); + let metadata: { [key: string]: string } = {}; + let codeBlock = false; + let currentSection = ""; + + for (const line of lines) { + if (line.trim().startsWith("```")) { + codeBlock = !codeBlock; + } + const headerMatch = /^(#+)\s(.*)/.exec(line); + if (headerMatch && !codeBlock) { + if (currentSection !== "") { + markdownNodes.push( + this.buildNodeFromSplit(currentSection.trim(), node, metadata), + ); + } + metadata = this.updateMetadata( + metadata, + headerMatch[2], + headerMatch[1].trim().length, + ); + currentSection = `${headerMatch[2]}\n`; + } else { + currentSection += line + "\n"; + } + } + + if (currentSection !== "") { + markdownNodes.push( + this.buildNodeFromSplit(currentSection.trim(), node, metadata), 
+ ); + } + + return markdownNodes; + } + + private updateMetadata( + headersMetadata: { [key: string]: string }, + newHeader: string, + newHeaderLevel: number, + ): { [key: string]: string } { + const updatedHeaders: { [key: string]: string } = {}; + + for (let i = 1; i < newHeaderLevel; i++) { + const key = `Header_${i}`; + if (key in headersMetadata) { + updatedHeaders[key] = headersMetadata[key]; + } + } + + updatedHeaders[`Header_${newHeaderLevel}`] = newHeader; + return updatedHeaders; + } + + private buildNodeFromSplit( + textSplit: string, + node: TextNode, + metadata: Metadata, + ): TextNode { + const newNode = buildNodeFromSplits([textSplit], node, undefined)[0]; + + if (this.includeMetadata) { + newNode.metadata = { ...newNode.metadata, ...metadata }; + } + + return newNode; + } +} diff --git a/packages/core/src/node-parser/sentence-splitter.ts b/packages/core/src/node-parser/sentence-splitter.ts new file mode 100644 index 000000000..7091bcfd7 --- /dev/null +++ b/packages/core/src/node-parser/sentence-splitter.ts @@ -0,0 +1,226 @@ +import type { Tokenizer } from "@llamaindex/env"; +import { z } from "zod"; +import { Settings } from "../global"; +import { sentenceSplitterSchema } from "../schema"; +import { MetadataAwareTextSplitter } from "./base"; +import type { SplitterParams } from "./type"; +import { + splitByChar, + splitByRegex, + splitBySentenceTokenizer, + splitBySep, + type TextSplitterFn, +} from "./utils"; + +type _Split = { + text: string; + isSentence: boolean; + tokenSize: number; +}; + +/** + * Parse text with a preference for complete sentences. + */ +export class SentenceSplitter extends MetadataAwareTextSplitter { + /** + * The token chunk size for each chunk. + */ + chunkSize: number = 1024; + /** + * The token overlap of each chunk when splitting. + */ + chunkOverlap: number = 200; + /** + * Default separator for splitting into words. + */ + separator: string = " "; + /** + * Separator between paragraphs. + */ + paragraphSeparator: string = "\n\n\n"; + /** + * Backup regex for splitting into sentences. + */ + secondaryChunkingRegex: string = "[^,.;。?!]+[,.;。?!]?"; + + #chunkingTokenizerFn = splitBySentenceTokenizer(); + #splitFns: Set<TextSplitterFn> = new Set(); + #subSentenceSplitFns: Set<TextSplitterFn> = new Set(); + #tokenizer: Tokenizer; + + constructor( + params?: z.input<typeof sentenceSplitterSchema> & SplitterParams, + ) { + super(); + if (params) { + const parsedParams = sentenceSplitterSchema.parse(params); + this.chunkSize = parsedParams.chunkSize; + this.chunkOverlap = parsedParams.chunkOverlap; + this.separator = parsedParams.separator; + this.paragraphSeparator = parsedParams.paragraphSeparator; + this.secondaryChunkingRegex = parsedParams.secondaryChunkingRegex; + } + this.#tokenizer = params?.tokenizer ?? Settings.tokenizer; + this.#splitFns.add(splitBySep(this.paragraphSeparator)); + this.#splitFns.add(this.#chunkingTokenizerFn); + + this.#subSentenceSplitFns.add(splitByRegex(this.secondaryChunkingRegex)); + this.#subSentenceSplitFns.add(splitBySep(this.separator)); + this.#subSentenceSplitFns.add(splitByChar()); + } + + splitTextMetadataAware(text: string, metadata: string): string[] { + const metadataLength = this.tokenSize(metadata); + const effectiveChunkSize = this.chunkSize - metadataLength; + if (effectiveChunkSize <= 0) { + throw new Error( + `Metadata length (${metadataLength}) is longer than chunk size (${this.chunkSize}). 
Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`, + ); + } else if (effectiveChunkSize < 50) { + console.log( + `Metadata length (${metadataLength}) is close to chunk size (${this.chunkSize}). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`, + ); + } + return this._splitText(text, effectiveChunkSize); + } + + splitText(text: string): string[] { + return this._splitText(text, this.chunkSize); + } + + _splitText(text: string, chunkSize: number): string[] { + if (text === "") return [text]; + + const callbackManager = Settings.callbackManager; + + callbackManager.dispatchEvent("chunking-start", { + text: [text], + }); + const splits = this.#split(text, chunkSize); + const chunks = this.#merge(splits, chunkSize); + + callbackManager.dispatchEvent("chunking-end", { + chunks, + }); + return chunks; + } + + #split(text: string, chunkSize: number): _Split[] { + const tokenSize = this.tokenSize(text); + if (tokenSize <= chunkSize) { + return [ + { + text, + isSentence: true, + tokenSize, + }, + ]; + } + const [textSplitsByFns, isSentence] = this.#getSplitsByFns(text); + const textSplits: _Split[] = []; + + for (const textSplit of textSplitsByFns) { + const tokenSize = this.tokenSize(textSplit); + if (tokenSize <= chunkSize) { + textSplits.push({ + text: textSplit, + isSentence, + tokenSize, + }); + } else { + const recursiveTextSplits = this.#split(textSplit, chunkSize); + textSplits.push(...recursiveTextSplits); + } + } + return textSplits; + } + + #getSplitsByFns(text: string): [splits: string[], isSentence: boolean] { + for (const splitFn of this.#splitFns) { + const splits = splitFn(text); + if (splits.length > 1) { + return [splits, true]; + } + } + for (const splitFn of this.#subSentenceSplitFns) { + const splits = splitFn(text); + if (splits.length > 1) { + return [splits, false]; + } + } + return [[text], true]; + } + + #merge(splits: _Split[], chunkSize: number): string[] { + const chunks: string[] = []; + let currentChunk: [string, number][] = []; + let lastChunk: [string, number][] = []; + let currentChunkLength = 0; + let newChunk = true; + + const closeChunk = (): void => { + chunks.push(currentChunk.map(([text]) => text).join("")); + lastChunk = currentChunk; + currentChunk = []; + currentChunkLength = 0; + newChunk = true; + + let lastIndex = lastChunk.length - 1; + while ( + lastIndex >= 0 && + currentChunkLength + lastChunk[lastIndex][1] <= this.chunkOverlap + ) { + const [text, length] = lastChunk[lastIndex]; + currentChunkLength += length; + currentChunk.unshift([text, length]); + lastIndex -= 1; + } + }; + + while (splits.length > 0) { + const curSplit = splits[0]; + if (curSplit.tokenSize > chunkSize) { + throw new Error("Single token exceeded chunk size"); + } + if (currentChunkLength + curSplit.tokenSize > chunkSize && !newChunk) { + closeChunk(); + } else { + if ( + curSplit.isSentence || + currentChunkLength + curSplit.tokenSize <= chunkSize || + newChunk + ) { + currentChunkLength += curSplit.tokenSize; + currentChunk.push([curSplit.text, curSplit.tokenSize]); + splits.shift(); + newChunk = false; + } else { + closeChunk(); + } + } + } + + // Handle the last chunk + if (!newChunk) { + chunks.push(currentChunk.map(([text]) => text).join("")); + } + + return this.#postprocessChunks(chunks); + } + + /** + * Remove whitespace only chunks and remove leading and trailing whitespace. 
+ */ + #postprocessChunks(chunks: string[]): string[] { + const newChunks: string[] = []; + for (const chunk of chunks) { + const trimmedChunk = chunk.trim(); + if (trimmedChunk !== "") { + newChunks.push(trimmedChunk); + } + } + return newChunks; + } + + tokenSize = (text: string) => this.#tokenizer.encode(text).length; +} diff --git a/packages/core/src/node-parser/sentence-tokenizer-parser.d.ts b/packages/core/src/node-parser/sentence-tokenizer-parser.d.ts new file mode 100644 index 000000000..870741514 --- /dev/null +++ b/packages/core/src/node-parser/sentence-tokenizer-parser.d.ts @@ -0,0 +1,5 @@ +declare class SentenceTokenizer { + tokenize(text: string): string[]; +} + +export { SentenceTokenizer as default }; diff --git a/packages/core/src/node-parser/sentence-tokenizer-parser.js b/packages/core/src/node-parser/sentence-tokenizer-parser.js new file mode 100644 index 000000000..ea0526343 --- /dev/null +++ b/packages/core/src/node-parser/sentence-tokenizer-parser.js @@ -0,0 +1,1571 @@ +var __getOwnPropNames = Object.getOwnPropertyNames; +var cjs = (cb, mod) => + function _r() { + return ( + mod || + (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), + mod.exports + ); + }; + +// lib/natural/util/abbreviations_en.js +var require_abbreviations_en = cjs({ + "lib/natural/util/abbreviations_en.js"(exports) { + "use strict"; + var knownAbbreviations = [ + "approx.", + "appt.", + "apt.", + "A.S.A.P.", + "B.Y.O.B.", + "c/o", + "dept.", + "D.I.Y.", + "est.", + "E.T.A.", + "Inc.", + "min.", + "misc.", + "Mr.", + "Mrs.", + "no.", + "R.S.V.P.", + "tel.", + "temp.", + "vet.", + "vs.", + ]; + exports.knownAbbreviations = knownAbbreviations; + }, +}); + +// lib/natural/tokenizers/parser_sentence_tokenizer.js +var require_parser_sentence_tokenizer = cjs({ + "lib/natural/tokenizers/parser_sentence_tokenizer.js"(exports, module) { + "use strict"; + function peg$subclass(child, parent) { + function ctor() { + this.constructor = child; + } + ctor.prototype = parent.prototype; + child.prototype = new ctor(); + } + function peg$SyntaxError(message, expected, found, location) { + this.message = message; + this.expected = expected; + this.found = found; + this.location = location; + this.name = "SyntaxError"; + if (typeof Error.captureStackTrace === "function") { + Error.captureStackTrace(this, peg$SyntaxError); + } + } + peg$subclass(peg$SyntaxError, Error); + peg$SyntaxError.buildMessage = function (expected, found) { + var DESCRIBE_EXPECTATION_FNS = { + literal: function (expectation) { + return '"' + literalEscape(expectation.text) + '"'; + }, + class: function (expectation) { + var escapedParts = "", + i; + for (i = 0; i < expectation.parts.length; i++) { + escapedParts += + expectation.parts[i] instanceof Array + ? classEscape(expectation.parts[i][0]) + + "-" + + classEscape(expectation.parts[i][1]) + : classEscape(expectation.parts[i]); + } + return "[" + (expectation.inverted ? 
"^" : "") + escapedParts + "]"; + }, + any: function (expectation) { + return "any character"; + }, + end: function (expectation) { + return "end of input"; + }, + other: function (expectation) { + return expectation.description; + }, + }; + function hex(ch) { + return ch.charCodeAt(0).toString(16).toUpperCase(); + } + function literalEscape(s) { + return s + .replace(/\\/g, "\\\\") + .replace(/"/g, '\\"') + .replace(/\0/g, "\\0") + .replace(/\t/g, "\\t") + .replace(/\n/g, "\\n") + .replace(/\r/g, "\\r") + .replace(/[\x00-\x0F]/g, function (ch) { + return "\\x0" + hex(ch); + }) + .replace(/[\x10-\x1F\x7F-\x9F]/g, function (ch) { + return "\\x" + hex(ch); + }); + } + function classEscape(s) { + return s + .replace(/\\/g, "\\\\") + .replace(/\]/g, "\\]") + .replace(/\^/g, "\\^") + .replace(/-/g, "\\-") + .replace(/\0/g, "\\0") + .replace(/\t/g, "\\t") + .replace(/\n/g, "\\n") + .replace(/\r/g, "\\r") + .replace(/[\x00-\x0F]/g, function (ch) { + return "\\x0" + hex(ch); + }) + .replace(/[\x10-\x1F\x7F-\x9F]/g, function (ch) { + return "\\x" + hex(ch); + }); + } + function describeExpectation(expectation) { + return DESCRIBE_EXPECTATION_FNS[expectation.type](expectation); + } + function describeExpected(expected2) { + var descriptions = new Array(expected2.length), + i, + j; + for (i = 0; i < expected2.length; i++) { + descriptions[i] = describeExpectation(expected2[i]); + } + descriptions.sort(); + if (descriptions.length > 0) { + for (i = 1, j = 1; i < descriptions.length; i++) { + if (descriptions[i - 1] !== descriptions[i]) { + descriptions[j] = descriptions[i]; + j++; + } + } + descriptions.length = j; + } + switch (descriptions.length) { + case 1: + return descriptions[0]; + case 2: + return descriptions[0] + " or " + descriptions[1]; + default: + return ( + descriptions.slice(0, -1).join(", ") + + ", or " + + descriptions[descriptions.length - 1] + ); + } + } + function describeFound(found2) { + return found2 ? '"' + literalEscape(found2) + '"' : "end of input"; + } + return ( + "Expected " + + describeExpected(expected) + + " but " + + describeFound(found) + + " found." + ); + }; + function peg$parse(input, options) { + options = options !== void 0 ? 
options : {}; + var peg$FAILED = {}, + peg$startRuleFunctions = { s: peg$parses }, + peg$startRuleFunction = peg$parses, + peg$c0 = function (sentences) { + const result = []; + sentences.forEach((sent0) => { + sent0[0].forEach((sent1) => { + result.push(sent1); + }); + }); + return result; + }, + peg$c1 = function (sentences) { + return sentences.map((sent) => { + sent[0].push(sent[1]); + return sent[0].reduce((accu, str) => accu + str).trim(); + }); + }, + peg$c2 = function (open, sentences, close) { + const result = sentences.map((sent) => { + sent[0].push(sent[1]); + return sent[0].reduce((accu, str) => accu + str).trim(); + }); + result.unshift(open); + if (close) { + result.push(close); + } + return result; + }, + peg$c3 = function (seqs, end) { + const res = seqs.reduce((accu, seq) => accu.concat(seq)); + res.push(end); + return res; + }, + peg$c4 = function (tokens) { + const result = tokens.map((pair) => pair[0] + pair[1]); + return result; + }, + peg$c5 = function (open, tokens, end, close) { + const result = tokens.map((pair) => pair[0] + pair[1]); + result.unshift(open); + result.push(end); + result.push(close); + return result; + }, + peg$c6 = /^[ \t\n\r.?!]/, + peg$c7 = peg$classExpectation( + [" ", " ", "\n", "\r", ".", "?", "!"], + false, + false, + ), + peg$c8 = function () { + return text(); + }, + peg$c9 = /^[ \t\n\r]/, + peg$c10 = peg$classExpectation([" ", " ", "\n", "\r"], false, false), + peg$c11 = function (t) { + return t; + }, + peg$c12 = /^[^ \t\n\r!?([}"`)\]}"`0-9@]/, + peg$c13 = peg$classExpectation( + [ + " ", + " ", + "\n", + "\r", + "!", + "?", + "(", + "[", + "}", + '"', + "`", + ")", + "]", + "}", + '"', + "`", + ["0", "9"], + "@", + ], + true, + false, + ), + peg$c14 = function (word) { + const tmp = word.reduce((accu, elt) => accu + elt); + return knownAbbreviations.indexOf(tmp) > -1; + }, + peg$c15 = function (word) { + return text(); + }, + peg$c16 = /^[^ \t\n\r!?.([})\]}`"0-9@]/, + peg$c17 = peg$classExpectation( + [ + " ", + " ", + "\n", + "\r", + "!", + "?", + ".", + "(", + "[", + "}", + ")", + "]", + "}", + "`", + '"', + ["0", "9"], + "@", + ], + true, + false, + ), + peg$c18 = function () { + return text(); + }, + peg$c19 = /^[0-9]/, + peg$c20 = peg$classExpectation([["0", "9"]], false, false), + peg$c21 = peg$anyExpectation(), + peg$c22 = /^[a-z]/, + peg$c23 = peg$classExpectation([["a", "z"]], false, false), + peg$c24 = /^[@]/, + peg$c25 = peg$classExpectation(["@"], false, false), + peg$c26 = /^[.]/, + peg$c27 = peg$classExpectation(["."], false, false), + peg$c28 = "http://", + peg$c29 = peg$literalExpectation("http://", false), + peg$c30 = "https://", + peg$c31 = peg$literalExpectation("https://", false), + peg$c32 = /^[a-z0-9]/, + peg$c33 = peg$classExpectation( + [ + ["a", "z"], + ["0", "9"], + ], + false, + false, + ), + peg$c34 = /^[\/]/, + peg$c35 = peg$classExpectation(["/"], false, false), + peg$c36 = function () { + return text(); + }, + peg$c37 = /^[([{"'`\u2018]/, + peg$c38 = peg$classExpectation( + ["(", "[", "{", '"', "'", "`", "\u2018"], + false, + false, + ), + peg$c39 = /^[)\]}"'`\u2019]/, + peg$c40 = peg$classExpectation( + [")", "]", "}", '"', "'", "`", "\u2019"], + false, + false, + ), + peg$currPos = 0, + peg$savedPos = 0, + peg$posDetailsCache = [{ line: 1, column: 1 }], + peg$maxFailPos = 0, + peg$maxFailExpected = [], + peg$silentFails = 0, + peg$result; + if ("startRule" in options) { + if (!(options.startRule in peg$startRuleFunctions)) { + throw new Error( + `Can't start parsing from rule "` + options.startRule + 
'".', + ); + } + peg$startRuleFunction = peg$startRuleFunctions[options.startRule]; + } + function text() { + return input.substring(peg$savedPos, peg$currPos); + } + function location() { + return peg$computeLocation(peg$savedPos, peg$currPos); + } + function expected(description, location2) { + location2 = + location2 !== void 0 + ? location2 + : peg$computeLocation(peg$savedPos, peg$currPos); + throw peg$buildStructuredError( + [peg$otherExpectation(description)], + input.substring(peg$savedPos, peg$currPos), + location2, + ); + } + function error(message, location2) { + location2 = + location2 !== void 0 + ? location2 + : peg$computeLocation(peg$savedPos, peg$currPos); + throw peg$buildSimpleError(message, location2); + } + function peg$literalExpectation(text2, ignoreCase) { + return { type: "literal", text: text2, ignoreCase }; + } + function peg$classExpectation(parts, inverted, ignoreCase) { + return { type: "class", parts, inverted, ignoreCase }; + } + function peg$anyExpectation() { + return { type: "any" }; + } + function peg$endExpectation() { + return { type: "end" }; + } + function peg$otherExpectation(description) { + return { type: "other", description }; + } + function peg$computePosDetails(pos) { + var details = peg$posDetailsCache[pos], + p; + if (details) { + return details; + } else { + p = pos - 1; + while (!peg$posDetailsCache[p]) { + p--; + } + details = peg$posDetailsCache[p]; + details = { + line: details.line, + column: details.column, + }; + while (p < pos) { + if (input.charCodeAt(p) === 10) { + details.line++; + details.column = 1; + } else { + details.column++; + } + p++; + } + peg$posDetailsCache[pos] = details; + return details; + } + } + function peg$computeLocation(startPos, endPos) { + var startPosDetails = peg$computePosDetails(startPos), + endPosDetails = peg$computePosDetails(endPos); + return { + start: { + offset: startPos, + line: startPosDetails.line, + column: startPosDetails.column, + }, + end: { + offset: endPos, + line: endPosDetails.line, + column: endPosDetails.column, + }, + }; + } + function peg$fail(expected2) { + if (peg$currPos < peg$maxFailPos) { + return; + } + if (peg$currPos > peg$maxFailPos) { + peg$maxFailPos = peg$currPos; + peg$maxFailExpected = []; + } + peg$maxFailExpected.push(expected2); + } + function peg$buildSimpleError(message, location2) { + return new peg$SyntaxError(message, null, null, location2); + } + function peg$buildStructuredError(expected2, found, location2) { + return new peg$SyntaxError( + peg$SyntaxError.buildMessage(expected2, found), + expected2, + found, + location2, + ); + } + function peg$parses() { + var s0, s1, s2, s3, s4; + s0 = peg$currPos; + s1 = []; + s2 = peg$currPos; + s3 = peg$parseSentences(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + if (s2 === peg$FAILED) { + s2 = peg$currPos; + s3 = peg$parseQuotedSentences(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } + if (s2 !== peg$FAILED) { + while (s2 !== peg$FAILED) { + s1.push(s2); + s2 = peg$currPos; + s3 = peg$parseSentences(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + 
} else { + peg$currPos = s2; + s2 = peg$FAILED; + } + if (s2 === peg$FAILED) { + s2 = peg$currPos; + s3 = peg$parseQuotedSentences(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c0(s1); + } + s0 = s1; + return s0; + } + function peg$parseSentences() { + var s0, s1, s2, s3, s4; + s0 = peg$currPos; + s1 = []; + s2 = peg$currPos; + s3 = peg$parseSentence(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + if (s2 !== peg$FAILED) { + while (s2 !== peg$FAILED) { + s1.push(s2); + s2 = peg$currPos; + s3 = peg$parseSentence(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c1(s1); + } + s0 = s1; + return s0; + } + function peg$parseQuotedSentences() { + var s0, s1, s2, s3, s4, s5; + s0 = peg$currPos; + s1 = peg$parseOpenSymbol(); + if (s1 !== peg$FAILED) { + s2 = []; + s3 = peg$currPos; + s4 = peg$parseSentence(); + if (s4 !== peg$FAILED) { + s5 = peg$parseWhitespace(); + if (s5 !== peg$FAILED) { + s4 = [s4, s5]; + s3 = s4; + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + if (s3 !== peg$FAILED) { + while (s3 !== peg$FAILED) { + s2.push(s3); + s3 = peg$currPos; + s4 = peg$parseSentence(); + if (s4 !== peg$FAILED) { + s5 = peg$parseWhitespace(); + if (s5 !== peg$FAILED) { + s4 = [s4, s5]; + s3 = s4; + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + } + } else { + s2 = peg$FAILED; + } + if (s2 !== peg$FAILED) { + s3 = peg$parseCloseSymbol(); + if (s3 === peg$FAILED) { + s3 = null; + } + if (s3 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c2(s1, s2, s3); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseSentence() { + var s0, s1, s2; + s0 = peg$currPos; + s1 = []; + s2 = peg$parseTokenSeq(); + if (s2 === peg$FAILED) { + s2 = peg$parseQuotedTokenSeq(); + } + if (s2 !== peg$FAILED) { + while (s2 !== peg$FAILED) { + s1.push(s2); + s2 = peg$parseTokenSeq(); + if (s2 === peg$FAILED) { + s2 = peg$parseQuotedTokenSeq(); + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + s2 = peg$parseEndOfSentence(); + if (s2 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c3(s1, s2); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseTokenSeq() { + var s0, s1, s2, s3, s4; + s0 = peg$currPos; + s1 = []; + s2 = peg$currPos; + s3 = peg$parseToken(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + if (s2 !== peg$FAILED) { + while (s2 !== 
peg$FAILED) { + s1.push(s2); + s2 = peg$currPos; + s3 = peg$parseToken(); + if (s3 !== peg$FAILED) { + s4 = peg$parseWhitespace(); + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c4(s1); + } + s0 = s1; + return s0; + } + function peg$parseQuotedTokenSeq() { + var s0, s1, s2, s3, s4, s5; + s0 = peg$currPos; + s1 = peg$parseOpenSymbol(); + if (s1 !== peg$FAILED) { + s2 = []; + s3 = peg$currPos; + s4 = peg$parseToken(); + if (s4 !== peg$FAILED) { + s5 = peg$parseWhitespace(); + if (s5 !== peg$FAILED) { + s4 = [s4, s5]; + s3 = s4; + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + if (s3 !== peg$FAILED) { + while (s3 !== peg$FAILED) { + s2.push(s3); + s3 = peg$currPos; + s4 = peg$parseToken(); + if (s4 !== peg$FAILED) { + s5 = peg$parseWhitespace(); + if (s5 !== peg$FAILED) { + s4 = [s4, s5]; + s3 = s4; + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + } else { + peg$currPos = s3; + s3 = peg$FAILED; + } + } + } else { + s2 = peg$FAILED; + } + if (s2 !== peg$FAILED) { + s3 = peg$parseEndOfSentence(); + if (s3 !== peg$FAILED) { + s4 = peg$parseCloseSymbol(); + if (s4 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c5(s1, s2, s3, s4); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseEndOfSentence() { + var s0, s1, s2; + s0 = peg$currPos; + s1 = []; + if (peg$c6.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c7); + } + } + while (s2 !== peg$FAILED) { + s1.push(s2); + if (peg$c6.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c7); + } + } + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c8(); + } + s0 = s1; + return s0; + } + function peg$parseWhitespace() { + var s0, s1, s2; + s0 = peg$currPos; + s1 = []; + if (peg$c9.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c10); + } + } + while (s2 !== peg$FAILED) { + s1.push(s2); + if (peg$c9.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c10); + } + } + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c8(); + } + s0 = s1; + return s0; + } + function peg$parseToken() { + var s0, s1; + s0 = peg$currPos; + s1 = peg$parseURI(); + if (s1 === peg$FAILED) { + s1 = peg$parseEmail(); + if (s1 === peg$FAILED) { + s1 = peg$parseNumber(); + if (s1 === peg$FAILED) { + s1 = peg$parseAbbreviation(); + if (s1 === peg$FAILED) { + s1 = peg$parseWord(); + } + } + } + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c11(s1); + } + s0 = s1; + return s0; + } + function peg$parseAbbreviation() { + var s0, s1, s2; + s0 = peg$currPos; + s1 = []; + if (peg$c12.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + 
peg$fail(peg$c13); + } + } + if (s2 !== peg$FAILED) { + while (s2 !== peg$FAILED) { + s1.push(s2); + if (peg$c12.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c13); + } + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + peg$savedPos = peg$currPos; + s2 = peg$c14(s1); + if (s2) { + s2 = void 0; + } else { + s2 = peg$FAILED; + } + if (s2 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c15(s1); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseWord() { + var s0, s1, s2; + s0 = peg$currPos; + s1 = []; + if (peg$c16.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c17); + } + } + if (s2 !== peg$FAILED) { + while (s2 !== peg$FAILED) { + s1.push(s2); + if (peg$c16.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c17); + } + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c18(); + } + s0 = s1; + return s0; + } + function peg$parseNumber() { + var s0, s1, s2, s3, s4, s5; + s0 = peg$currPos; + s1 = []; + if (peg$c19.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c20); + } + } + if (s2 !== peg$FAILED) { + while (s2 !== peg$FAILED) { + s1.push(s2); + if (peg$c19.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c20); + } + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + s2 = peg$currPos; + if (input.length > peg$currPos) { + s3 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s3 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c21); + } + } + if (s3 !== peg$FAILED) { + s4 = []; + if (peg$c19.test(input.charAt(peg$currPos))) { + s5 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s5 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c20); + } + } + if (s5 !== peg$FAILED) { + while (s5 !== peg$FAILED) { + s4.push(s5); + if (peg$c19.test(input.charAt(peg$currPos))) { + s5 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s5 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c20); + } + } + } + } else { + s4 = peg$FAILED; + } + if (s4 !== peg$FAILED) { + s3 = [s3, s4]; + s2 = s3; + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + } else { + peg$currPos = s2; + s2 = peg$FAILED; + } + if (s2 === peg$FAILED) { + s2 = null; + } + if (s2 !== peg$FAILED) { + s3 = peg$parseCloseSymbol(); + if (s3 === peg$FAILED) { + s3 = null; + } + if (s3 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c8(); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseEmail() { + var s0, s1, s2, s3, s4, s5, s6; + s0 = peg$currPos; + s1 = []; + if (peg$c22.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c23); + } + } + if (s2 !== peg$FAILED) { + while 
(s2 !== peg$FAILED) { + s1.push(s2); + if (peg$c22.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c23); + } + } + } + } else { + s1 = peg$FAILED; + } + if (s1 !== peg$FAILED) { + if (peg$c24.test(input.charAt(peg$currPos))) { + s2 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s2 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c25); + } + } + if (s2 !== peg$FAILED) { + s3 = []; + if (peg$c22.test(input.charAt(peg$currPos))) { + s4 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s4 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c23); + } + } + if (s4 !== peg$FAILED) { + while (s4 !== peg$FAILED) { + s3.push(s4); + if (peg$c22.test(input.charAt(peg$currPos))) { + s4 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s4 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c23); + } + } + } + } else { + s3 = peg$FAILED; + } + if (s3 !== peg$FAILED) { + if (peg$c26.test(input.charAt(peg$currPos))) { + s4 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s4 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c27); + } + } + if (s4 !== peg$FAILED) { + s5 = []; + if (peg$c22.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c23); + } + } + if (s6 !== peg$FAILED) { + while (s6 !== peg$FAILED) { + s5.push(s6); + if (peg$c22.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c23); + } + } + } + } else { + s5 = peg$FAILED; + } + if (s5 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c8(); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseURI() { + var s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; + s0 = peg$currPos; + if (input.substr(peg$currPos, 7) === peg$c28) { + s1 = peg$c28; + peg$currPos += 7; + } else { + s1 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c29); + } + } + if (s1 === peg$FAILED) { + if (input.substr(peg$currPos, 8) === peg$c30) { + s1 = peg$c30; + peg$currPos += 8; + } else { + s1 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c31); + } + } + } + if (s1 === peg$FAILED) { + s1 = null; + } + if (s1 !== peg$FAILED) { + s2 = []; + if (peg$c32.test(input.charAt(peg$currPos))) { + s3 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s3 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + if (s3 !== peg$FAILED) { + while (s3 !== peg$FAILED) { + s2.push(s3); + if (peg$c32.test(input.charAt(peg$currPos))) { + s3 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s3 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + } + } else { + s2 = peg$FAILED; + } + if (s2 !== peg$FAILED) { + if (peg$c26.test(input.charAt(peg$currPos))) { + s3 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s3 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c27); + } + } + if (s3 !== peg$FAILED) { + s4 = peg$currPos; + s5 = []; + if (peg$c32.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + 
s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + if (s6 !== peg$FAILED) { + while (s6 !== peg$FAILED) { + s5.push(s6); + if (peg$c32.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + } + } else { + s5 = peg$FAILED; + } + if (s5 !== peg$FAILED) { + if (peg$c26.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c27); + } + } + if (s6 !== peg$FAILED) { + s5 = [s5, s6]; + s4 = s5; + } else { + peg$currPos = s4; + s4 = peg$FAILED; + } + } else { + peg$currPos = s4; + s4 = peg$FAILED; + } + if (s4 === peg$FAILED) { + s4 = null; + } + if (s4 !== peg$FAILED) { + s5 = []; + if (peg$c32.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + if (s6 !== peg$FAILED) { + while (s6 !== peg$FAILED) { + s5.push(s6); + if (peg$c32.test(input.charAt(peg$currPos))) { + s6 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s6 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + } + } else { + s5 = peg$FAILED; + } + if (s5 !== peg$FAILED) { + s6 = []; + s7 = peg$currPos; + s8 = []; + if (peg$c32.test(input.charAt(peg$currPos))) { + s9 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s9 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + if (s9 !== peg$FAILED) { + while (s9 !== peg$FAILED) { + s8.push(s9); + if (peg$c32.test(input.charAt(peg$currPos))) { + s9 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s9 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + } + } else { + s8 = peg$FAILED; + } + if (s8 !== peg$FAILED) { + if (peg$c34.test(input.charAt(peg$currPos))) { + s9 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s9 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c35); + } + } + if (s9 !== peg$FAILED) { + s8 = [s8, s9]; + s7 = s8; + } else { + peg$currPos = s7; + s7 = peg$FAILED; + } + } else { + peg$currPos = s7; + s7 = peg$FAILED; + } + while (s7 !== peg$FAILED) { + s6.push(s7); + s7 = peg$currPos; + s8 = []; + if (peg$c32.test(input.charAt(peg$currPos))) { + s9 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s9 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + if (s9 !== peg$FAILED) { + while (s9 !== peg$FAILED) { + s8.push(s9); + if (peg$c32.test(input.charAt(peg$currPos))) { + s9 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s9 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c33); + } + } + } + } else { + s8 = peg$FAILED; + } + if (s8 !== peg$FAILED) { + if (peg$c34.test(input.charAt(peg$currPos))) { + s9 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s9 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c35); + } + } + if (s9 !== peg$FAILED) { + s8 = [s8, s9]; + s7 = s8; + } else { + peg$currPos = s7; + s7 = peg$FAILED; + } + } else { + peg$currPos = s7; + s7 = peg$FAILED; + } + } + if (s6 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c36(); + s0 = s1; + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = 
peg$FAILED; + } + } else { + peg$currPos = s0; + s0 = peg$FAILED; + } + return s0; + } + function peg$parseOpenSymbol() { + var s0, s1; + s0 = peg$currPos; + if (peg$c37.test(input.charAt(peg$currPos))) { + s1 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s1 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c38); + } + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c8(); + } + s0 = s1; + return s0; + } + function peg$parseCloseSymbol() { + var s0, s1; + s0 = peg$currPos; + if (peg$c39.test(input.charAt(peg$currPos))) { + s1 = input.charAt(peg$currPos); + peg$currPos++; + } else { + s1 = peg$FAILED; + if (peg$silentFails === 0) { + peg$fail(peg$c40); + } + } + if (s1 !== peg$FAILED) { + peg$savedPos = s0; + s1 = peg$c8(); + } + s0 = s1; + return s0; + } + const knownAbbreviations = require_abbreviations_en().knownAbbreviations; + peg$result = peg$startRuleFunction(); + if (peg$result !== peg$FAILED && peg$currPos === input.length) { + return peg$result; + } else { + if (peg$result !== peg$FAILED && peg$currPos < input.length) { + peg$fail(peg$endExpectation()); + } + throw peg$buildStructuredError( + peg$maxFailExpected, + peg$maxFailPos < input.length ? input.charAt(peg$maxFailPos) : null, + peg$maxFailPos < input.length + ? peg$computeLocation(peg$maxFailPos, peg$maxFailPos + 1) + : peg$computeLocation(peg$maxFailPos, peg$maxFailPos), + ); + } + } + module.exports = { + SyntaxError: peg$SyntaxError, + parse: peg$parse, + }; + }, +}); + +// lib/natural/tokenizers/tokenizer.js +var require_tokenizer = cjs({ + "lib/natural/tokenizers/tokenizer.js"(exports, module) { + "use strict"; + var Tokenizer = class { + trim(array) { + while (array[array.length - 1] === "") { + array.pop(); + } + while (array[0] === "") { + array.shift(); + } + return array; + } + }; + module.exports = Tokenizer; + }, +}); + +// lib/natural/tokenizers/sentence_tokenizer_parser.js +var require_sentence_tokenizer_parser = cjs({ + "lib/natural/tokenizers/sentence_tokenizer_parser.js"(exports, module) { + var parser = require_parser_sentence_tokenizer(); + var Tokenizer = require_tokenizer(); + var SentenceTokenizer = class extends Tokenizer { + tokenize(text) { + return parser.parse(text); + } + }; + module.exports = SentenceTokenizer; + }, +}); +export default require_sentence_tokenizer_parser(); diff --git a/packages/core/src/node-parser/sentence-window.ts b/packages/core/src/node-parser/sentence-window.ts new file mode 100644 index 000000000..be6219a77 --- /dev/null +++ b/packages/core/src/node-parser/sentence-window.ts @@ -0,0 +1,85 @@ +import { randomUUID } from "@llamaindex/env"; +import { z } from "zod"; +import { + buildNodeFromSplits, + Document, + sentenceWindowNodeParserSchema, + TextNode, +} from "../schema"; +import { NodeParser } from "./base"; +import { splitBySentenceTokenizer, type TextSplitterFn } from "./utils"; + +export class SentenceWindowNodeParser extends NodeParser { + static DEFAULT_WINDOW_SIZE = 3; + static DEFAULT_WINDOW_METADATA_KEY = "window"; + static DEFAULT_ORIGINAL_TEXT_METADATA_KEY = "originalText"; + + windowSize: number; + windowMetadataKey: string; + originalTextMetadataKey: string; + sentenceSplitter: TextSplitterFn = splitBySentenceTokenizer(); + idGenerator: () => string = () => randomUUID(); + + constructor(params?: z.input<typeof sentenceWindowNodeParserSchema>) { + super(); + if (params) { + const parsedParams = sentenceWindowNodeParserSchema.parse(params); + this.windowSize = parsedParams.windowSize; + this.windowMetadataKey = 
parsedParams.windowMetadataKey; + this.originalTextMetadataKey = parsedParams.originalTextMetadataKey; + } else { + this.windowSize = SentenceWindowNodeParser.DEFAULT_WINDOW_SIZE; + this.windowMetadataKey = + SentenceWindowNodeParser.DEFAULT_WINDOW_METADATA_KEY; + this.originalTextMetadataKey = + SentenceWindowNodeParser.DEFAULT_ORIGINAL_TEXT_METADATA_KEY; + } + } + + override parseNodes(nodes: TextNode[], showProgress?: boolean): TextNode[] { + return nodes.reduce<TextNode[]>((allNodes, node) => { + const nodes = this.buildWindowNodesFromDocuments([node]); + return allNodes.concat(nodes); + }, []); + } + + buildWindowNodesFromDocuments(documents: Document[]): TextNode[] { + const allNodes: TextNode[] = []; + + for (const doc of documents) { + const text = doc.text; + const textSplits = this.sentenceSplitter(text); + const nodes = buildNodeFromSplits( + textSplits, + doc, + undefined, + this.idGenerator, + ); + + nodes.forEach((node, i) => { + const windowNodes = nodes.slice( + Math.max(0, i - this.windowSize), + Math.min(i + this.windowSize + 1, nodes.length), + ); + + node.metadata[this.windowMetadataKey] = windowNodes + .map((n) => n.text) + .join(" "); + node.metadata[this.originalTextMetadataKey] = node.text; + + node.excludedEmbedMetadataKeys.push( + this.windowMetadataKey, + this.originalTextMetadataKey, + ); + node.excludedLlmMetadataKeys.push( + this.windowMetadataKey, + this.originalTextMetadataKey, + ); + }); + + allNodes.push(...nodes); + } + + return allNodes; + } +} diff --git a/packages/core/src/node-parser/type.ts b/packages/core/src/node-parser/type.ts new file mode 100644 index 000000000..0974f761f --- /dev/null +++ b/packages/core/src/node-parser/type.ts @@ -0,0 +1,5 @@ +import type { Tokenizer } from "@llamaindex/env"; + +export type SplitterParams = { + tokenizer?: Tokenizer; +}; diff --git a/packages/core/src/node-parser/utils.ts b/packages/core/src/node-parser/utils.ts new file mode 100644 index 000000000..1b9410c2d --- /dev/null +++ b/packages/core/src/node-parser/utils.ts @@ -0,0 +1,53 @@ +import type { TextSplitter } from "./base"; +import SentenceTokenizerNew from "./sentence-tokenizer-parser.js"; + +export type TextSplitterFn = (text: string) => string[]; + +const truncateText = (text: string, textSplitter: TextSplitter): string => { + const chunks = textSplitter.splitText(text); + return chunks[0]; +}; + +const splitTextKeepSeparator = (text: string, separator: string): string[] => { + const parts = text.split(separator); + const result = parts.map((part, index) => + index > 0 ? 
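
A minimal usage sketch of the new `SentenceWindowNodeParser` above, assuming the `@llamaindex/core/node-parser` export path used by the migrated tests later in this patch; the expected values mirror the sentence-window test:

```ts
import { SentenceWindowNodeParser } from "@llamaindex/core/node-parser";
import { Document, MetadataMode } from "@llamaindex/core/schema";

// windowSize: 1 keeps one sentence of context on each side of every node.
const parser = new SentenceWindowNodeParser({ windowSize: 1 });
const nodes = parser.getNodesFromDocuments([
  new Document({ text: "Hello. Cat Mouse. Dog." }),
]);

nodes.map((n) => n.getContent(MetadataMode.NONE));
// => ["Hello.", "Cat Mouse.", "Dog."]
nodes[1].metadata[SentenceWindowNodeParser.DEFAULT_WINDOW_METADATA_KEY];
// => "Hello. Cat Mouse. Dog."
```
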
separator + part : part, + ); + return result.filter((s) => s); +}; + +export const splitBySep = ( + sep: string, + keepSep: boolean = true, +): TextSplitterFn => { + if (keepSep) { + return (text: string) => splitTextKeepSeparator(text, sep); + } else { + return (text: string) => text.split(sep); + } +}; + +export const splitByChar = (): TextSplitterFn => { + return (text: string) => text.split(""); +}; + +let sentenceTokenizer: SentenceTokenizerNew | null = null; + +export const splitBySentenceTokenizer = (): TextSplitterFn => { + if (!sentenceTokenizer) { + sentenceTokenizer = new SentenceTokenizerNew(); + } + const tokenizer = sentenceTokenizer; + return (text: string) => { + return tokenizer.tokenize(text); + }; +}; + +export const splitByRegex = (regex: string): TextSplitterFn => { + return (text: string) => text.match(new RegExp(regex, "g")) || []; +}; + +export const splitByPhraseRegex = (): TextSplitterFn => { + const regex = "[^,.;]+[,.;]?"; + return splitByRegex(regex); +}; diff --git a/packages/core/src/schema/node.ts b/packages/core/src/schema/node.ts index e716b8389..50ab9a973 100644 --- a/packages/core/src/schema/node.ts +++ b/packages/core/src/schema/node.ts @@ -450,3 +450,48 @@ export function splitNodesByType(nodes: BaseNode[]): NodesByType { } return result; } + +export function buildNodeFromSplits( + textSplits: string[], + doc: BaseNode, + refDoc: BaseNode = doc, + idGenerator: (idx: number, refDoc: BaseNode) => string = () => randomUUID(), +) { + const nodes: TextNode[] = []; + const relationships = { + [NodeRelationship.SOURCE]: refDoc.asRelatedNodeInfo(), + }; + + textSplits.forEach((textChunk, i) => { + if (doc instanceof ImageDocument) { + const imageNode = new ImageNode({ + id_: idGenerator(i, doc), + text: textChunk, + image: doc.image, + embedding: doc.embedding, + excludedEmbedMetadataKeys: [...doc.excludedEmbedMetadataKeys], + excludedLlmMetadataKeys: [...doc.excludedLlmMetadataKeys], + metadataSeparator: doc.metadataSeparator, + textTemplate: doc.textTemplate, + relationships: { ...relationships }, + }); + nodes.push(imageNode); + } else if (doc instanceof Document || doc instanceof TextNode) { + const node = new TextNode({ + id_: idGenerator(i, doc), + text: textChunk, + embedding: doc.embedding, + excludedEmbedMetadataKeys: [...doc.excludedEmbedMetadataKeys], + excludedLlmMetadataKeys: [...doc.excludedLlmMetadataKeys], + metadataSeparator: doc.metadataSeparator, + textTemplate: doc.textTemplate, + relationships: { ...relationships }, + }); + nodes.push(node); + } else { + throw new Error(`Unknown document type: ${doc.type}`); + } + }); + + return nodes; +} diff --git a/packages/core/src/schema/type.ts b/packages/core/src/schema/type.ts index 9a16e65ad..688c5ce88 100644 --- a/packages/core/src/schema/type.ts +++ b/packages/core/src/schema/type.ts @@ -1,5 +1,8 @@ import type { BaseNode } from "./node"; -export interface TransformComponent<Options extends Record<string, unknown>> { - transform(nodes: BaseNode[], options?: Options): Promise<BaseNode[]>; +export interface TransformComponent { + transform<Options extends Record<string, unknown>>( + nodes: BaseNode[], + options?: Options, + ): Promise<BaseNode[]>; } diff --git a/packages/core/src/schema/zod.ts b/packages/core/src/schema/zod.ts index 94cd4d2a4..5fa35d144 100644 --- a/packages/core/src/schema/zod.ts +++ b/packages/core/src/schema/zod.ts @@ -1,4 +1,5 @@ import { z } from "zod"; +import { Settings } from "../global"; export const anyFunctionSchema = z.function(z.tuple([]).rest(z.any()), z.any()); @@ 
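
A short sketch of how the splitter helpers in `utils.ts` above behave (illustrative only; it assumes the new node-parser index module re-exports them from `@llamaindex/core/node-parser`):

```ts
import {
  splitBySep,
  splitBySentenceTokenizer,
} from "@llamaindex/core/node-parser";

// splitBySep keeps the separator attached to the following part by default.
const byParagraph = splitBySep("\n\n\n");
byParagraph("one\n\n\ntwo"); // => ["one", "\n\n\ntwo"]
splitBySep("\n\n\n", false)("one\n\n\ntwo"); // => ["one", "two"]

// splitBySentenceTokenizer instantiates the bundled sentence tokenizer
// lazily, once per process, and reuses it in every returned function.
const bySentence = splitBySentenceTokenizer();
bySentence("This is a sentence. This is another sentence.");
// => roughly ["This is a sentence.", "This is another sentence."]
```
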
-16,3 +17,62 @@ export const baseToolSchema = z.object({ export const baseToolWithCallSchema = baseToolSchema.extend({ call: z.function(), }); + +export const sentenceSplitterSchema = z + .object({ + chunkSize: z + .number({ + description: "The token chunk size for each chunk.", + }) + .gt(0) + .optional() + .default(() => Settings.chunkSize ?? 1024), + chunkOverlap: z + .number({ + description: "The token overlap of each chunk when splitting.", + }) + .gte(0) + .optional() + .default(200), + separator: z + .string({ + description: "Default separator for splitting into words", + }) + .default(" "), + paragraphSeparator: z + .string({ + description: "Separator between paragraphs.", + }) + .optional() + .default("\n\n\n"), + secondaryChunkingRegex: z + .string({ + description: "Backup regex for splitting into sentences.", + }) + .optional() + .default("[^,.;。？！]+[,.;。？！]?"), + }) + .refine( + (data) => data.chunkOverlap < data.chunkSize, + "Chunk overlap must be less than chunk size.", + ); + +export const sentenceWindowNodeParserSchema = z.object({ + windowSize: z + .number({ + description: + "The number of sentences on each side of a sentence to capture.", + }) + .gt(0) + .default(3), + windowMetadataKey: z + .string({ + description: "The metadata key to store the sentence window under.", + }) + .default("window"), + originalTextMetadataKey: z + .string({ + description: "The metadata key to store the original sentence in.", + }) + .default("originalText"), +}); diff --git a/packages/core/src/utils/wrap-llm-event.ts b/packages/core/src/utils/wrap-llm-event.ts index 88e309a3f..cc0948f82 100644 --- a/packages/core/src/utils/wrap-llm-event.ts +++ b/packages/core/src/utils/wrap-llm-event.ts @@ -1,5 +1,5 @@ import { AsyncLocalStorage, randomUUID } from "@llamaindex/env"; -import { getCallbackManager } from "../global/settings/callback-manager"; +import { Settings } from "../global"; import type { ChatResponse, ChatResponseChunk, LLM, LLMChat } from "../llms"; export function wrapLLMEvent< @@ -21,7 +21,7 @@ export function wrapLLMEvent< LLMChat<AdditionalChatOptions, AdditionalMessageOptions>["chat"] > { const id = randomUUID(); - getCallbackManager().dispatchEvent("llm-start", { + Settings.callbackManager.dispatchEvent("llm-start", { id, messages: params[0].messages, }); @@ -55,7 +55,7 @@ export function wrapLLMEvent< ...chunk.options, }; } - getCallbackManager().dispatchEvent("llm-stream", { + Settings.callbackManager.dispatchEvent("llm-stream", { id, chunk, }); @@ -63,14 +63,14 @@ export function wrapLLMEvent< yield chunk; } snapshot(() => { - getCallbackManager().dispatchEvent("llm-end", { + Settings.callbackManager.dispatchEvent("llm-end", { id, response: finalResponse, }); }); }; } else { - getCallbackManager().dispatchEvent("llm-end", { + Settings.callbackManager.dispatchEvent("llm-end", { id, response, });
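
The Zod schemas added to `zod.ts` above move node-parser parameter validation to construction time. A hedged sketch of the resulting behavior, assuming `SentenceSplitter` parses its constructor input with `sentenceSplitterSchema` the same way `SentenceWindowNodeParser` parses its own params:

```ts
import { SentenceSplitter } from "@llamaindex/core/node-parser";

// Defaults come from the schema: chunkSize 1024 (or Settings.chunkSize),
// chunkOverlap 200, separator " ", paragraphSeparator "\n\n\n".
const splitter = new SentenceSplitter();

// Invalid combinations now fail fast with a ZodError carrying the
// refinement message defined above, instead of silently mis-chunking:
new SentenceSplitter({ chunkSize: 100, chunkOverlap: 200 });
// throws: "Chunk overlap must be less than chunk size."
```

diff --git a/packages/llamaindex/tests/nodeParsers/MarkdownNodeParser.test.ts b/packages/core/tests/node-parser/markdown.test.ts similarity index 78% rename from packages/llamaindex/tests/nodeParsers/MarkdownNodeParser.test.ts rename to packages/core/tests/node-parser/markdown.test.ts index 189adf740..4ebea6411 100644 --- a/packages/llamaindex/tests/nodeParsers/MarkdownNodeParser.test.ts +++ b/packages/core/tests/node-parser/markdown.test.ts @@ -1,5 +1,5 @@ +import { MarkdownNodeParser } from "@llamaindex/core/node-parser"; import { Document, MetadataMode } from "@llamaindex/core/schema"; -import { MarkdownNodeParser } from "llamaindex/nodeParsers/index"; import { describe, expect,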
test } from "vitest"; describe("MarkdownNodeParser", () => { @@ -19,8 +19,8 @@ Header 2 content ]); expect(splits.length).toBe(2); - expect(splits[0].metadata).toEqual({ "Header 1": "Main Header" }); - expect(splits[1].metadata).toEqual({ "Header 1": "Header 2" }); + expect(splits[0].metadata).toEqual({ Header_1: "Main Header" }); + expect(splits[1].metadata).toEqual({ Header_1: "Header 2" }); expect(splits[0].getContent(MetadataMode.NONE)).toStrictEqual( "Main Header\n\nHeader 1 content", ); @@ -89,16 +89,16 @@ Content }), ]); expect(splits.length).toBe(4); - expect(splits[0].metadata).toEqual({ "Header 1": "Main Header" }); + expect(splits[0].metadata).toEqual({ Header_1: "Main Header" }); expect(splits[1].metadata).toEqual({ - "Header 1": "Main Header", - "Header 2": "Sub-header", + Header_1: "Main Header", + Header_2: "Sub-header", }); expect(splits[2].metadata).toEqual({ - "Header 1": "Main Header", - "Header 2": "Sub-header", - "Header 3": "Sub-sub header", + Header_1: "Main Header", + Header_2: "Sub-header", + Header_3: "Sub-sub header", }); - expect(splits[3].metadata).toEqual({ "Header 1": "New title" }); + expect(splits[3].metadata).toEqual({ Header_1: "New title" }); }); }); diff --git a/packages/llamaindex/tests/nodeParsers/SimpleNodeParser.test.ts b/packages/core/tests/node-parser/sentence-spiller.test.ts similarity index 61% rename from packages/llamaindex/tests/nodeParsers/SimpleNodeParser.test.ts rename to packages/core/tests/node-parser/sentence-spiller.test.ts index a69a4f554..d29bbee2c 100644 --- a/packages/llamaindex/tests/nodeParsers/SimpleNodeParser.test.ts +++ b/packages/core/tests/node-parser/sentence-spiller.test.ts @@ -1,12 +1,13 @@ +import { SentenceSplitter } from "@llamaindex/core/node-parser"; import { Document } from "@llamaindex/core/schema"; -import { SimpleNodeParser } from "llamaindex/nodeParsers/index"; +import { tokenizers } from "@llamaindex/env"; import { beforeEach, describe, expect, test } from "vitest"; -describe("SimpleNodeParser", () => { - let simpleNodeParser: SimpleNodeParser; +describe("SentenceSplitter", () => { + let sentenceSplitter: SentenceSplitter; beforeEach(() => { - simpleNodeParser = new SimpleNodeParser({ + sentenceSplitter = new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20, }); @@ -19,7 +20,7 @@ describe("SimpleNodeParser", () => { excludedLlmMetadataKeys: ["animals"], excludedEmbedMetadataKeys: ["animals"], }); - const result = simpleNodeParser.getNodesFromDocuments([doc]); + const result = sentenceSplitter.getNodesFromDocuments([doc]); expect(result.length).toEqual(1); const node = result[0]; // check not the same object @@ -37,4 +38,15 @@ describe("SimpleNodeParser", () => { // check relationship expect(node.sourceNode?.nodeId).toBe(doc.id_); }); + + test("split long text", async () => { + const longSentence = "is ".repeat(9000) + "."; + const document = new Document({ text: longSentence, id_: "1" }); + const result = sentenceSplitter.getNodesFromDocuments([document]); + expect(result.length).toEqual(9); + result.forEach((node) => { + const { length } = tokenizers.tokenizer().encode(node.text); + expect(length).toBeLessThanOrEqual(1024); + }); + }); }); diff --git a/packages/llamaindex/tests/nodeParsers/SentenceWindowNodeParser.test.ts b/packages/core/tests/node-parser/sentence-window.test.ts similarity index 77% rename from packages/llamaindex/tests/nodeParsers/SentenceWindowNodeParser.test.ts rename to packages/core/tests/node-parser/sentence-window.test.ts index 6bf7473a9..0d580999a 100644 --- 
a/packages/llamaindex/tests/nodeParsers/SentenceWindowNodeParser.test.ts +++ b/packages/core/tests/node-parser/sentence-window.test.ts @@ -1,8 +1,5 @@ +import { SentenceWindowNodeParser } from "@llamaindex/core/node-parser"; import { Document, MetadataMode } from "@llamaindex/core/schema"; -import { - DEFAULT_WINDOW_METADATA_KEY, - SentenceWindowNodeParser, -} from "llamaindex/nodeParsers/index"; import { describe, expect, test } from "vitest"; describe("Tests for the SentenceWindowNodeParser class", () => { @@ -11,7 +8,7 @@ describe("Tests for the SentenceWindowNodeParser class", () => { expect(sentenceWindowNodeParser).toBeDefined(); }); test("testing the getNodesFromDocuments method", () => { - const sentenceWindowNodeParser = SentenceWindowNodeParser.fromDefaults({ + const sentenceWindowNodeParser = new SentenceWindowNodeParser({ windowSize: 1, }); const doc = new Document({ text: "Hello. Cat Mouse. Dog." }); @@ -25,7 +22,9 @@ describe("Tests for the SentenceWindowNodeParser class", () => { "Dog.", ]); expect( - resultingNodes.map((n) => n.metadata[DEFAULT_WINDOW_METADATA_KEY]), + resultingNodes.map( + (n) => n.metadata[SentenceWindowNodeParser.DEFAULT_WINDOW_METADATA_KEY], + ), ).toEqual([ "Hello. Cat Mouse.", "Hello. Cat Mouse. Dog.", diff --git a/packages/llamaindex/tests/TextSplitter.test.ts b/packages/core/tests/node-parser/text-splitter.test.ts similarity index 83% rename from packages/llamaindex/tests/TextSplitter.test.ts rename to packages/core/tests/node-parser/text-splitter.test.ts index 90ae962a0..7618e781a 100644 --- a/packages/llamaindex/tests/TextSplitter.test.ts +++ b/packages/core/tests/node-parser/text-splitter.test.ts @@ -1,7 +1,4 @@ -import { - SentenceSplitter, - cjkSentenceTokenizer, -} from "llamaindex/TextSplitter"; +import { SentenceSplitter } from "@llamaindex/core/node-parser"; import { describe, expect, test } from "vitest"; describe("SentenceSplitter", () => { @@ -10,14 +7,16 @@ describe("SentenceSplitter", () => { expect(sentenceSplitter).toBeDefined(); }); + test("chunk overlap should be less than chunk size", async () => {}); + test("splits paragraphs w/o effective chunk size", () => { const sentenceSplitter = new SentenceSplitter({ - paragraphSeparator: "\n\n\n", + chunkSize: 9, + chunkOverlap: 0, }); // generate the same line as above but correct syntax errors - const splits = sentenceSplitter.getParagraphSplits( + const splits = sentenceSplitter.splitText( "This is a paragraph.\n\n\nThis is another paragraph.", - undefined, ); expect(splits).toEqual([ "This is a paragraph.", @@ -30,9 +29,8 @@ paragraphSeparator: "\n", }); // generate the same line as above but correct syntax errors - const splits = sentenceSplitter.getParagraphSplits( + const splits = sentenceSplitter.splitText( "This is a paragraph.\nThis is another paragraph.", - 1000, ); expect(splits).toEqual([ "This is a paragraph.\nThis is another paragraph.", ]); }); test("splits sentences", () => { - const sentenceSplitter = new SentenceSplitter(); - const splits = sentenceSplitter.getSentenceSplits( + const sentenceSplitter = new SentenceSplitter({ + chunkSize: 9, + chunkOverlap: 0, + }); + const splits = sentenceSplitter.splitText( "This is a sentence.
This is another sentence.", - undefined, ); expect(splits).toEqual([ "This is a sentence.", "This is another sentence.", ]); }); @@ -89,9 +89,10 @@ test("splits cjk", () => { const sentenceSplitter = new SentenceSplitter({ - chunkSize: 12, + chunkSize: 30, chunkOverlap: 0, - chunkingTokenizerFn: cjkSentenceTokenizer, + secondaryChunkingRegex: + '.*?([﹒﹔﹖﹗．；。！？]["’”」』]{0,2}|：(?=["‘“「『]{1,2}|$))', }); const splits = sentenceSplitter.splitText( diff --git a/packages/llamaindex/e2e/node/openai.e2e.ts b/packages/llamaindex/e2e/node/openai.e2e.ts index 7193e4644..63f0a1e3f 100644 --- a/packages/llamaindex/e2e/node/openai.e2e.ts +++ b/packages/llamaindex/e2e/node/openai.e2e.ts @@ -7,8 +7,8 @@ import { OpenAI, OpenAIAgent, QueryEngineTool, + SentenceSplitter, Settings, - SimpleNodeParser, SimpleToolNodeMapping, SubQuestionQueryEngine, SummaryIndex, @@ -124,7 +124,7 @@ await test("agent with object retriever", async (t) => { const alexInfoText = await readFile(alexInfoPath, "utf-8"); const alexDocument = new Document({ text: alexInfoText, id_: alexInfoPath }); - const nodes = new SimpleNodeParser({ + const nodes = new SentenceSplitter({ chunkSize: 200, chunkOverlap: 20, }).getNodesFromDocuments([alexDocument]); diff --git a/packages/llamaindex/e2e/node/snapshot/agent.snap b/packages/llamaindex/e2e/node/snapshot/agent.snap index bf57c1111..28ca46ff7 100644 --- a/packages/llamaindex/e2e/node/snapshot/agent.snap +++ b/packages/llamaindex/e2e/node/snapshot/agent.snap @@ -22,7 +22,7 @@ "options": { "toolCall": [ { - "id": "call_sH6QfjsymHW7JFl68j8AY6xg", + "id": "call_8kF02T5eJKwUL5hCGF8upWgn", "name": "Weather", "input": "{\"location\":\"San Francisco\"}" } @@ -30,13 +30,13 @@ } }, { - "role": "user", "content": "35 degrees and sunny in San Francisco", + "role": "user", "options": { "toolResult": { "result": "35 degrees and sunny in San Francisco", "isError": false, - "id": "call_sH6QfjsymHW7JFl68j8AY6xg" + "id": "call_8kF02T5eJKwUL5hCGF8upWgn" } } } @@ -64,7 +64,7 @@ "options": { "toolCall": [ { - "id": "call_V7zs8cyDT5FqJhjwBqcCydgA", + "id": "call_xsgmMFgliEDiOmZuLaBjUiXE", "name": "unique_id", "input": "{\"firstName\":\"Alex\",\"lastName\":\"Yang\"}" } @@ -72,13 +72,13 @@ } }, { - "role": "user", "content": "123456789", + "role": "user", "options": { "toolResult": { "result": "123456789", "isError": false, - "id": "call_V7zs8cyDT5FqJhjwBqcCydgA" + "id": "call_xsgmMFgliEDiOmZuLaBjUiXE" } } } @@ -106,7 +106,7 @@ "options": { "toolCall": [ { - "id": "call_BrlGGU6GDGWr0hrwXt9qZKyt", + "id": "call_OTMrLMikpT37PLm6KOIG5LCF", "name": "sumNumbers", "input": "{\"a\":1,\"b\":1}" } @@ -114,13 +114,13 @@ } }, { - "role": "user", "content": "2", + "role": "user", "options": { "toolResult": { "result": "2", "isError": false, - "id": "call_BrlGGU6GDGWr0hrwXt9qZKyt" + "id": "call_OTMrLMikpT37PLm6KOIG5LCF" } } } @@ -138,7 +138,7 @@ "options": { "toolCall": [ { - "id": "call_sH6QfjsymHW7JFl68j8AY6xg", + "id": "call_8kF02T5eJKwUL5hCGF8upWgn", "name": "Weather", "input": "{\"location\":\"San Francisco\"}" } @@ -168,7 +168,7 @@ "options": { "toolCall": [ { - "id": "call_V7zs8cyDT5FqJhjwBqcCydgA", + "id": "call_xsgmMFgliEDiOmZuLaBjUiXE", "name": "unique_id", "input": "{\"firstName\":\"Alex\",\"lastName\":\"Yang\"}" } @@ -198,7 +198,7 @@ "options": { "toolCall": [ { - "id": "call_BrlGGU6GDGWr0hrwXt9qZKyt", + "id": "call_OTMrLMikpT37PLm6KOIG5LCF", "name": "sumNumbers", "input": "{\"a\":1,\"b\":1}" }
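
The `openai.e2e.ts` change above doubles as the migration recipe for the removed `SimpleNodeParser`: `SentenceSplitter` is the drop-in replacement, taking the same options and exposing the same `getNodesFromDocuments` call. A sketch (the document text here is illustrative):

```ts
import { Document, SentenceSplitter } from "llamaindex";

// Before this patch: new SimpleNodeParser({ chunkSize: 200, chunkOverlap: 20 })
const nodes = new SentenceSplitter({
  chunkSize: 200,
  chunkOverlap: 20,
}).getNodesFromDocuments([
  new Document({ text: "Some document text to chunk." }),
]);
```

diff --git a/packages/llamaindex/e2e/node/snapshot/agent_stream.snap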
b/packages/llamaindex/e2e/node/snapshot/agent_stream.snap index 87fed616d..0174e339e 100644 --- a/packages/llamaindex/e2e/node/snapshot/agent_stream.snap +++ b/packages/llamaindex/e2e/node/snapshot/agent_stream.snap @@ -23,7 +23,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": { "a": 16, "b": 2 @@ -31,7 +31,7 @@ }, { "name": "sumNumbers", - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX", + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d", "input": "{\"a\": 8, \"b\": 20}" } ] @@ -44,7 +44,7 @@ "toolResult": { "result": "8", "isError": false, - "id": "call_V4daSdWk9QeYeSMKLBisNbSf" + "id": "call_XgB0tixgYDhGXXSgdY19XDmt" } } }, @@ -55,7 +55,7 @@ "toolResult": { "result": "28", "isError": false, - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX" + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d" } } } @@ -74,7 +74,7 @@ "toolCall": [ { "name": "sumNumbers", - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX", + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d", "input": "{\"a\": 8, \"b\": 20}" } ] @@ -87,7 +87,7 @@ "response": { "raw": null, "message": { - "content": "The result of dividing 16 by 2 is 8. When you add 20 to 8, the total is 28.", + "content": "After dividing 16 by 2, we get 8. Adding 20 to 8 gives us 28.", "role": "assistant", "options": {} } @@ -103,7 +103,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": "{\"a\": 16, \"b\": 2}" } ] @@ -119,7 +119,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": "{\"a\": 16, \"b\": 2}" } ] @@ -135,7 +135,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": "{\"a\": 16, \"b\": 2}" } ] @@ -151,7 +151,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": "{\"a\": 16, \"b\": 2}" } ] @@ -167,7 +167,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": "{\"a\": 16, \"b\": 2}" } ] @@ -183,7 +183,7 @@ "toolCall": [ { "name": "divideNumbers", - "id": "call_V4daSdWk9QeYeSMKLBisNbSf", + "id": "call_XgB0tixgYDhGXXSgdY19XDmt", "input": { "a": 16, "b": 2 @@ -202,7 +202,7 @@ "toolCall": [ { "name": "sumNumbers", - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX", + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d", "input": "{\"a\": 8, \"b\": 20}" } ] @@ -218,7 +218,7 @@ "toolCall": [ { "name": "sumNumbers", - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX", + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d", "input": "{\"a\": 8, \"b\": 20}" } ] @@ -234,7 +234,7 @@ "toolCall": [ { "name": "sumNumbers", - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX", + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d", "input": "{\"a\": 8, \"b\": 20}" } ] @@ -250,7 +250,7 @@ "toolCall": [ { "name": "sumNumbers", - "id": "call_n2OlBxlaoeMIMVeU9DeDfiPX", + "id": "call_nXP1Rc85Ntdv5XcI6FBWmB6d", "input": "{\"a\": 8, \"b\": 20}" } ] @@ -263,23 +263,7 @@ "chunk": { "raw": null, "options": {}, - "delta": "The" - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "options": {}, - "delta": " result" - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "options": {}, - "delta": " of" + "delta": "After" } }, { @@ -335,7 +319,7 @@ "chunk": { "raw": null, "options": {}, - "delta": " is" + "delta": "," } }, { @@ -343,7 +327,7 @@ "chunk": { "raw": null, "options": {}, - "delta": " " + 
"delta": " we" } }, { @@ -351,7 +335,7 @@ "chunk": { "raw": null, "options": {}, - "delta": "8" + "delta": " get" } }, { @@ -359,7 +343,7 @@ "chunk": { "raw": null, "options": {}, - "delta": "." + "delta": " " } }, { @@ -367,7 +351,7 @@ "chunk": { "raw": null, "options": {}, - "delta": " When" + "delta": "8" } }, { @@ -375,7 +359,7 @@ "chunk": { "raw": null, "options": {}, - "delta": " you" + "delta": "." } }, { @@ -383,7 +367,7 @@ "chunk": { "raw": null, "options": {}, - "delta": " add" + "delta": " Adding" } }, { @@ -431,23 +415,7 @@ "chunk": { "raw": null, "options": {}, - "delta": "," - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "options": {}, - "delta": " the" - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "options": {}, - "delta": " total" + "delta": " gives" } }, { @@ -455,7 +423,7 @@ "chunk": { "raw": null, "options": {}, - "delta": " is" + "delta": " us" } }, { diff --git a/packages/llamaindex/e2e/node/snapshot/agent_with_object_function_call.snap b/packages/llamaindex/e2e/node/snapshot/agent_with_object_function_call.snap index f0209e8a9..9348b26fe 100644 --- a/packages/llamaindex/e2e/node/snapshot/agent_with_object_function_call.snap +++ b/packages/llamaindex/e2e/node/snapshot/agent_with_object_function_call.snap @@ -22,7 +22,7 @@ "options": { "toolCall": [ { - "id": "call_uERMumWlJLTO2GW93X6C2W3N", + "id": "call_FNnPEPNeSDmdDj7b5x4LIxBG", "name": "get_weather", "input": "{\"location\":\"San Francisco\"}" } @@ -30,8 +30,8 @@ } }, { - "role": "user", "content": "{\n location: San Francisco,\n temperature: 72,\n weather: cloudy,\n rain_prediction: 0.89\n}", + "role": "user", "options": { "toolResult": { "result": { @@ -41,7 +41,7 @@ "rain_prediction": 0.89 }, "isError": false, - "id": "call_uERMumWlJLTO2GW93X6C2W3N" + "id": "call_FNnPEPNeSDmdDj7b5x4LIxBG" } } } @@ -59,7 +59,7 @@ "options": { "toolCall": [ { - "id": "call_uERMumWlJLTO2GW93X6C2W3N", + "id": "call_FNnPEPNeSDmdDj7b5x4LIxBG", "name": "get_weather", "input": "{\"location\":\"San Francisco\"}" } diff --git a/packages/llamaindex/e2e/node/snapshot/agent_with_object_retriever.snap b/packages/llamaindex/e2e/node/snapshot/agent_with_object_retriever.snap index 82c55b32a..446596243 100644 --- a/packages/llamaindex/e2e/node/snapshot/agent_with_object_retriever.snap +++ b/packages/llamaindex/e2e/node/snapshot/agent_with_object_retriever.snap @@ -17,7 +17,7 @@ "id": "PRESERVE_1", "messages": [ { - "content": "Context information is below.\n---------------------\nAlex is a male. What's very important, Alex is not in the Brazil.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: Alex\nAnswer:", + "content": "Context information is below.\n---------------------\nAlex is a male.\nWhat's very important, Alex is not in the Brazil.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: Alex\nAnswer:", "role": "user" } ] @@ -26,7 +26,7 @@ "id": "PRESERVE_2", "messages": [ { - "content": "Context information is below.\n---------------------\nAlex is a male. 
What's very important, Alex is not in the Brazil.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: Brazil\nAnswer:", + "content": "Context information is below.\n---------------------\nAlex is a male.\nWhat's very important, Alex is not in the Brazil.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: Brazil\nAnswer:", "role": "user" } ] @@ -48,12 +48,12 @@ "options": { "toolCall": [ { - "id": "call_vv1XW3xv4j2us5sZOtzCU2lL", + "id": "call_xkMkcEJYa2vVFUpg1Ng3UZTK", "name": "summary_tool", "input": "{\"query\": \"Alex\"}" }, { - "id": "call_V36LMHbwUJkEa20A3GoA4wIr", + "id": "call_BYhP9Coo4NIBJDFaEKRA5qSl", "name": "summary_tool", "input": "{\"query\": \"Brazil\"}" } @@ -61,24 +61,24 @@ } }, { - "role": "user", "content": "Alex is not in Brazil.", + "role": "user", "options": { "toolResult": { "result": "Alex is not in Brazil.", "isError": false, - "id": "call_vv1XW3xv4j2us5sZOtzCU2lL" + "id": "call_xkMkcEJYa2vVFUpg1Ng3UZTK" } } }, { - "role": "user", "content": "Alex is not in Brazil.", + "role": "user", "options": { "toolResult": { "result": "Alex is not in Brazil.", "isError": false, - "id": "call_V36LMHbwUJkEa20A3GoA4wIr" + "id": "call_BYhP9Coo4NIBJDFaEKRA5qSl" } } } @@ -96,12 +96,12 @@ "options": { "toolCall": [ { - "id": "call_vv1XW3xv4j2us5sZOtzCU2lL", + "id": "call_xkMkcEJYa2vVFUpg1Ng3UZTK", "name": "summary_tool", "input": "{\"query\": \"Alex\"}" }, { - "id": "call_V36LMHbwUJkEa20A3GoA4wIr", + "id": "call_BYhP9Coo4NIBJDFaEKRA5qSl", "name": "summary_tool", "input": "{\"query\": \"Brazil\"}" } diff --git a/packages/llamaindex/e2e/node/snapshot/anthropic-agent-multiple-chat.snap b/packages/llamaindex/e2e/node/snapshot/anthropic-agent-multiple-chat.snap index 141f6c773..23aae2126 100644 --- a/packages/llamaindex/e2e/node/snapshot/anthropic-agent-multiple-chat.snap +++ b/packages/llamaindex/e2e/node/snapshot/anthropic-agent-multiple-chat.snap @@ -20,7 +20,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -43,7 +43,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -57,7 +57,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"No\".\n</thinking>\n\nNo" + "text": "<thinking>\nThe user has now asked me to respond with the word \"No\" instead of \"Yes\". Once again, this is a very simple and direct request. No additional information or tool usage is needed. 
I can immediately provide the response the user has requested by simply replying with the word \"No\".\n</thinking>\n\nNo" } ], "role": "assistant", @@ -80,7 +80,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -94,7 +94,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"No\".\n</thinking>\n\nNo" + "text": "<thinking>\nThe user has now asked me to respond with the word \"No\" instead of \"Yes\". Once again, this is a very simple and direct request. No additional information or tool usage is needed. I can immediately provide the response the user has requested by simply replying with the word \"No\".\n</thinking>\n\nNo" } ], "role": "assistant", @@ -108,7 +108,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Maybe\".\n</thinking>\n\nMaybe" + "text": "<thinking>\nThe user has made another simple request, this time asking me to respond with the word \"Maybe\". Just like the previous requests for \"Yes\" and \"No\", I have all the necessary information to fulfill this request without needing to use any tools or gather additional details. I can provide the requested response by replying with only the word \"Maybe\".\n</thinking>\n\nMaybe" } ], "role": "assistant", @@ -131,7 +131,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -145,7 +145,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"No\".\n</thinking>\n\nNo" + "text": "<thinking>\nThe user has now asked me to respond with the word \"No\" instead of \"Yes\". Once again, this is a very simple and direct request. No additional information or tool usage is needed. I can immediately provide the response the user has requested by simply replying with the word \"No\".\n</thinking>\n\nNo" } ], "role": "assistant", @@ -159,7 +159,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Maybe\".\n</thinking>\n\nMaybe" + "text": "<thinking>\nThe user has made another simple request, this time asking me to respond with the word \"Maybe\". 
Just like the previous requests for \"Yes\" and \"No\", I have all the necessary information to fulfill this request without needing to use any tools or gather additional details. I can provide the requested response by replying with only the word \"Maybe\".\n</thinking>\n\nMaybe" } ], "role": "assistant", @@ -173,14 +173,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific city, San Francisco. The relevant tool to answer this is the getWeather function.\n\nLooking at the required parameters for getWeather:\ncity (string): The user directly provided the city \"San Francisco\"\n\nSince the required \"city\" parameter has been provided, we can proceed with the getWeather function call.\n</thinking>" + "text": "<thinking>\nThe user has asked for the current weather in San Francisco. To answer this, I will need to use the getWeather tool.\n\nLooking at the parameters for getWeather, I see it requires a single parameter:\n- city (string): The city to get the weather for \n\nThe user has directly provided the value for the city parameter in their request - they want the weather for San Francisco.\n\nI have all the required information to make the getWeather tool call, so I will proceed with calling the tool with \"San Francisco\" as the city parameter value.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe", + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5", "name": "getWeather", "input": { "city": "San Francisco" @@ -196,7 +196,7 @@ "toolResult": { "result": "The weather in San Francisco is 72 degrees", "isError": false, - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe" + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5" } } } @@ -213,7 +213,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -227,7 +227,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"No\".\n</thinking>\n\nNo" + "text": "<thinking>\nThe user has now asked me to respond with the word \"No\" instead of \"Yes\". Once again, this is a very simple and direct request. No additional information or tool usage is needed. I can immediately provide the response the user has requested by simply replying with the word \"No\".\n</thinking>\n\nNo" } ], "role": "assistant", @@ -241,7 +241,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Maybe\".\n</thinking>\n\nMaybe" + "text": "<thinking>\nThe user has made another simple request, this time asking me to respond with the word \"Maybe\". Just like the previous requests for \"Yes\" and \"No\", I have all the necessary information to fulfill this request without needing to use any tools or gather additional details. 
I can provide the requested response by replying with only the word \"Maybe\".\n</thinking>\n\nMaybe" } ], "role": "assistant", @@ -255,14 +255,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific city, San Francisco. The relevant tool to answer this is the getWeather function.\n\nLooking at the required parameters for getWeather:\ncity (string): The user directly provided the city \"San Francisco\"\n\nSince the required \"city\" parameter has been provided, we can proceed with the getWeather function call.\n</thinking>" + "text": "<thinking>\nThe user has asked for the current weather in San Francisco. To answer this, I will need to use the getWeather tool.\n\nLooking at the parameters for getWeather, I see it requires a single parameter:\n- city (string): The city to get the weather for \n\nThe user has directly provided the value for the city parameter in their request - they want the weather for San Francisco.\n\nI have all the required information to make the getWeather tool call, so I will proceed with calling the tool with \"San Francisco\" as the city parameter value.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe", + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5", "name": "getWeather", "input": { "city": "San Francisco" @@ -278,7 +278,7 @@ "toolResult": { "result": "The weather in San Francisco is 72 degrees", "isError": false, - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe" + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5" } } }, @@ -286,7 +286,7 @@ "content": [ { "type": "text", - "text": "The current weather in San Francisco is 72 degrees." + "text": "Based on the result from calling the getWeather tool, the current weather in San Francisco is 72 degrees." } ], "role": "assistant", @@ -309,7 +309,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -323,7 +323,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"No\".\n</thinking>\n\nNo" + "text": "<thinking>\nThe user has now asked me to respond with the word \"No\" instead of \"Yes\". Once again, this is a very simple and direct request. No additional information or tool usage is needed. I can immediately provide the response the user has requested by simply replying with the word \"No\".\n</thinking>\n\nNo" } ], "role": "assistant", @@ -337,7 +337,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Maybe\".\n</thinking>\n\nMaybe" + "text": "<thinking>\nThe user has made another simple request, this time asking me to respond with the word \"Maybe\". Just like the previous requests for \"Yes\" and \"No\", I have all the necessary information to fulfill this request without needing to use any tools or gather additional details. 
I can provide the requested response by replying with only the word \"Maybe\".\n</thinking>\n\nMaybe" } ], "role": "assistant", @@ -351,14 +351,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific city, San Francisco. The relevant tool to answer this is the getWeather function.\n\nLooking at the required parameters for getWeather:\ncity (string): The user directly provided the city \"San Francisco\"\n\nSince the required \"city\" parameter has been provided, we can proceed with the getWeather function call.\n</thinking>" + "text": "<thinking>\nThe user has asked for the current weather in San Francisco. To answer this, I will need to use the getWeather tool.\n\nLooking at the parameters for getWeather, I see it requires a single parameter:\n- city (string): The city to get the weather for \n\nThe user has directly provided the value for the city parameter in their request - they want the weather for San Francisco.\n\nI have all the required information to make the getWeather tool call, so I will proceed with calling the tool with \"San Francisco\" as the city parameter value.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe", + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5", "name": "getWeather", "input": { "city": "San Francisco" @@ -374,7 +374,7 @@ "toolResult": { "result": "The weather in San Francisco is 72 degrees", "isError": false, - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe" + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5" } } }, @@ -382,7 +382,7 @@ "content": [ { "type": "text", - "text": "The current weather in San Francisco is 72 degrees." + "text": "Based on the result from calling the getWeather tool, the current weather in San Francisco is 72 degrees." } ], "role": "assistant", @@ -396,14 +396,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific city, Shanghai. The relevant tool to answer this is the getWeather function.\n\nLooking at the required parameters for getWeather:\ncity (string): The user directly provided the city \"Shanghai\"\n\nSince the required \"city\" parameter has been provided, we can proceed with the getWeather function call.\n</thinking>" + "text": "<thinking>\nThe user has now asked for the current weather in Shanghai. This is very similar to the previous request for San Francisco's weather. I will once again need to use the getWeather tool to answer this.\n\nThe getWeather tool requires the following parameter:\n- city (string): The city to get the weather for\n\nThe user has provided the city they want the weather for directly in their request - in this case, the city is Shanghai.\n\nSince I have the value for the required city parameter, I can go ahead and call the getWeather tool, passing in \"Shanghai\" as the city.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_013ZnATGKhvPkt2jjxdFheDK", + "id": "toolu_01HxDqJpm33QxDXAExTxm6zu", "name": "getWeather", "input": { "city": "Shanghai" @@ -419,7 +419,7 @@ "toolResult": { "result": "The weather in Shanghai is 72 degrees", "isError": false, - "id": "toolu_013ZnATGKhvPkt2jjxdFheDK" + "id": "toolu_01HxDqJpm33QxDXAExTxm6zu" } } } @@ -435,7 +435,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. 
They have simply requested that I respond with the word \"Yes\".\n</thinking>\n\nYes" + "text": "<thinking>\nThe user has simply asked me to respond with the word \"Yes\". This is a very straightforward request that does not require the use of any tools. I have all the information I need to directly provide the response the user has requested.\n</thinking>\n\nYes" } ], "role": "assistant", @@ -451,7 +451,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"No\".\n</thinking>\n\nNo" + "text": "<thinking>\nThe user has now asked me to respond with the word \"No\" instead of \"Yes\". Once again, this is a very simple and direct request. No additional information or tool usage is needed. I can immediately provide the response the user has requested by simply replying with the word \"No\".\n</thinking>\n\nNo" } ], "role": "assistant", @@ -467,7 +467,7 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has not asked a question that requires using any of the available tools. They have simply requested that I respond with the word \"Maybe\".\n</thinking>\n\nMaybe" + "text": "<thinking>\nThe user has made another simple request, this time asking me to respond with the word \"Maybe\". Just like the previous requests for \"Yes\" and \"No\", I have all the necessary information to fulfill this request without needing to use any tools or gather additional details. I can provide the requested response by replying with only the word \"Maybe\".\n</thinking>\n\nMaybe" } ], "role": "assistant", @@ -483,14 +483,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific city, San Francisco. The relevant tool to answer this is the getWeather function.\n\nLooking at the required parameters for getWeather:\ncity (string): The user directly provided the city \"San Francisco\"\n\nSince the required \"city\" parameter has been provided, we can proceed with the getWeather function call.\n</thinking>" + "text": "<thinking>\nThe user has asked for the current weather in San Francisco. To answer this, I will need to use the getWeather tool.\n\nLooking at the parameters for getWeather, I see it requires a single parameter:\n- city (string): The city to get the weather for \n\nThe user has directly provided the value for the city parameter in their request - they want the weather for San Francisco.\n\nI have all the required information to make the getWeather tool call, so I will proceed with calling the tool with \"San Francisco\" as the city parameter value.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_017MxwaxaLYkmt4dpP5HKzFe", + "id": "toolu_01Q5tWdx8CTrqUaQty8C1PD5", "name": "getWeather", "input": { "city": "San Francisco" @@ -509,7 +509,7 @@ "content": [ { "type": "text", - "text": "The current weather in San Francisco is 72 degrees." + "text": "Based on the result from calling the getWeather tool, the current weather in San Francisco is 72 degrees." } ], "role": "assistant", @@ -525,14 +525,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific city, Shanghai. 
The relevant tool to answer this is the getWeather function.\n\nLooking at the required parameters for getWeather:\ncity (string): The user directly provided the city \"Shanghai\"\n\nSince the required \"city\" parameter has been provided, we can proceed with the getWeather function call.\n</thinking>" + "text": "<thinking>\nThe user has now asked for the current weather in Shanghai. This is very similar to the previous request for San Francisco's weather. I will once again need to use the getWeather tool to answer this.\n\nThe getWeather tool requires the following parameter:\n- city (string): The city to get the weather for\n\nThe user has provided the city they want the weather for directly in their request - in this case, the city is Shanghai.\n\nSince I have the value for the required city parameter, I can go ahead and call the getWeather tool, passing in \"Shanghai\" as the city.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_013ZnATGKhvPkt2jjxdFheDK", + "id": "toolu_01HxDqJpm33QxDXAExTxm6zu", "name": "getWeather", "input": { "city": "Shanghai" @@ -551,7 +551,7 @@ "content": [ { "type": "text", - "text": "The current weather in Shanghai is 72 degrees." + "text": "The getWeather tool indicates that the current weather in Shanghai is 72 degrees." } ], "role": "assistant", diff --git a/packages/llamaindex/e2e/node/snapshot/anthropic-agent.snap b/packages/llamaindex/e2e/node/snapshot/anthropic-agent.snap index 2763fbb13..15b4f948e 100644 --- a/packages/llamaindex/e2e/node/snapshot/anthropic-agent.snap +++ b/packages/llamaindex/e2e/node/snapshot/anthropic-agent.snap @@ -20,14 +20,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific location, San Francisco. The Weather tool is the relevant function to answer this request, as it returns weather information for a given location.\n\nThe Weather tool requires a single parameter:\n- location (string, required): The user has directly provided the location as \"San Francisco\".\n\nSince the required location parameter has been provided, we have enough information to call the Weather tool.\n</thinking>" + "text": "<thinking>\nThe Weather tool is relevant to answer this question, as it can provide weather information for a specified location.\n\nThe Weather tool requires a \"location\" parameter. The user has directly provided the location of \"San Francisco\" in their request.\n\nSince the required \"location\" parameter has been provided, we can proceed with calling the Weather tool to get the weather information for San Francisco. No other tools are needed.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_011YcKkPygw2woJZrVWRRMwH", + "id": "toolu_01Y6KpbYCFw4XGtvyXEmeTrB", "name": "Weather", "input": { "location": "San Francisco" @@ -43,7 +43,7 @@ "toolResult": { "result": "35 degrees and sunny in San Francisco", "isError": false, - "id": "toolu_011YcKkPygw2woJZrVWRRMwH" + "id": "toolu_01Y6KpbYCFw4XGtvyXEmeTrB" } } } @@ -69,14 +69,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe unique_id function takes firstName and lastName as required parameters. The user has provided their first name (Alex) and last name (Yang) in the request, so we have all the necessary information to call the function.\n</thinking>" + "text": "<thinking>\nThe unique_id tool looks relevant for answering this request, as it can provide a unique identifier for a user given their first and last name. 
\nThe tool requires two parameters:\n- firstName (string)\n- lastName (string)\nLooking at the user's request, they have provided both their first name (Alex) and last name (Yang). So we have all the required parameters to call the unique_id tool.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_012ZLUq5SWsghkXUjsEXsB1f", + "id": "toolu_01Q7wYKoNtpGAphTbRSv51fE", "name": "unique_id", "input": { "firstName": "Alex", @@ -93,7 +93,7 @@ "toolResult": { "result": "123456789", "isError": false, - "id": "toolu_012ZLUq5SWsghkXUjsEXsB1f" + "id": "toolu_01Q7wYKoNtpGAphTbRSv51fE" } } } @@ -119,14 +119,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user is asking to sum the numbers 1 and 1. The relevant tool is the sumNumbers function, which takes two number parameters a and b.\nThe user has directly provided the values for the required parameters:\na = 1 \nb = 1\nSince all the required parameters are provided, we can proceed with calling the function.\n</thinking>" + "text": "<thinking>\nThe user is asking to sum the numbers 1 and 1. The sumNumbers tool is directly relevant for this request.\n\nLooking at the required parameters for sumNumbers:\na (number): The user provided the value 1 for this\nb (number): The user also provided the value 1 for this\n\nSince the user has directly provided values for all the required parameters, we can proceed with calling the sumNumbers tool without needing any additional information from the user.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_01SQD2XkaXkDNLQ2xaFzmguG", + "id": "toolu_01AxEUd1s1UmhPc6Y91LBSmw", "name": "sumNumbers", "input": { "a": 1, @@ -143,7 +143,7 @@ "toolResult": { "result": "2", "isError": false, - "id": "toolu_01SQD2XkaXkDNLQ2xaFzmguG" + "id": "toolu_01AxEUd1s1UmhPc6Y91LBSmw" } } } @@ -159,14 +159,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user has asked for the weather in a specific location, San Francisco. The Weather tool is the relevant function to answer this request, as it returns weather information for a given location.\n\nThe Weather tool requires a single parameter:\n- location (string, required): The user has directly provided the location as \"San Francisco\".\n\nSince the required location parameter has been provided, we have enough information to call the Weather tool.\n</thinking>" + "text": "<thinking>\nThe Weather tool is relevant to answer this question, as it can provide weather information for a specified location.\n\nThe Weather tool requires a \"location\" parameter. The user has directly provided the location of \"San Francisco\" in their request.\n\nSince the required \"location\" parameter has been provided, we can proceed with calling the Weather tool to get the weather information for San Francisco. No other tools are needed.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_011YcKkPygw2woJZrVWRRMwH", + "id": "toolu_01Y6KpbYCFw4XGtvyXEmeTrB", "name": "Weather", "input": { "location": "San Francisco" @@ -201,14 +201,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe unique_id function takes firstName and lastName as required parameters. The user has provided their first name (Alex) and last name (Yang) in the request, so we have all the necessary information to call the function.\n</thinking>" + "text": "<thinking>\nThe unique_id tool looks relevant for answering this request, as it can provide a unique identifier for a user given their first and last name. 
\nThe tool requires two parameters:\n- firstName (string)\n- lastName (string)\nLooking at the user's request, they have provided both their first name (Alex) and last name (Yang). So we have all the required parameters to call the unique_id tool.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_012ZLUq5SWsghkXUjsEXsB1f", + "id": "toolu_01Q7wYKoNtpGAphTbRSv51fE", "name": "unique_id", "input": { "firstName": "Alex", @@ -244,14 +244,14 @@ "content": [ { "type": "text", - "text": "<thinking>\nThe user is asking to sum the numbers 1 and 1. The relevant tool is the sumNumbers function, which takes two number parameters a and b.\nThe user has directly provided the values for the required parameters:\na = 1 \nb = 1\nSince all the required parameters are provided, we can proceed with calling the function.\n</thinking>" + "text": "<thinking>\nThe user is asking to sum the numbers 1 and 1. The sumNumbers tool is directly relevant for this request.\n\nLooking at the required parameters for sumNumbers:\na (number): The user provided the value 1 for this\nb (number): The user also provided the value 1 for this\n\nSince the user has directly provided values for all the required parameters, we can proceed with calling the sumNumbers tool without needing any additional information from the user.\n</thinking>" } ], "role": "assistant", "options": { "toolCall": [ { - "id": "toolu_01SQD2XkaXkDNLQ2xaFzmguG", + "id": "toolu_01AxEUd1s1UmhPc6Y91LBSmw", "name": "sumNumbers", "input": { "a": 1, @@ -271,7 +271,7 @@ "content": [ { "type": "text", - "text": "So 1 + 1 = 2." + "text": "So 1 + 1 = 2" } ], "role": "assistant", diff --git a/packages/llamaindex/e2e/node/snapshot/gpt-4-turbo.snap b/packages/llamaindex/e2e/node/snapshot/gpt-4-turbo.snap index 951442c7b..48f7fc2dd 100644 --- a/packages/llamaindex/e2e/node/snapshot/gpt-4-turbo.snap +++ b/packages/llamaindex/e2e/node/snapshot/gpt-4-turbo.snap @@ -22,7 +22,7 @@ "options": { "toolCall": [ { - "id": "call_Xa2Kxa2zUE073mnougPWzRlh", + "id": "call_UMsqyh51lvDjy2JvMKFoyai3", "name": "Weather", "input": "{\"location\":\"San Jose\"}" } @@ -30,13 +30,13 @@ } }, { - "role": "user", "content": "45 degrees and sunny in San Jose", + "role": "user", "options": { "toolResult": { "result": "45 degrees and sunny in San Jose", "isError": false, - "id": "call_Xa2Kxa2zUE073mnougPWzRlh" + "id": "call_UMsqyh51lvDjy2JvMKFoyai3" } } } @@ -54,7 +54,7 @@ "options": { "toolCall": [ { - "id": "call_Xa2Kxa2zUE073mnougPWzRlh", + "id": "call_UMsqyh51lvDjy2JvMKFoyai3", "name": "Weather", "input": "{\"location\":\"San Jose\"}" } diff --git a/packages/llamaindex/e2e/node/snapshot/llm-anthropic.snap b/packages/llamaindex/e2e/node/snapshot/llm-anthropic.snap index 106104b5e..1e847a7bf 100644 --- a/packages/llamaindex/e2e/node/snapshot/llm-anthropic.snap +++ b/packages/llamaindex/e2e/node/snapshot/llm-anthropic.snap @@ -49,7 +49,7 @@ "id": "PRESERVE_1", "chunk": { "raw": null, - "delta": "Hello", + "delta": "Hello! 
How can", "options": {} } }, @@ -57,55 +57,7 @@ "id": "PRESERVE_1", "chunk": { "raw": null, - "delta": "!", - "options": {} - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "delta": " How", - "options": {} - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "delta": " can", - "options": {} - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "delta": " I", - "options": {} - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "delta": " assist", - "options": {} - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "delta": " you", - "options": {} - } - }, - { - "id": "PRESERVE_1", - "chunk": { - "raw": null, - "delta": " today", + "delta": " I assist you today", "options": {} } }, diff --git a/packages/llamaindex/e2e/node/snapshot/openai_agent_system_prompt.snap b/packages/llamaindex/e2e/node/snapshot/openai_agent_system_prompt.snap index cfcda8739..6a518caa9 100644 --- a/packages/llamaindex/e2e/node/snapshot/openai_agent_system_prompt.snap +++ b/packages/llamaindex/e2e/node/snapshot/openai_agent_system_prompt.snap @@ -30,7 +30,7 @@ "options": { "toolCall": [ { - "id": "call_1SlugFuJ7rhwsmXd5aRnJhPe", + "id": "call_i3rlYaBlvhTf60phYJh8irvZ", "name": "getWeather", "input": "{\"city\":\"San Francisco\"}" } @@ -38,13 +38,13 @@ } }, { - "role": "user", "content": "The weather in San Francisco is 72 degrees", + "role": "user", "options": { "toolResult": { "result": "The weather in San Francisco is 72 degrees", "isError": false, - "id": "call_1SlugFuJ7rhwsmXd5aRnJhPe" + "id": "call_i3rlYaBlvhTf60phYJh8irvZ" } } } @@ -62,7 +62,7 @@ "options": { "toolCall": [ { - "id": "call_1SlugFuJ7rhwsmXd5aRnJhPe", + "id": "call_i3rlYaBlvhTf60phYJh8irvZ", "name": "getWeather", "input": "{\"city\":\"San Francisco\"}" } diff --git a/packages/llamaindex/e2e/node/snapshot/queryEngine_subquestion.snap b/packages/llamaindex/e2e/node/snapshot/queryEngine_subquestion.snap index bcadd820d..8890ca8a8 100644 --- a/packages/llamaindex/e2e/node/snapshot/queryEngine_subquestion.snap +++ b/packages/llamaindex/e2e/node/snapshot/queryEngine_subquestion.snap @@ -13,7 +13,7 @@ "id": "PRESERVE_1", "messages": [ { - "content": "Context information is below.\n---------------------\nBill Gates stole from Apple. 
Steve Jobs stole from Xerox.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: What is Bill Gates' idea\nAnswer:", + "content": "Context information is below.\n---------------------\nBill Gates stole from Apple.\n Steve Jobs stole from Xerox.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: What idea did Bill Gates get?\nAnswer:", "role": "user" } ] @@ -22,7 +22,7 @@ "id": "PRESERVE_2", "messages": [ { - "content": "Context information is below.\n---------------------\nSub question: What is Bill Gates' idea\nResponse: Bill Gates' idea was to steal from Apple.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: What did Bill Gates steal from?\nAnswer:", + "content": "Context information is below.\n---------------------\nSub question: What idea did Bill Gates get?\nResponse: Bill Gates got the idea from Apple.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: What did Bill Gates steal from?\nAnswer:", "role": "user" } ] @@ -34,7 +34,7 @@ "response": { "raw": null, "message": { - "content": "```json\n[\n {\n \"subQuestion\": \"What is Bill Gates' idea\",\n \"toolName\": \"bill_gates_idea\"\n }\n]\n```", + "content": "```json\n[\n {\n \"subQuestion\": \"What idea did Bill Gates get?\",\n \"toolName\": \"bill_gates_idea\"\n }\n]\n```", "role": "assistant", "options": {} } @@ -45,7 +45,7 @@ "response": { "raw": null, "message": { - "content": "Bill Gates' idea was to steal from Apple.", + "content": "Bill Gates got the idea from Apple.", "role": "assistant", "options": {} } @@ -56,7 +56,7 @@ "response": { "raw": null, "message": { - "content": "Bill Gates stole from Apple.", + "content": "Bill Gates stole the idea from Apple.", "role": "assistant", "options": {} } diff --git a/packages/llamaindex/e2e/node/snapshot/react-agent-stream.snap b/packages/llamaindex/e2e/node/snapshot/react-agent-stream.snap index a7cf28444..df21c92f1 100644 --- a/packages/llamaindex/e2e/node/snapshot/react-agent-stream.snap +++ b/packages/llamaindex/e2e/node/snapshot/react-agent-stream.snap @@ -41,7 +41,7 @@ "response": { "raw": null, "message": { - "content": "Thought: I need to use a tool to help me answer the question. \nAction: getWeather\nAction Input: {\"city\": \"San Francisco\"}", + "content": "Thought: I need to use a tool to help me answer the question.\nAction: getWeather\nAction Input: {\"city\": \"San Francisco\"}", "role": "assistant", "options": {} } @@ -177,15 +177,7 @@ "chunk": { "raw": null, "options": {}, - "delta": "." 
- } - }, - { - "id": "PRESERVE_0", - "chunk": { - "raw": null, - "options": {}, - "delta": " \n" + "delta": ".\n" } }, { diff --git a/packages/llamaindex/e2e/node/utils.ts b/packages/llamaindex/e2e/node/utils.ts index 7f216cccb..29bcc6ab9 100644 --- a/packages/llamaindex/e2e/node/utils.ts +++ b/packages/llamaindex/e2e/node/utils.ts @@ -42,7 +42,7 @@ export async function mockLLMEvent( newLLMCompleteMockStorage.llmEventStart.push({ ...event.detail, // @ts-expect-error id is not UUID, but it is fine for testing - id: idMap.get(event.detail.payload.id)!, + id: idMap.get(event.detail.id)!, }); } @@ -50,7 +50,7 @@ export async function mockLLMEvent( newLLMCompleteMockStorage.llmEventEnd.push({ ...event.detail, // @ts-expect-error id is not UUID, but it is fine for testing - id: idMap.get(event.detail.payload.id)!, + id: idMap.get(event.detail.id)!, response: { ...event.detail.response, // hide raw object since it might too big @@ -63,7 +63,7 @@ export async function mockLLMEvent( newLLMCompleteMockStorage.llmEventStream.push({ ...event.detail, // @ts-expect-error id is not UUID, but it is fine for testing - id: idMap.get(event.detail.payload.id)!, + id: idMap.get(event.detail.id)!, chunk: { ...event.detail.chunk, // hide raw object since it might too big diff --git a/packages/llamaindex/src/PromptHelper.ts b/packages/llamaindex/src/PromptHelper.ts index 809a75e6b..411c4487f 100644 --- a/packages/llamaindex/src/PromptHelper.ts +++ b/packages/llamaindex/src/PromptHelper.ts @@ -1,6 +1,6 @@ -import { tokenizers, type Tokenizer } from "@llamaindex/env"; +import { SentenceSplitter } from "@llamaindex/core/node-parser"; +import { type Tokenizer, tokenizers } from "@llamaindex/env"; import type { SimplePrompt } from "./Prompt.js"; -import { SentenceSplitter } from "./TextSplitter.js"; import { DEFAULT_CHUNK_OVERLAP_RATIO, DEFAULT_CONTEXT_WINDOW, @@ -107,8 +107,7 @@ export class PromptHelper { throw new Error("Got 0 as available chunk size"); } const chunkOverlap = this.chunkOverlapRatio * chunkSize; - const textSplitter = new SentenceSplitter({ chunkSize, chunkOverlap }); - return textSplitter; + return new SentenceSplitter({ chunkSize, chunkOverlap }); } /** diff --git a/packages/llamaindex/src/ServiceContext.ts b/packages/llamaindex/src/ServiceContext.ts index be0dcb394..8cb24c94c 100644 --- a/packages/llamaindex/src/ServiceContext.ts +++ b/packages/llamaindex/src/ServiceContext.ts @@ -1,10 +1,12 @@ import type { BaseEmbedding } from "@llamaindex/core/embeddings"; import type { LLM } from "@llamaindex/core/llms"; +import { + type NodeParser, + SentenceSplitter, +} from "@llamaindex/core/node-parser"; import { PromptHelper } from "./PromptHelper.js"; import { OpenAIEmbedding } from "./embeddings/OpenAIEmbedding.js"; import { OpenAI } from "./llm/openai.js"; -import { SimpleNodeParser } from "./nodeParsers/SimpleNodeParser.js"; -import type { NodeParser } from "./nodeParsers/types.js"; /** * The ServiceContext is a collection of components that are used in different parts of the application. @@ -33,7 +35,7 @@ export function serviceContextFromDefaults(options?: ServiceContextOptions) { embedModel: options?.embedModel ?? new OpenAIEmbedding(), nodeParser: options?.nodeParser ?? 
- new SimpleNodeParser({ + new SentenceSplitter({ chunkSize: options?.chunkSize, chunkOverlap: options?.chunkOverlap, }), diff --git a/packages/llamaindex/src/Settings.ts b/packages/llamaindex/src/Settings.ts index 4adc6dcd2..0fb712d6c 100644 --- a/packages/llamaindex/src/Settings.ts +++ b/packages/llamaindex/src/Settings.ts @@ -5,10 +5,13 @@ import { import { OpenAI } from "./llm/openai.js"; import { PromptHelper } from "./PromptHelper.js"; -import { SimpleNodeParser } from "./nodeParsers/SimpleNodeParser.js"; import type { BaseEmbedding } from "@llamaindex/core/embeddings"; import type { LLM } from "@llamaindex/core/llms"; +import { + type NodeParser, + SentenceSplitter, +} from "@llamaindex/core/node-parser"; import { AsyncLocalStorage, getEnv } from "@llamaindex/env"; import type { ServiceContext } from "./ServiceContext.js"; import { @@ -16,7 +19,6 @@ import { setEmbeddedModel, withEmbeddedModel, } from "./internal/settings/EmbedModel.js"; -import type { NodeParser } from "./nodeParsers/types.js"; export type PromptConfig = { llm?: string; @@ -108,7 +110,7 @@ class GlobalSettings implements Config { get nodeParser(): NodeParser { if (this.#nodeParser === null) { - this.#nodeParser = new SimpleNodeParser({ + this.#nodeParser = new SentenceSplitter({ chunkSize: this.chunkSize, chunkOverlap: this.chunkOverlap, }); diff --git a/packages/llamaindex/src/TextSplitter.ts b/packages/llamaindex/src/TextSplitter.ts deleted file mode 100644 index f8e01121b..000000000 --- a/packages/llamaindex/src/TextSplitter.ts +++ /dev/null @@ -1,309 +0,0 @@ -import { EOL, tokenizers, type Tokenizer } from "@llamaindex/env"; -// GitHub translated -import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants.js"; - -class TextSplit { - textChunk: string; - numCharOverlap: number | undefined; - - constructor( - textChunk: string, - numCharOverlap: number | undefined = undefined, - ) { - this.textChunk = textChunk; - this.numCharOverlap = numCharOverlap; - } -} - -type SplitRep = { text: string; numTokens: number }; - -const defaultregex = /[.?!][\])'"`’”]*(?:\s|$)/g; -export const defaultSentenceTokenizer = (text: string): string[] => { - const slist = []; - const iter = text.matchAll(defaultregex); - let lastIdx = 0; - for (const match of iter) { - slist.push(text.slice(lastIdx, match.index! + 1)); - lastIdx = match.index! + 1; - } - slist.push(text.slice(lastIdx)); - return slist.filter((s) => s.length > 0); -}; - -// Refs: https://github.com/fxsjy/jieba/issues/575#issuecomment-359637511 -const resentencesp = - /([﹒﹔﹖﹗.;。！？]["’”」』]{0,2}|:(?=["‘“「『]{1,2}|$))/; -/** - * Tokenizes sentences. Suitable for Chinese, Japanese, and Korean. Use instead of `defaultSentenceTokenizer`. - * @param text - * @returns string[] - */ -export function cjkSentenceTokenizer(sentence: string): string[] { - const slist = []; - const parts = sentence.split(resentencesp); - - for (let i = 0; i < parts.length; i++) { - const part = parts[i]; - if (resentencesp.test(part) && slist.length > 0) { - slist[slist.length - 1] += part; - } else if (part) { - slist.push(part); - } - } - - return slist.filter((s) => s.length > 0); -} - -export const defaultParagraphSeparator = EOL + EOL + EOL; - -// In theory there's also Mac style \r only, but it's pre-OSX and I don't think -// many documents will use it. - -/** - * SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap.
- * - * One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together. - */ -export class SentenceSplitter { - public chunkSize: number; - public chunkOverlap: number; - - private tokenizer: Tokenizer; - private paragraphSeparator: string; - private chunkingTokenizerFn: (text: string) => string[]; - private splitLongSentences: boolean; - - constructor(options?: { - chunkSize?: number; - chunkOverlap?: number; - tokenizer?: Tokenizer; - paragraphSeparator?: string; - chunkingTokenizerFn?: (text: string) => string[]; - splitLongSentences?: boolean; - }) { - const { - chunkSize = DEFAULT_CHUNK_SIZE, - chunkOverlap = DEFAULT_CHUNK_OVERLAP, - tokenizer = null, - paragraphSeparator = defaultParagraphSeparator, - chunkingTokenizerFn, - splitLongSentences = false, - } = options ?? {}; - - if (chunkOverlap > chunkSize) { - throw new Error( - `Got a larger chunk overlap (${chunkOverlap}) than chunk size (${chunkSize}), should be smaller.`, - ); - } - this.chunkSize = chunkSize; - this.chunkOverlap = chunkOverlap; - - this.tokenizer = tokenizer ?? tokenizers.tokenizer(); - - this.paragraphSeparator = paragraphSeparator; - this.chunkingTokenizerFn = chunkingTokenizerFn ?? defaultSentenceTokenizer; - this.splitLongSentences = splitLongSentences; - } - - private getEffectiveChunkSize(extraInfoStr?: string): number { - // get "effective" chunk size by removing the metadata - let effectiveChunkSize; - if (extraInfoStr != undefined) { - const numExtraTokens = - this.tokenizer.encode(`${extraInfoStr}\n\n`).length + 1; - effectiveChunkSize = this.chunkSize - numExtraTokens; - if (effectiveChunkSize <= 0) { - throw new Error( - "Effective chunk size is non positive after considering extra_info", - ); - } - } else { - effectiveChunkSize = this.chunkSize; - } - return effectiveChunkSize; - } - - getParagraphSplits(text: string, effectiveChunkSize?: number): string[] { - // get paragraph splits - const paragraphSplits: string[] = text.split(this.paragraphSeparator); - let idx = 0; - if (effectiveChunkSize == undefined) { - return paragraphSplits; - } - - // merge paragraphs that are too small - while (idx < paragraphSplits.length) { - if ( - idx < paragraphSplits.length - 1 && - paragraphSplits[idx].length < effectiveChunkSize - ) { - paragraphSplits[idx] = [ - paragraphSplits[idx], - paragraphSplits[idx + 1], - ].join(this.paragraphSeparator); - paragraphSplits.splice(idx + 1, 1); - } else { - idx += 1; - } - } - return paragraphSplits; - } - - getSentenceSplits(text: string, effectiveChunkSize?: number): string[] { - const paragraphSplits = this.getParagraphSplits(text, effectiveChunkSize); - // Next we split the text using the chunk tokenizer fn/ - const splits = []; - for (const parText of paragraphSplits) { - const sentenceSplits = this.chunkingTokenizerFn(parText); - - if (!sentenceSplits) { - continue; - } - - for (const sentence_split of sentenceSplits) { - splits.push(sentence_split.trim()); - } - } - return splits; - } - - /** - * Splits sentences into chunks if necessary. - * - * This isn't great behavior because it can split down the middle of a - * word or in non-English split down the middle of a Unicode codepoint - * so the splitting is turned off by default. If you need it, please - * set the splitLongSentences option to true. 
- * @param sentenceSplits - * @param effectiveChunkSize - * @returns - */ - private processSentenceSplits( - sentenceSplits: string[], - effectiveChunkSize: number, - ): SplitRep[] { - if (!this.splitLongSentences) { - return sentenceSplits.map((split) => ({ - text: split, - numTokens: this.tokenizer.encode(split).length, - })); - } - - const newSplits: SplitRep[] = []; - for (const split of sentenceSplits) { - const splitTokens = this.tokenizer.encode(split); - const splitLen = splitTokens.length; - if (splitLen <= effectiveChunkSize) { - newSplits.push({ text: split, numTokens: splitLen }); - } else { - for (let i = 0; i < splitLen; i += effectiveChunkSize) { - const cur_split = this.tokenizer.decode( - splitTokens.slice(i, i + effectiveChunkSize), - ); - newSplits.push({ text: cur_split, numTokens: effectiveChunkSize }); - } - } - } - return newSplits; - } - - combineTextSplits( - newSentenceSplits: SplitRep[], - effectiveChunkSize: number, - ): TextSplit[] { - // go through sentence splits, combine to chunks that are within the chunk size - - // docs represents final list of text chunks - const docs: TextSplit[] = []; - // curChunkSentences represents the current list of sentence splits (that) - // will be merged into a chunk - let curChunkSentences: SplitRep[] = []; - let curChunkTokens = 0; - - for (let i = 0; i < newSentenceSplits.length; i++) { - // if adding newSentenceSplits[i] to curDocBuffer would exceed effectiveChunkSize, - // then we need to add the current curDocBuffer to docs - if ( - curChunkTokens + newSentenceSplits[i].numTokens > - effectiveChunkSize - ) { - if (curChunkSentences.length > 0) { - // push curent doc list to docs - docs.push( - new TextSplit( - curChunkSentences - .map((sentence) => sentence.text) - .join(" ") - .trim(), - ), - ); - } - - const lastChunkSentences = curChunkSentences; - - // reset docs list - curChunkTokens = 0; - curChunkSentences = []; - - // add the last sentences from the last chunk until we've hit the overlap - // do it in reverse order - for (let j = lastChunkSentences.length - 1; j >= 0; j--) { - if ( - curChunkTokens + lastChunkSentences[j].numTokens > - this.chunkOverlap - ) { - break; - } - curChunkSentences.unshift(lastChunkSentences[j]); - curChunkTokens += lastChunkSentences[j].numTokens + 1; - } - } - - curChunkSentences.push(newSentenceSplits[i]); - curChunkTokens += newSentenceSplits[i].numTokens + 1; - } - docs.push( - new TextSplit( - curChunkSentences - .map((sentence) => sentence.text) - .join(" ") - .trim(), - ), - ); - return docs; - } - - splitTextWithOverlaps(text: string, extraInfoStr?: string): TextSplit[] { - // Split incoming text and return chunks with overlap size. - // Has a preference for complete sentences, phrases, and minimal overlap. - - // here is the typescript code (skip callback manager) - if (text == "") { - return []; - } - - const effectiveChunkSize = this.getEffectiveChunkSize(extraInfoStr); - const sentenceSplits = this.getSentenceSplits(text, effectiveChunkSize); - - // Check if any sentences exceed the chunk size. 
If they don't, - // force split by tokenizer - const newSentenceSplits = this.processSentenceSplits( - sentenceSplits, - effectiveChunkSize, - ); - - // combine sentence splits into chunks of text that can then be returned - const combinedTextSplits = this.combineTextSplits( - newSentenceSplits, - effectiveChunkSize, - ); - - return combinedTextSplits; - } - - splitText(text: string, extraInfoStr?: string): string[] { - const text_splits = this.splitTextWithOverlaps(text); - const chunks = text_splits.map((text_split) => text_split.textChunk); - return chunks; - } -} diff --git a/packages/llamaindex/src/cloud/LlamaCloudIndex.ts b/packages/llamaindex/src/cloud/LlamaCloudIndex.ts index ab39f86b5..3353e6b84 100644 --- a/packages/llamaindex/src/cloud/LlamaCloudIndex.ts +++ b/packages/llamaindex/src/cloud/LlamaCloudIndex.ts @@ -11,10 +11,10 @@ import type { CloudConstructorParams } from "./constants.js"; import { getAppBaseUrl, initService } from "./utils.js"; import { PipelinesService, ProjectsService } from "@llamaindex/cloud/api"; +import { SentenceSplitter } from "@llamaindex/core/node-parser"; import { getEnv } from "@llamaindex/env"; import { Settings } from "../Settings.js"; import { OpenAIEmbedding } from "../embeddings/OpenAIEmbedding.js"; -import { SimpleNodeParser } from "../nodeParsers/SimpleNodeParser.js"; export class LlamaCloudIndex { params: CloudConstructorParams; @@ -147,13 +147,13 @@ export class LlamaCloudIndex { static async fromDocuments( params: { documents: Document[]; - transformations?: TransformComponent<any>[]; + transformations?: TransformComponent[]; verbose?: boolean; } & CloudConstructorParams, ): Promise<LlamaCloudIndex> { initService(params); - const defaultTransformations: TransformComponent<any>[] = [ - new SimpleNodeParser(), + const defaultTransformations: TransformComponent[] = [ + new SentenceSplitter(), new OpenAIEmbedding({ apiKey: getEnv("OPENAI_API_KEY"), }), diff --git a/packages/llamaindex/src/cloud/config.ts b/packages/llamaindex/src/cloud/config.ts index b61212292..0a562fd58 100644 --- a/packages/llamaindex/src/cloud/config.ts +++ b/packages/llamaindex/src/cloud/config.ts @@ -3,26 +3,26 @@ import type { PipelineCreate, PipelineType, } from "@llamaindex/cloud/api"; +import { SentenceSplitter } from "@llamaindex/core/node-parser"; import { BaseNode, type TransformComponent } from "@llamaindex/core/schema"; import { OpenAIEmbedding } from "../embeddings/OpenAIEmbedding.js"; -import { SimpleNodeParser } from "../nodeParsers/SimpleNodeParser.js"; export type GetPipelineCreateParams = { pipelineName: string; pipelineType: PipelineType; - transformations?: TransformComponent<any>[]; + transformations?: TransformComponent[]; inputNodes?: BaseNode[]; }; function getTransformationConfig( - transformation: TransformComponent<any>, + transformation: TransformComponent, ): ConfiguredTransformationItem { - if (transformation instanceof SimpleNodeParser) { + if (transformation instanceof SentenceSplitter) { return { configurable_transformation_type: "SENTENCE_AWARE_NODE_PARSER", component: { - chunk_size: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter - chunk_overlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter + chunk_size: transformation.chunkSize, // TODO: set to public in SentenceSplitter + chunk_overlap: transformation.chunkOverlap, // TODO: set to public in SentenceSplitter include_metadata: transformation.includeMetadata, include_prev_next_rel: transformation.includePrevNextRel, 
}, diff --git a/packages/llamaindex/src/extractors/types.ts b/packages/llamaindex/src/extractors/types.ts index 7b5063f23..d0af55f88 100644 --- a/packages/llamaindex/src/extractors/types.ts +++ b/packages/llamaindex/src/extractors/types.ts @@ -5,7 +5,7 @@ import { defaultNodeTextTemplate } from "./prompts.js"; /* * Abstract class for all extractors. */ -export abstract class BaseExtractor implements TransformComponent<any> { +export abstract class BaseExtractor implements TransformComponent { isTextNodeOnly: boolean = true; showProgress: boolean = true; metadataMode: MetadataMode = MetadataMode.ALL; diff --git a/packages/llamaindex/src/index.edge.ts b/packages/llamaindex/src/index.edge.ts index c76d3ea08..ebc8807fd 100644 --- a/packages/llamaindex/src/index.edge.ts +++ b/packages/llamaindex/src/index.edge.ts @@ -50,6 +50,5 @@ export * from "./ServiceContext.js"; export { Settings } from "./Settings.js"; export * from "./storage/StorageContext.js"; export * from "./synthesizers/index.js"; -export * from "./TextSplitter.js"; export * from "./tools/index.js"; export * from "./types.js"; diff --git a/packages/llamaindex/src/ingestion/IngestionCache.ts b/packages/llamaindex/src/ingestion/IngestionCache.ts index 05adcd25c..7dea514c2 100644 --- a/packages/llamaindex/src/ingestion/IngestionCache.ts +++ b/packages/llamaindex/src/ingestion/IngestionCache.ts @@ -5,7 +5,7 @@ import { docToJson, jsonToDoc } from "../storage/docStore/utils.js"; import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js"; import type { BaseKVStore } from "../storage/kvStore/types.js"; -const transformToJSON = (obj: TransformComponent<any>) => { +const transformToJSON = (obj: TransformComponent) => { const seen: any[] = []; const replacer = (key: string, value: any) => { @@ -26,7 +26,7 @@ const transformToJSON = (obj: TransformComponent<any>) => { export function getTransformationHash( nodes: BaseNode[], - transform: TransformComponent<any>, + transform: TransformComponent, ) { const nodesStr: string = nodes .map((node) => node.getContent(MetadataMode.ALL)) diff --git a/packages/llamaindex/src/ingestion/IngestionPipeline.ts b/packages/llamaindex/src/ingestion/IngestionPipeline.ts index e2191a687..c7174aa17 100644 --- a/packages/llamaindex/src/ingestion/IngestionPipeline.ts +++ b/packages/llamaindex/src/ingestion/IngestionPipeline.ts @@ -26,12 +26,12 @@ type IngestionRunArgs = { type TransformRunArgs = { inPlace?: boolean; cache?: IngestionCache; - docStoreStrategy?: TransformComponent<any>; + docStoreStrategy?: TransformComponent; }; export async function runTransformations( nodesToRun: BaseNode[], - transformations: TransformComponent<any>[], + transformations: TransformComponent[], transformOptions: any = {}, { inPlace = true, cache, docStoreStrategy }: TransformRunArgs = {}, ): Promise<BaseNode[]> { @@ -60,7 +60,7 @@ export async function runTransformations( } export class IngestionPipeline { - transformations: TransformComponent<any>[] = []; + transformations: TransformComponent[] = []; documents?: Document[]; reader?: BaseReader; vectorStore?: VectorStore; @@ -70,7 +70,7 @@ export class IngestionPipeline { cache?: IngestionCache; disableCache: boolean = false; - private _docStoreStrategy?: TransformComponent<any>; + private _docStoreStrategy?: TransformComponent; constructor(init?: Partial<IngestionPipeline>) { Object.assign(this, init); diff --git a/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts b/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts index 
679256a53..5755370bb 100644 --- a/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts +++ b/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts @@ -4,7 +4,7 @@ import type { BaseDocumentStore } from "../../storage/docStore/types.js"; /** * Handle doc store duplicates by checking all hashes. */ -export class DuplicatesStrategy implements TransformComponent<any> { +export class DuplicatesStrategy implements TransformComponent { private docStore: BaseDocumentStore; constructor(docStore: BaseDocumentStore) { diff --git a/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts b/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts index 700c23f00..561c522a0 100644 --- a/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts +++ b/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts @@ -7,7 +7,7 @@ import { classify } from "./classify.js"; * Handle docstore upserts by checking hashes and ids. * Identify missing docs and delete them from docstore and vector store */ -export class UpsertsAndDeleteStrategy implements TransformComponent<any> { +export class UpsertsAndDeleteStrategy implements TransformComponent { protected docStore: BaseDocumentStore; protected vectorStores?: VectorStore[]; diff --git a/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts b/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts index cc30716a1..69ae36beb 100644 --- a/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts +++ b/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts @@ -6,7 +6,7 @@ import { classify } from "./classify.js"; /** * Handles doc store upserts by checking hashes and ids. */ -export class UpsertsStrategy implements TransformComponent<any> { +export class UpsertsStrategy implements TransformComponent { protected docStore: BaseDocumentStore; protected vectorStores?: VectorStore[]; diff --git a/packages/llamaindex/src/ingestion/strategies/index.ts b/packages/llamaindex/src/ingestion/strategies/index.ts index 6e2c7ecbe..96765a758 100644 --- a/packages/llamaindex/src/ingestion/strategies/index.ts +++ b/packages/llamaindex/src/ingestion/strategies/index.ts @@ -19,7 +19,7 @@ export enum DocStoreStrategy { NONE = "none", // no-op strategy } -class NoOpStrategy implements TransformComponent<any> { +class NoOpStrategy implements TransformComponent { async transform(nodes: any[]): Promise<any[]> { return nodes; } @@ -29,7 +29,7 @@ export function createDocStoreStrategy( docStoreStrategy: DocStoreStrategy, docStore?: BaseDocumentStore, vectorStores: VectorStore[] = [], -): TransformComponent<any> { +): TransformComponent { if (docStoreStrategy === DocStoreStrategy.NONE) { return new NoOpStrategy(); } diff --git a/packages/llamaindex/src/nodeParsers/MarkdownNodeParser.ts b/packages/llamaindex/src/nodeParsers/MarkdownNodeParser.ts deleted file mode 100644 index 4ac1127eb..000000000 --- a/packages/llamaindex/src/nodeParsers/MarkdownNodeParser.ts +++ /dev/null @@ -1,109 +0,0 @@ -import type { BaseNode, Metadata } from "@llamaindex/core/schema"; -import { MetadataMode, TextNode } from "@llamaindex/core/schema"; -import type { NodeParser } from "./types.js"; - -export class MarkdownNodeParser implements NodeParser { - includeMetadata: boolean; - includePrevNextRel: boolean; - - constructor(init?: { - includeMetadata?: boolean; - includePrevNextRel?: boolean; - }) { - this.includeMetadata = init?.includeMetadata ?? 
true; - this.includePrevNextRel = init?.includePrevNextRel ?? true; - } - - async transform(nodes: BaseNode[], _options?: any): Promise<BaseNode[]> { - return this.getNodesFromDocuments(nodes); - } - - static fromDefaults(init?: { - includeMetadata?: boolean; - includePrevNextRel?: boolean; - }): MarkdownNodeParser { - return new MarkdownNodeParser(init); - } - - buildNodeFromSplit( - textSplit: string, - node: BaseNode<Metadata>, - metadata: Metadata, - ): BaseNode<Metadata> { - const newNode = new TextNode({ - text: textSplit, - relationships: { - PARENT: [ - { - ...node, - nodeId: node.id_, - }, - ], - }, - metadata: this.includeMetadata ? metadata : {}, - }); - return newNode; - } - - updateMetadata( - headersMetadata: Metadata, - newHeader: string, - newHeaderLevel: number, - ): Metadata { - const updatedHeaders: Metadata = {}; - for (let i = 1; i < newHeaderLevel; i++) { - const key = `Header ${i}`; - if (key in headersMetadata) { - updatedHeaders[key] = headersMetadata[key]; - } - } - updatedHeaders[`Header ${newHeaderLevel}`] = newHeader; - return updatedHeaders; - } - - getNodesFromNode(node: BaseNode<Metadata>): BaseNode<Metadata>[] { - const text = node.getContent(MetadataMode.NONE); - const markdownNodes: BaseNode<Metadata>[] = []; - const lines = text.split("\n"); - let metadata: Metadata = {}; - let codeBlock = false; - let currentSection = ""; - - for (const line of lines) { - if (line.startsWith("```")) { - codeBlock = !codeBlock; - } - const headerMatch = line.match(/^(#+)\s(.*)/); - if (headerMatch && !codeBlock) { - if (currentSection !== "") { - markdownNodes.push( - this.buildNodeFromSplit(currentSection.trim(), node, metadata), - ); - } - metadata = this.updateMetadata( - metadata, - headerMatch[2], - headerMatch[1].length, - ); - currentSection = `${headerMatch[2]}\n`; - } else { - currentSection += line + "\n"; - } - } - - markdownNodes.push( - this.buildNodeFromSplit(currentSection.trim(), node, metadata), - ); - - return markdownNodes; - } - - getNodesFromDocuments(documents: BaseNode<Metadata>[]): BaseNode<Metadata>[] { - let allNodes: BaseNode<Metadata>[] = []; - for (const node of documents) { - const nodes = this.getNodesFromNode(node); - allNodes = allNodes.concat(nodes); - } - return allNodes; - } -} diff --git a/packages/llamaindex/src/nodeParsers/SentenceWindowNodeParser.ts b/packages/llamaindex/src/nodeParsers/SentenceWindowNodeParser.ts deleted file mode 100644 index 98a049f56..000000000 --- a/packages/llamaindex/src/nodeParsers/SentenceWindowNodeParser.ts +++ /dev/null @@ -1,89 +0,0 @@ -import type { BaseNode } from "@llamaindex/core/schema"; -import { SentenceSplitter } from "../TextSplitter.js"; -import type { NodeParser } from "./types.js"; -import { getNodesFromDocument } from "./utils.js"; - -export const DEFAULT_WINDOW_SIZE = 3; -export const DEFAULT_WINDOW_METADATA_KEY = "window"; -export const DEFAULT_OG_TEXT_METADATA_KEY = "original_text"; - -export class SentenceWindowNodeParser implements NodeParser { - /** - * The text splitter to use. - */ - textSplitter: SentenceSplitter; - /** - * The number of sentences on each side of a sentence to capture. - */ - windowSize: number = DEFAULT_WINDOW_SIZE; - /** - * The metadata key to store the sentence window under. - */ - windowMetadataKey: string = DEFAULT_WINDOW_METADATA_KEY; - /** - * The metadata key to store the original sentence in. - */ - originalTextMetadataKey: string = DEFAULT_OG_TEXT_METADATA_KEY; - /** - * Whether to include metadata in the nodes. 
- */ - includeMetadata: boolean = true; - /** - * Whether to include previous and next relationships in the nodes. - */ - includePrevNextRel: boolean = true; - - constructor(init?: Partial<SentenceWindowNodeParser>) { - Object.assign(this, init); - this.textSplitter = init?.textSplitter ?? new SentenceSplitter(); - } - - static fromDefaults( - init?: Partial<SentenceWindowNodeParser>, - ): SentenceWindowNodeParser { - return new SentenceWindowNodeParser(init); - } - - async transform(nodes: BaseNode[], _options?: any): Promise<BaseNode[]> { - return this.getNodesFromDocuments(nodes); - } - - getNodesFromDocuments(documents: BaseNode[]) { - return documents - .map((document) => this.buildWindowNodesFromDocument(document)) - .flat(); - } - - protected buildWindowNodesFromDocument(doc: BaseNode): BaseNode[] { - const nodes = getNodesFromDocument( - doc, - this.textSplitter.getSentenceSplits.bind(this.textSplitter), - this.includeMetadata, - this.includePrevNextRel, - ); - - for (let i = 0; i < nodes.length; i++) { - const node = nodes[i]; - const windowNodes = nodes.slice( - Math.max(0, i - this.windowSize), - Math.min(i + this.windowSize + 1, nodes.length), - ); - - node.metadata[this.windowMetadataKey] = windowNodes - .map((n) => n.getText()) - .join(" "); - node.metadata[this.originalTextMetadataKey] = node.getText(); - - node.excludedEmbedMetadataKeys.push( - this.windowMetadataKey, - this.originalTextMetadataKey, - ); - node.excludedLlmMetadataKeys.push( - this.windowMetadataKey, - this.originalTextMetadataKey, - ); - } - - return nodes; - } -} diff --git a/packages/llamaindex/src/nodeParsers/SimpleNodeParser.ts b/packages/llamaindex/src/nodeParsers/SimpleNodeParser.ts deleted file mode 100644 index 4cbd4eb20..000000000 --- a/packages/llamaindex/src/nodeParsers/SimpleNodeParser.ts +++ /dev/null @@ -1,72 +0,0 @@ -import type { BaseNode } from "@llamaindex/core/schema"; -import { SentenceSplitter } from "../TextSplitter.js"; -import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "../constants.js"; -import type { NodeParser } from "./types.js"; -import { getNodesFromDocument } from "./utils.js"; - -/** - * SimpleNodeParser is the default NodeParser. It splits documents into TextNodes using a splitter, by default SentenceSplitter - */ -export class SimpleNodeParser implements NodeParser { - /** - * The text splitter to use. - */ - textSplitter: SentenceSplitter; - /** - * Whether to include metadata in the nodes. - */ - includeMetadata: boolean; - /** - * Whether to include previous and next relationships in the nodes. - */ - includePrevNextRel: boolean; - - constructor(init?: { - textSplitter?: SentenceSplitter; - includeMetadata?: boolean; - includePrevNextRel?: boolean; - chunkSize?: number; - chunkOverlap?: number; - splitLongSentences?: boolean; - }) { - this.textSplitter = - init?.textSplitter ?? - new SentenceSplitter({ - chunkSize: init?.chunkSize ?? DEFAULT_CHUNK_SIZE, - chunkOverlap: init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP, - splitLongSentences: init?.splitLongSentences ?? false, - }); - this.includeMetadata = init?.includeMetadata ?? true; - this.includePrevNextRel = init?.includePrevNextRel ?? 
true; - } - - async transform(nodes: BaseNode[], _options?: any): Promise<BaseNode[]> { - return this.getNodesFromDocuments(nodes); - } - - static fromDefaults(init?: { - chunkSize?: number; - chunkOverlap?: number; - includeMetadata?: boolean; - includePrevNextRel?: boolean; - }): SimpleNodeParser { - return new SimpleNodeParser(init); - } - - /** - * Generate Node objects from documents - * @param documents - */ - getNodesFromDocuments(documents: BaseNode[]) { - return documents - .map((document) => - getNodesFromDocument( - document, - this.textSplitter.splitText.bind(this.textSplitter), - this.includeMetadata, - this.includePrevNextRel, - ), - ) - .flat(); - } -} diff --git a/packages/llamaindex/src/nodeParsers/index.ts b/packages/llamaindex/src/nodeParsers/index.ts index 083fc4a55..63eab7354 100644 --- a/packages/llamaindex/src/nodeParsers/index.ts +++ b/packages/llamaindex/src/nodeParsers/index.ts @@ -1,4 +1 @@ -export * from "./MarkdownNodeParser.js"; -export * from "./SentenceWindowNodeParser.js"; -export * from "./SimpleNodeParser.js"; -export * from "./types.js"; +export * from "@llamaindex/core/node-parser"; diff --git a/packages/llamaindex/src/nodeParsers/types.ts b/packages/llamaindex/src/nodeParsers/types.ts deleted file mode 100644 index 05828addd..000000000 --- a/packages/llamaindex/src/nodeParsers/types.ts +++ /dev/null @@ -1,13 +0,0 @@ -import type { BaseNode, TransformComponent } from "@llamaindex/core/schema"; - -/** - * A NodeParser generates Nodes from Documents - */ -export interface NodeParser extends TransformComponent<any> { - /** - * Generates an array of nodes from an array of documents. - * @param documents - The documents to generate nodes from. - * @returns An array of nodes. - */ - getNodesFromDocuments(documents: BaseNode[]): BaseNode[]; -} diff --git a/packages/llamaindex/src/nodeParsers/utils.ts b/packages/llamaindex/src/nodeParsers/utils.ts deleted file mode 100644 index 18f1f24a1..000000000 --- a/packages/llamaindex/src/nodeParsers/utils.ts +++ /dev/null @@ -1,79 +0,0 @@ -import type { BaseNode } from "@llamaindex/core/schema"; -import { - Document, - ImageDocument, - NodeRelationship, - TextNode, -} from "@llamaindex/core/schema"; -import _ from "lodash"; - -type TextSplitter = (s: string) => string[]; - -/** - * Splits the text of a document into smaller parts. - * @param document - The document to split. - * @param textSplitter - The text splitter to use. - * @returns An array of text splits. - */ -function getTextSplitsFromDocument( - document: Document, - textSplitter: TextSplitter, -) { - const text = document.getText(); - return textSplitter(text); -} - -/** - * Generates an array of nodes from a document. - * @param doc - * @param textSplitter - The text splitter to use. - * @param includeMetadata - Whether to include metadata in the nodes. - * @param includePrevNextRel - Whether to include previous and next relationships in the nodes. - * @returns An array of nodes. 
- */ -export function getNodesFromDocument( - doc: BaseNode, - textSplitter: TextSplitter, - includeMetadata: boolean = true, - includePrevNextRel: boolean = true, -): TextNode[] { - if (doc instanceof ImageDocument) { - // TODO: use text splitter on text of image documents - return [doc]; - } - if (!(doc instanceof Document)) { - throw new Error("Expected either an Image Document or Document"); - } - const document = doc as Document; - const nodes: TextNode[] = []; - - const textSplits = getTextSplitsFromDocument(document, textSplitter); - - textSplits.forEach((textSplit) => { - const node = new TextNode({ - text: textSplit, - metadata: includeMetadata ? _.cloneDeep(document.metadata) : {}, - excludedEmbedMetadataKeys: _.cloneDeep( - document.excludedEmbedMetadataKeys, - ), - excludedLlmMetadataKeys: _.cloneDeep(document.excludedLlmMetadataKeys), - }); - node.relationships[NodeRelationship.SOURCE] = document.asRelatedNodeInfo(); - nodes.push(node); - }); - - if (includePrevNextRel) { - nodes.forEach((node, index) => { - if (index > 0) { - node.relationships[NodeRelationship.PREVIOUS] = - nodes[index - 1].asRelatedNodeInfo(); - } - if (index < nodes.length - 1) { - node.relationships[NodeRelationship.NEXT] = - nodes[index + 1].asRelatedNodeInfo(); - } - }); - } - - return nodes; -} diff --git a/packages/llamaindex/tests/ingestion/IngestionCache.test.ts b/packages/llamaindex/tests/ingestion/IngestionCache.test.ts index 5af8b3e70..6d6f63db3 100644 --- a/packages/llamaindex/tests/ingestion/IngestionCache.test.ts +++ b/packages/llamaindex/tests/ingestion/IngestionCache.test.ts @@ -28,7 +28,7 @@ describe("IngestionCache", () => { }); describe("getTransformationHash", () => { - let nodes: BaseNode[], transform: TransformComponent<any>; + let nodes: BaseNode[], transform: TransformComponent; beforeAll(() => { nodes = [new TextNode({ text: "some text", id_: "some id" })]; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a1716a3ce..03cf0f81b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -378,6 +378,9 @@ importers: bunchee: specifier: 5.3.0-beta.0 version: 5.3.0-beta.0(typescript@5.5.3) + natural: + specifier: ^7.1.0 + version: 7.1.0(@aws-sdk/credential-providers@3.613.0) packages/core/tests: devDependencies: @@ -3111,6 +3114,35 @@ packages: '@types/react': optional: true + '@redis/bloom@1.2.0': + resolution: {integrity: sha512-HG2DFjYKbpNmVXsa0keLHp/3leGJz1mjh09f2RLGGLQZzSHpkmZWuwJbAvo3QcRY8p80m5+ZdXZdYOSBLlp7Cg==} + peerDependencies: + '@redis/client': ^1.0.0 + + '@redis/client@1.5.17': + resolution: {integrity: sha512-IPvU9A31qRCZ7lds/x+ksuK/UMndd0EASveAvCvEtFFKIZjZ+m/a4a0L7S28KEWoR5ka8526hlSghDo4Hrc2Hg==} + engines: {node: '>=14'} + + '@redis/graph@1.1.1': + resolution: {integrity: sha512-FEMTcTHZozZciLRl6GiiIB4zGm5z5F3F6a6FZCyrfxdKOhFlGkiAqlexWMBzCi4DcRoyiOsuLfW+cjlGWyExOw==} + peerDependencies: + '@redis/client': ^1.0.0 + + '@redis/json@1.0.6': + resolution: {integrity: sha512-rcZO3bfQbm2zPRpqo82XbW8zg4G/w4W3tI7X8Mqleq9goQjAGLL7q/1n1ZX4dXEAmORVZ4s1+uKLaUOg7LrUhw==} + peerDependencies: + '@redis/client': ^1.0.0 + + '@redis/search@1.1.6': + resolution: {integrity: sha512-mZXCxbTYKBQ3M2lZnEddwEAks0Kc7nauire8q20oA0oA/LoA+E/b5Y5KZn232ztPb1FkIGqo12vh3Lf+Vw5iTw==} + peerDependencies: + '@redis/client': ^1.0.0 + + '@redis/time-series@1.0.5': + resolution: {integrity: sha512-IFjIgTusQym2B5IZJG3XKr5llka7ey84fw/NOYqESP5WUfQs9zz1ww/9+qoz4ka/S6KcGBodzlCeZ5UImKbscg==} + peerDependencies: + '@redis/client': ^1.0.0 + '@rollup/plugin-commonjs@25.0.8': resolution: {integrity: 
sha512-ZEZWTK5n6Qde0to4vS9Mr5x/0UZoqCxPVR9KRUjU4kA2sO7GEUn1fop0DAwpO6z0Nw/kJON9bDmSxdWxO/TT1A==} engines: {node: '>=14.0.0'} @@ -4279,6 +4311,12 @@ packages: resolution: {integrity: sha512-4B/qKCfeE/ODUaAUpSwfzazo5x29WD4r3vXiWsB7I2mSDAihwEqKO+g8GELZUQSSAo5e1XTYh3ZVfLyxBc12nA==} engines: {node: '>= 10.0.0'} + afinn-165-financialmarketnews@3.0.0: + resolution: {integrity: sha512-0g9A1S3ZomFIGDTzZ0t6xmv4AuokBvBmpes8htiyHpH7N4xDmvSQL6UxL/Zcs2ypRb3VwgCscaD8Q3zEawKYhw==} + + afinn-165@1.0.4: + resolution: {integrity: sha512-7+Wlx3BImrK0HiG6y3lU4xX7SpBPSSu8T9iguPMlaueRFxjbYwAQrp9lqZUuFikqKbd/en8lVREILvP2J80uJA==} + agent-base@6.0.2: resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==} engines: {node: '>= 6.0.0'} @@ -4398,6 +4436,10 @@ packages: app-module-path@2.2.0: resolution: {integrity: sha512-gkco+qxENJV+8vFcDiiFhuoSvRXb2a/QPqpSoWhVz829VNJfOTnELbBmPmNKFxf3xdNnw4DWCkzkDaavcX/1YQ==} + apparatus@0.0.10: + resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} + engines: {node: '>=0.2.6'} + aproba@2.0.0: resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} @@ -4496,6 +4538,9 @@ packages: resolution: {integrity: sha512-ISvCdHdlTDlH5IpxQJIex7BWBywFWgjJSVdwst+/iQCoEYnyOaQ95+X1JGshuBjGp6nxKUy1jMgE3zPqN7fQdg==} hasBin: true + async@2.6.4: + resolution: {integrity: sha512-mzo5dfJYwAn29PeiJ0zvwTo04zj8HDJj0Mn8TD7sno7q12prdbnasKJHhkm2c1LgrhlJ0teaea8860oxi51mGA==} + async@3.2.5: resolution: {integrity: sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==} @@ -4582,6 +4627,10 @@ packages: base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + basic-auth@2.0.1: + resolution: {integrity: sha512-NF+epuEdnUYVlGuhaxbbq+dvJttwLnGY+YixlXlME5KpQ5W3CnXA5cVTneY3SPbPDRkcjMbifrwmFYcClgOZeg==} + engines: {node: '>= 0.8'} + batch@0.6.1: resolution: {integrity: sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==} @@ -4949,6 +4998,10 @@ packages: resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==} engines: {node: '>=6'} + cluster-key-slot@1.1.2: + resolution: {integrity: sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA==} + engines: {node: '>=0.10.0'} + code-red@1.0.4: resolution: {integrity: sha512-7qJWqItLA8/VPVlKJlFXU+NBlo/qyfs39aJcuMT/2ere32ZqvF5OSxgdM5xOfJJ7O429gg2HM47y8v9P+9wrNw==} @@ -5133,6 +5186,10 @@ packages: core-util-is@1.0.3: resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} + corser@2.0.1: + resolution: {integrity: sha512-utCYNzRSQIZNPIcGZdQc92UVJYAhtGAteCFg0yRaFm8f0P+CPtyGyHXJcGXnffjCybUCEx3FQ2G7U3/o9eIkVQ==} + engines: {node: '>= 0.4.0'} + cosmiconfig@6.0.0: resolution: {integrity: sha512-xb3ZL6+L8b9JLLCx3ZdoZy4+2ECphCMo2PwqgP1tlfVq6M6YReyzBJtvWWtbDSpNr9hn96pkCiZqUcFEc+54Qg==} engines: {node: '>=8'} @@ -6517,6 +6574,10 @@ packages: hpack.js@2.1.6: resolution: {integrity: sha512-zJxVehUdMGIKsRaNt7apO2Gqp0BdqW5yaiGHXXmbpvxgBYVZnAql+BJb4RO5ad2MgpbZKn5G6nMnegrH1FcNYQ==} + html-encoding-sniffer@3.0.0: + resolution: {integrity: sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==} + engines: {node: '>=12'} + 
html-entities@2.5.2: resolution: {integrity: sha512-K//PSRMQk4FZ78Kyau+mZurHn3FH0Vwr+H36eE0rPbeYkRRi9YxceYPhuN60UwWorxyKHhqoAJl2OFKa4BVtaA==} @@ -6595,6 +6656,11 @@ packages: resolution: {integrity: sha512-7mz/721AbnJwIVbnaSv1Cz3Am0ZLT/UBwkC92VlxhXv/k/BBQfM2fXElQNC27BVGr0uwUpplYPQM9LnaBMR5NQ==} engines: {node: '>=8.0.0'} + http-server@14.1.1: + resolution: {integrity: sha512-+cbxadF40UXd9T01zUHgA+rlo2Bg1Srer4+B4NwIHdaGxAGGv59nYRnGGDJ9LBk7alpS0US+J+bLLdQOOkJq4A==} + engines: {node: '>=12'} + hasBin: true + http2-wrapper@1.0.3: resolution: {integrity: sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==} engines: {node: '>=10.19.0'} @@ -7172,6 +7238,10 @@ packages: jws@4.0.0: resolution: {integrity: sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==} + kareem@2.6.3: + resolution: {integrity: sha512-C3iHfuGUXK2u8/ipq9LfjFfXFxAZMQJJq7vLS45r3D9Y2xQ/m4S8zaR4zMLFWh9AsNPXmcFfUDhTEO8UIC/V6Q==} + engines: {node: '>=12.0.0'} + keyv@4.5.4: resolution: {integrity: sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==} @@ -7512,6 +7582,10 @@ packages: resolution: {integrity: sha512-UERzLsxzllchadvbPs5aolHh65ISpKpM+ccLbOJ8/vvpBKmAWf+la7dXFy7Mr0ySHbdHrFv5kGFCUHHe6GFEmw==} engines: {node: '>= 4.0.0'} + memjs@1.3.2: + resolution: {integrity: sha512-qUEg2g8vxPe+zPn09KidjIStHPtoBO8Cttm8bgJFWWabbsjQ9Av9Ky+6UcvKx6ue0LLb/LEhtcyQpRyKfzeXcg==} + engines: {node: '>=0.10.0'} + memory-pager@1.5.0: resolution: {integrity: sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==} @@ -7761,6 +7835,10 @@ packages: mkdirp-classic@0.5.3: resolution: {integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==} + mkdirp@0.5.6: + resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} + hasBin: true + mkdirp@1.0.4: resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} engines: {node: '>=10'} @@ -7782,6 +7860,33 @@ packages: mongodb-connection-string-url@3.0.1: resolution: {integrity: sha512-XqMGwRX0Lgn05TDB4PyG2h2kKO/FfWJyCzYQbIhXUxz7ETt0I/FqHjUeqj37irJ+Dl1ZtU82uYyj14u2XsZKfg==} + mongodb@6.7.0: + resolution: {integrity: sha512-TMKyHdtMcO0fYBNORiYdmM25ijsHs+Njs963r4Tro4OQZzqYigAzYQouwWRg4OIaiLRUEGUh/1UAcH5lxdSLIA==} + engines: {node: '>=16.20.1'} + peerDependencies: + '@aws-sdk/credential-providers': ^3.188.0 + '@mongodb-js/zstd': ^1.1.0 + gcp-metadata: ^5.2.0 + kerberos: ^2.0.1 + mongodb-client-encryption: '>=6.0.0 <7' + snappy: ^7.2.2 + socks: ^2.7.1 + peerDependenciesMeta: + '@aws-sdk/credential-providers': + optional: true + '@mongodb-js/zstd': + optional: true + gcp-metadata: + optional: true + kerberos: + optional: true + mongodb-client-encryption: + optional: true + snappy: + optional: true + socks: + optional: true + mongodb@6.8.0: resolution: {integrity: sha512-HGQ9NWDle5WvwMnrvUxsFYPd3JEbqD3RgABHBQRuoCEND0qzhsd0iH5ypHsf1eJ+sXmvmyKpP+FLOKY8Il7jMw==} engines: {node: '>=16.20.1'} @@ -7809,6 +7914,18 @@ packages: socks: optional: true + mongoose@8.5.1: + resolution: {integrity: sha512-OhVcwVl91A1G6+XpjDcpkGP7l7ikZkxa0DylX7NT/lcEqAjggzSdqDxb48A+xsDxqNAr0ntSJ1yiE3+KJTOd5Q==} + engines: {node: '>=16.20.1'} + + mpath@0.9.0: + resolution: {integrity: sha512-ikJRQTk8hw5DEoFVxHG1Gn9T/xcjtdnOKIU1JTmGjZZlg9LST2mBLmcX3/ICIbgJydT2GOc15RnNy5mHmzfSew==} + engines: {node: '>=4.0.0'} + + 
mquery@5.0.0: + resolution: {integrity: sha512-iQMncpmEK8R8ncT8HJGsGc9Dsp8xcgYMVSbs5jgnm1lFHTZqMJTUWTDx1LBO8+mK3tPNZWFLBghQEIOULSTHZg==} + engines: {node: '>=14.0.0'} + mri@1.2.0: resolution: {integrity: sha512-tzzskb3bG8LvYGFF/mDTpq3jpI6Q9wc3LEmBaghu+DdCssd1FakN7Bc0hVNmEyGq1bq3RgfkCb3cmQLpNPOroA==} engines: {node: '>=4'} @@ -7856,6 +7973,10 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} + natural@7.1.0: + resolution: {integrity: sha512-GBhiRgF0VUX+zPWahBVir1ajARQDZF1Fe6UpQORNzyQT57JQ2KLKYvubecvjIYh/uDaociusmySeRh+WL5OdxQ==} + engines: {node: '>=0.4.10'} + negotiator@0.6.3: resolution: {integrity: sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==} engines: {node: '>= 0.6'} @@ -8424,6 +8545,10 @@ packages: resolution: {integrity: sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==} engines: {node: '>=4'} + portfinder@1.0.32: + resolution: {integrity: sha512-on2ZJVVDXRADWE6jnQaX0ioEylzgBpQk8r55NE4wjXW1ZxO+BgDlY6DXwj20i0V8eB4SenDQ00WEaxfiIQPcxg==} + engines: {node: '>= 0.12.0'} + portkey-ai@0.1.16: resolution: {integrity: sha512-EY4FRp6PZSD75Q1o1qc08DfPNTG9FnkUPN3Z1/lEvaq9iFpSO5UekcagUZaKSVhao311qjBjns+kF0rS9ht7iA==} @@ -9101,6 +9226,9 @@ packages: resolution: {integrity: sha512-8HrF5ZsXk5FAH9dgsx3BlUer73nIhuj+9OrQwEbLTPOBzGkL1lsFCR01am+v+0m2Cmbs1nP12hLDl5FA7EszKA==} engines: {node: '>=6.0.0'} + redis@4.6.15: + resolution: {integrity: sha512-2NtuOpMW3tnYzBw6S8mbXSX7RPzvVFCA2wFJq9oErushO2UeBkxObk+uvo7gv7n0rhWeOj/IzrHO8TjcFlRSOg==} + reflect.getprototypeof@1.0.6: resolution: {integrity: sha512-fmfw4XgoDke3kdI6h4xcUz1dG8uaiv5q9gcEwLS4Pnth2kxT+GZ7YehS1JTMGBQmtV7Y4GFGbs2re2NqhdozUg==} engines: {node: '>= 0.4'} @@ -9371,6 +9499,9 @@ packages: resolution: {integrity: sha512-vfD3pmTzGpufjScBh50YHKzEu2lxBWhVEHsNGoEXmCmn2hKGfeNLYMzCJpe8cD7gqX7TJluOVpBkAequ6dgMmA==} engines: {node: '>=4'} + secure-compare@3.0.1: + resolution: {integrity: sha512-AckIIV90rPDcBcglUwXPF3kg0P0qmPsPXAj6BBEENQE1p5yA1xfmDJzfi1Tappj37Pv2mVbKpL3Z1T+Nn7k1Qw==} + secure-json-parse@2.7.0: resolution: {integrity: sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==} @@ -9486,6 +9617,9 @@ packages: resolution: {integrity: sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==} engines: {node: '>= 0.4'} + sift@17.1.3: + resolution: {integrity: sha512-Rtlj66/b0ICeFzYTuNvX/EF1igRbbnGSvEyT79McoZa/DeGhMyC5pWKOEsZKnpkqtSeovd5FL/bjHWC3CIIvCQ==} + siginfo@2.0.0: resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==} @@ -9656,6 +9790,10 @@ packages: resolution: {integrity: sha512-KXDYZ9dszj6bzvnEMRYvxgeTHU74QBFL54XKtP3nyMuJ81CFYtABZ3bAzL2EdFUaEwJOBOgENyFj3R7oTzDyyw==} engines: {node: '>=4', npm: '>=6'} + stopwords-iso@1.1.0: + resolution: {integrity: sha512-I6GPS/E0zyieHehMRPQcqkiBMJKGgLta+1hREixhoLPqEA0AlVFiC43dl8uPpmkkeRdDMzYRWFWk5/l9x7nmNg==} + engines: {node: '>=0.10.0'} + stream-to-array@2.3.0: resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==} @@ -9868,6 +10006,10 @@ packages: peerDependencies: vue: '>=3.2.26 < 4' + sylvester@0.0.12: + resolution: {integrity: sha512-SzRP5LQ6Ts2G5NyAa/jg16s8e3R7rfdFjizy1zeoecYWw+nGL+YA1xZvW/+iJmidBGSdLkuvdwTYEyJEb+EiUw==} + engines: {node: '>=0.2.6'} + tailwind-merge@2.4.0: resolution: 
{integrity: sha512-49AwoOQNKdqKPd9CViyH5wJoSKsCDjUlzL8DxuGp3P1FsGY36NJDAa18jLZcaHAUUuTj+JB8IAo8zWgBNvBF7A==} @@ -10240,6 +10382,10 @@ packages: unified@11.0.5: resolution: {integrity: sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==} + union@0.5.0: + resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==} + engines: {node: '>= 0.8.0'} + unique-string@3.0.0: resolution: {integrity: sha512-VGXBUVwxKMBUznyffQweQABPRRW1vHZAbadFZud4pLFAqRGvv/96vafgjWFqzourzr8YonlQiPgH0YCJfawoGQ==} engines: {node: '>=12'} @@ -10592,6 +10738,10 @@ packages: resolution: {integrity: sha512-OqedPIGOfsDlo31UNwYbCFMSaO9m9G/0faIHj5/dZFDMFqPTcx6UwqyOy3COEaEOg/9VsGIpdqn62W5KhoKSpg==} engines: {node: '>=0.8.0'} + whatwg-encoding@2.0.0: + resolution: {integrity: sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==} + engines: {node: '>=12'} + whatwg-fetch@3.6.20: resolution: {integrity: sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==} @@ -10664,6 +10814,10 @@ packages: resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==} engines: {node: '>=0.10.0'} + wordnet-db@3.1.14: + resolution: {integrity: sha512-zVyFsvE+mq9MCmwXUWHIcpfbrHHClZWZiVOzKSxNJruIcFn2RbY55zkhiAMMxM8zCVSmtNiViq8FsAZSFpMYag==} + engines: {node: '>=0.6.0'} + wordwrap@1.0.0: resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==} @@ -14130,6 +14284,32 @@ snapshots: optionalDependencies: '@types/react': 18.3.3 + '@redis/bloom@1.2.0(@redis/client@1.5.17)': + dependencies: + '@redis/client': 1.5.17 + + '@redis/client@1.5.17': + dependencies: + cluster-key-slot: 1.1.2 + generic-pool: 3.9.0 + yallist: 4.0.0 + + '@redis/graph@1.1.1(@redis/client@1.5.17)': + dependencies: + '@redis/client': 1.5.17 + + '@redis/json@1.0.6(@redis/client@1.5.17)': + dependencies: + '@redis/client': 1.5.17 + + '@redis/search@1.1.6(@redis/client@1.5.17)': + dependencies: + '@redis/client': 1.5.17 + + '@redis/time-series@1.0.5(@redis/client@1.5.17)': + dependencies: + '@redis/client': 1.5.17 + '@rollup/plugin-commonjs@25.0.8(rollup@4.18.1)': dependencies: '@rollup/pluginutils': 5.1.0(rollup@4.18.1) @@ -15523,6 +15703,10 @@ snapshots: address@1.2.2: {} + afinn-165-financialmarketnews@3.0.0: {} + + afinn-165@1.0.4: {} + agent-base@6.0.2: dependencies: debug: 4.3.5 @@ -15657,6 +15841,10 @@ snapshots: app-module-path@2.2.0: {} + apparatus@0.0.10: + dependencies: + sylvester: 0.0.12 + aproba@2.0.0: optional: true @@ -15786,6 +15974,10 @@ snapshots: astring@1.8.6: {} + async@2.6.4: + dependencies: + lodash: 4.17.21 + async@3.2.5: {} asynckit@0.4.0: {} @@ -15890,6 +16082,10 @@ snapshots: base64-js@1.5.1: {} + basic-auth@2.0.1: + dependencies: + safe-buffer: 5.1.2 + batch@0.6.1: {} better-path-resolve@1.0.0: @@ -16315,6 +16511,8 @@ snapshots: clsx@2.1.1: {} + cluster-key-slot@1.1.2: {} + code-red@1.0.4: dependencies: '@jridgewell/sourcemap-codec': 1.5.0 @@ -16507,6 +16705,8 @@ snapshots: core-util-is@1.0.3: {} + corser@2.0.1: {} + cosmiconfig@6.0.0: dependencies: '@types/parse-json': 4.0.2 @@ -18428,6 +18628,10 @@ snapshots: readable-stream: 2.3.8 wbuf: 1.7.3 + html-encoding-sniffer@3.0.0: + dependencies: + whatwg-encoding: 2.0.0 + html-entities@2.5.2: {} html-escaper@2.0.2: {} @@ -18530,6 +18734,25 @@ snapshots: transitivePeerDependencies: - debug + http-server@14.1.1: 
+ dependencies: + basic-auth: 2.0.1 + chalk: 4.1.2 + corser: 2.0.1 + he: 1.2.0 + html-encoding-sniffer: 3.0.0 + http-proxy: 1.18.1 + mime: 1.6.0 + minimist: 1.2.8 + opener: 1.5.2 + portfinder: 1.0.32 + secure-compare: 3.0.1 + union: 0.5.0 + url-join: 4.0.1 + transitivePeerDependencies: + - debug + - supports-color + http2-wrapper@1.0.3: dependencies: quick-lru: 5.1.1 @@ -19061,6 +19284,8 @@ snapshots: jwa: 2.0.0 safe-buffer: 5.2.1 + kareem@2.6.3: {} + keyv@4.5.4: dependencies: json-buffer: 3.0.1 @@ -19554,6 +19779,8 @@ snapshots: dependencies: fs-monkey: 1.0.6 + memjs@1.3.2: {} + memory-pager@1.5.0: {} merge-descriptors@1.0.1: {} @@ -19957,6 +20184,10 @@ snapshots: mkdirp-classic@0.5.3: {} + mkdirp@0.5.6: + dependencies: + minimist: 1.2.8 + mkdirp@1.0.4: {} mlly@1.7.1: @@ -19983,6 +20214,14 @@ snapshots: '@types/whatwg-url': 11.0.5 whatwg-url: 13.0.0 + mongodb@6.7.0(@aws-sdk/credential-providers@3.613.0): + dependencies: + '@mongodb-js/saslprep': 1.1.7 + bson: 6.8.0 + mongodb-connection-string-url: 3.0.1 + optionalDependencies: + '@aws-sdk/credential-providers': 3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0)) + mongodb@6.8.0(@aws-sdk/credential-providers@3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0))): dependencies: '@mongodb-js/saslprep': 1.1.7 @@ -19991,6 +20230,33 @@ snapshots: optionalDependencies: '@aws-sdk/credential-providers': 3.613.0(@aws-sdk/client-sso-oidc@3.613.0(@aws-sdk/client-sts@3.613.0)) + mongoose@8.5.1(@aws-sdk/credential-providers@3.613.0): + dependencies: + bson: 6.8.0 + kareem: 2.6.3 + mongodb: 6.7.0(@aws-sdk/credential-providers@3.613.0) + mpath: 0.9.0 + mquery: 5.0.0 + ms: 2.1.3 + sift: 17.1.3 + transitivePeerDependencies: + - '@aws-sdk/credential-providers' + - '@mongodb-js/zstd' + - gcp-metadata + - kerberos + - mongodb-client-encryption + - snappy + - socks + - supports-color + + mpath@0.9.0: {} + + mquery@5.0.0: + dependencies: + debug: 4.3.5 + transitivePeerDependencies: + - supports-color + mri@1.2.0: {} mrmime@2.0.0: {} @@ -20025,6 +20291,35 @@ snapshots: natural-compare@1.4.0: {} + natural@7.1.0(@aws-sdk/credential-providers@3.613.0): + dependencies: + afinn-165: 1.0.4 + afinn-165-financialmarketnews: 3.0.0 + apparatus: 0.0.10 + dotenv: 16.4.5 + http-server: 14.1.1 + memjs: 1.3.2 + mongoose: 8.5.1(@aws-sdk/credential-providers@3.613.0) + pg: 8.12.0 + redis: 4.6.15 + safe-stable-stringify: 2.4.3 + stopwords-iso: 1.1.0 + sylvester: 0.0.12 + underscore: 1.13.6 + uuid: 9.0.1 + wordnet-db: 3.1.14 + transitivePeerDependencies: + - '@aws-sdk/credential-providers' + - '@mongodb-js/zstd' + - debug + - gcp-metadata + - kerberos + - mongodb-client-encryption + - pg-native + - snappy + - socks + - supports-color + negotiator@0.6.3: {} neo-async@2.6.2: {} @@ -20660,6 +20955,14 @@ snapshots: pluralize@8.0.0: {} + portfinder@1.0.32: + dependencies: + async: 2.6.4 + debug: 3.2.7 + mkdirp: 0.5.6 + transitivePeerDependencies: + - supports-color + portkey-ai@0.1.16: dependencies: agentkeepalive: 4.5.0 @@ -21379,6 +21682,15 @@ snapshots: dependencies: minimatch: 3.1.2 + redis@4.6.15: + dependencies: + '@redis/bloom': 1.2.0(@redis/client@1.5.17) + '@redis/client': 1.5.17 + '@redis/graph': 1.1.1(@redis/client@1.5.17) + '@redis/json': 1.0.6(@redis/client@1.5.17) + '@redis/search': 1.1.6(@redis/client@1.5.17) + '@redis/time-series': 1.0.5(@redis/client@1.5.17) + reflect.getprototypeof@1.0.6: dependencies: call-bind: 1.0.7 @@ -21725,6 +22037,8 @@ snapshots: extend-shallow: 2.0.1 kind-of: 6.0.3 + secure-compare@3.0.1: {} + 
secure-json-parse@2.7.0: {} select-hose@2.0.0: {} @@ -21903,6 +22217,8 @@ snapshots: get-intrinsic: 1.2.4 object-inspect: 1.13.2 + sift@17.1.3: {} + siginfo@2.0.0: {} signal-exit@3.0.7: {} @@ -22074,6 +22390,8 @@ snapshots: stoppable@1.1.0: {} + stopwords-iso@1.1.0: {} + stream-to-array@2.3.0: dependencies: any-promise: 1.3.0 @@ -22317,6 +22635,8 @@ snapshots: dependencies: vue: 3.4.31(typescript@5.5.3) + sylvester@0.0.12: {} + tailwind-merge@2.4.0: {} tailwindcss@3.4.4: @@ -22713,6 +23033,10 @@ snapshots: trough: 2.2.0 vfile: 6.0.1 + union@0.5.0: + dependencies: + qs: 6.11.2 + unique-string@3.0.0: dependencies: crypto-random-string: 4.0.0 @@ -23186,6 +23510,10 @@ snapshots: websocket-extensions@0.1.4: {} + whatwg-encoding@2.0.0: + dependencies: + iconv-lite: 0.6.3 + whatwg-fetch@3.6.20: {} whatwg-url@13.0.0: @@ -23296,6 +23624,8 @@ snapshots: word-wrap@1.2.5: {} + wordnet-db@3.1.14: {} + wordwrap@1.0.0: {} workerd@1.20240701.0: -- GitLab
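Migration sketch: downstream code that constructed the removed `SimpleNodeParser` can switch to the `SentenceSplitter` now exported from `@llamaindex/core/node-parser`, as the `ServiceContext`, `Settings`, and `LlamaCloudIndex` hunks above do. The example below is a minimal sketch under two assumptions: that `splitText(text)` keeps the `string[]` shape of the deleted `TextSplitter`, and that `getNodesFromDocuments(documents)` keeps the shape of the deleted `NodeParser` interface. The import paths, constructor options, and the now-public `chunkSize`/`chunkOverlap` fields are taken directly from the hunks above; the chunking values are arbitrary example values.

```ts
import { SentenceSplitter } from "@llamaindex/core/node-parser";
import { Document } from "@llamaindex/core/schema";

// Before this patch (llamaindex):
//   new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 })
// After: the sentence splitter is itself the node parser / transformation.
const splitter = new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 });

// chunkSize and chunkOverlap are public fields now; the getTransformationConfig
// hunk reads them directly instead of reaching into a nested textSplitter.
console.log(splitter.chunkSize, splitter.chunkOverlap);

// Assumed API: split raw text into chunk strings, mirroring the deleted
// TextSplitter's splitText(text): string[].
const chunks = splitter.splitText(
  "Bill Gates stole from Apple. Steve Jobs stole from Xerox.",
);
console.log(chunks);

// Assumed API: parse documents into nodes, as the deleted NodeParser
// interface defined; this is how the splitter slots into the default
// transformations in LlamaCloudIndex.fromDocuments.
const nodes = splitter.getNodesFromDocuments([
  new Document({ text: "Bill Gates stole from Apple. Steve Jobs stole from Xerox." }),
]);
console.log(nodes.length);
```

Code that previously read `parser.textSplitter.chunkSize` should now read `parser.chunkSize` directly, mirroring the change in `getTransformationConfig` above.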