From be3e280f2a8de9efe80e441a4590d44d0d55f094 Mon Sep 17 00:00:00 2001
From: Phil Nash <philnash@gmail.com>
Date: Fri, 30 Aug 2024 06:15:57 +0200
Subject: [PATCH] Updates references to SimpleNodeParser to SentenceSplitter.
 (#1129)

---
 .../docs/modules/ingestion_pipeline/index.md | 8 ++++----
 .../ingestion_pipeline/transformations.md | 16 ++++++++--------
 apps/docs/docs/modules/node_parser.md | 5 ++---
 .../modules/query_engines/router_query_engine.md | 10 +++++-----
 examples/agent/multi_document_agent.ts | 4 ++--
 examples/extractors/keywordExtractor.ts | 4 ++--
 .../extractors/questionsAnsweredExtractor.ts | 4 ++--
 examples/extractors/summaryExtractor.ts | 4 ++--
 examples/extractors/titleExtractor.ts | 4 ++--
 examples/jupyter/nodeparser.ipynb | 7 ++-----
 examples/lowlevel.ts | 4 ++--
 examples/pipeline/ingestion.ts | 4 ++--
 examples/routerQueryEngine.ts | 4 ++--
 examples/summaryIndex.ts | 4 ++--
 .../llamaindex/tests/MetadataExtractors.test.ts | 10 +++++-----
 .../tests/ingestion/IngestionCache.test.ts | 6 +++---
 16 files changed, 47 insertions(+), 51 deletions(-)

diff --git a/apps/docs/docs/modules/ingestion_pipeline/index.md b/apps/docs/docs/modules/ingestion_pipeline/index.md
index adb4d8bf9..6bf5c2995 100644
--- a/apps/docs/docs/modules/ingestion_pipeline/index.md
+++ b/apps/docs/docs/modules/ingestion_pipeline/index.md
@@ -16,7 +16,7 @@ import {
   MetadataMode,
   OpenAIEmbedding,
   TitleExtractor,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 async function main() {
@@ -29,7 +29,7 @@ async function main() {
   const document = new Document({ text: essay, id_: path });
   const pipeline = new IngestionPipeline({
     transformations: [
-      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 }),
       new TitleExtractor(),
       new OpenAIEmbedding(),
     ],
@@ -62,7 +62,7 @@ import {
   MetadataMode,
   OpenAIEmbedding,
   TitleExtractor,
-  SimpleNodeParser,
+  SentenceSplitter,
   QdrantVectorStore,
   VectorStoreIndex,
 } from "llamaindex";
@@ -81,7 +81,7 @@ async function main() {
   const document = new Document({ text: essay, id_: path });
   const pipeline = new IngestionPipeline({
     transformations: [
-      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 }),
       new TitleExtractor(),
       new OpenAIEmbedding(),
     ],
diff --git a/apps/docs/docs/modules/ingestion_pipeline/transformations.md b/apps/docs/docs/modules/ingestion_pipeline/transformations.md
index d1e82b743..8cc35c54a 100644
--- a/apps/docs/docs/modules/ingestion_pipeline/transformations.md
+++ b/apps/docs/docs/modules/ingestion_pipeline/transformations.md
@@ -4,7 +4,7 @@ A transformation is something that takes a list of nodes as input and returns a list of nodes.
 
 Currently, the following components are Transformation objects:
 
-- [SimpleNodeParser](../../api/classes/SimpleNodeParser.md)
+- [SentenceSplitter](../../api/classes/SentenceSplitter.md)
 - [MetadataExtractor](../documents_and_nodes/metadata_extraction.md)
 - [Embeddings](../embeddings/index.md)
 
@@ -13,10 +13,10 @@ Currently, the following components are Transformation objects:
 
 While transformations are best used with an IngestionPipeline, they can also be used directly.
 ```ts
-import { SimpleNodeParser, TitleExtractor, Document } from "llamaindex";
+import { SentenceSplitter, TitleExtractor, Document } from "llamaindex";
 
 async function main() {
-  let nodes = new SimpleNodeParser().getNodesFromDocuments([
+  let nodes = new SentenceSplitter().getNodesFromDocuments([
     new Document({ text: "I am 10 years old. John is 20 years old." }),
   ]);
 
 main().catch(console.error);
 
 ## Custom Transformations
 
-You can implement any transformation yourself by implementing the `TransformerComponent`.
+You can implement any transformation yourself by implementing the `TransformComponent`.
 
-The following custom transformation will remove any special characters or punctutaion in text.
+The following custom transformation will remove any special characters or punctuation in text.
 
 ```ts
-import { TransformerComponent, Node } from "llamaindex";
+import { TransformComponent, TextNode } from "llamaindex";
 
-class RemoveSpecialCharacters extends TransformerComponent {
-  async transform(nodes: Node[]): Promise<Node[]> {
+export class RemoveSpecialCharacters extends TransformComponent {
+  async transform(nodes: TextNode[]): Promise<TextNode[]> {
     for (const node of nodes) {
       node.text = node.text.replace(/[^\w\s]/gi, "");
     }
diff --git a/apps/docs/docs/modules/node_parser.md b/apps/docs/docs/modules/node_parser.md
index 68a024903..6efaca848 100644
--- a/apps/docs/docs/modules/node_parser.md
+++ b/apps/docs/docs/modules/node_parser.md
@@ -7,9 +7,9 @@ sidebar_position: 4
 
 The `NodeParser` in LlamaIndex is responsible for splitting `Document` objects into more manageable `Node` objects. When you call `.fromDocuments()`, the `NodeParser` from the `Settings` is used to do this automatically for you. Alternatively, you can use it to split documents ahead of time.
 
 ```typescript
-import { Document, SimpleNodeParser } from "llamaindex";
+import { Document, SentenceSplitter } from "llamaindex";
 
-const nodeParser = new SimpleNodeParser();
+const nodeParser = new SentenceSplitter();
 
 Settings.nodeParser = nodeParser;
 ```
@@ -93,6 +93,5 @@ The output metadata will be something like:
 
 ## API Reference
 
-- [SimpleNodeParser](../api/classes/SimpleNodeParser.md)
 - [SentenceSplitter](../api/classes/SentenceSplitter.md)
 - [MarkdownNodeParser](../api/classes/MarkdownNodeParser.md)
diff --git a/apps/docs/docs/modules/query_engines/router_query_engine.md b/apps/docs/docs/modules/query_engines/router_query_engine.md
index be7c46e67..1bc78272d 100644
--- a/apps/docs/docs/modules/query_engines/router_query_engine.md
+++ b/apps/docs/docs/modules/query_engines/router_query_engine.md
@@ -15,7 +15,7 @@ import {
   OpenAI,
   RouterQueryEngine,
   SimpleDirectoryReader,
-  SimpleNodeParser,
+  SentenceSplitter,
   SummaryIndex,
   VectorStoreIndex,
   Settings,
@@ -34,11 +34,11 @@ const documents = await new SimpleDirectoryReader().loadData({
 
 ## Service Context
 
-Next, we need to define some basic rules and parse the documents into nodes. We will use the `SimpleNodeParser` to parse the documents into nodes and `Settings` to define the rules (eg. LLM API key, chunk size, etc.):
+Next, we need to define some basic rules and parse the documents into nodes. We will use the `SentenceSplitter` to parse the documents into nodes and `Settings` to define the rules (e.g. LLM API key, chunk size, etc.):
 
 ```ts
 Settings.llm = new OpenAI();
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 1024,
 });
 ```
@@ -104,14 +104,14 @@ import {
   OpenAI,
   RouterQueryEngine,
   SimpleDirectoryReader,
-  SimpleNodeParser,
+  SentenceSplitter,
   SummaryIndex,
   VectorStoreIndex,
   Settings,
 } from "llamaindex";
 
 Settings.llm = new OpenAI();
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 1024,
 });
 
diff --git a/examples/agent/multi_document_agent.ts b/examples/agent/multi_document_agent.ts
index 8a7861a15..f5aa159e4 100644
--- a/examples/agent/multi_document_agent.ts
+++ b/examples/agent/multi_document_agent.ts
@@ -6,8 +6,8 @@ import {
   OpenAI,
   OpenAIAgent,
   QueryEngineTool,
+  SentenceSplitter,
   Settings,
-  SimpleNodeParser,
   SimpleToolNodeMapping,
   SummaryIndex,
   VectorStoreIndex,
@@ -43,7 +43,7 @@ async function main() {
   for (const title of wikiTitles) {
     console.log(`Processing ${title}`);
 
-    const nodes = new SimpleNodeParser({
+    const nodes = new SentenceSplitter({
       chunkSize: 200,
       chunkOverlap: 20,
     }).getNodesFromDocuments([countryDocs[title]]);
diff --git a/examples/extractors/keywordExtractor.ts b/examples/extractors/keywordExtractor.ts
index 8669c0004..78095cefb 100644
--- a/examples/extractors/keywordExtractor.ts
+++ b/examples/extractors/keywordExtractor.ts
@@ -2,13 +2,13 @@ import {
   Document,
   KeywordExtractor,
   OpenAI,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({ text: "banana apple orange pear peach watermelon" }),
diff --git a/examples/extractors/questionsAnsweredExtractor.ts b/examples/extractors/questionsAnsweredExtractor.ts
index 30c55d98d..d9a4f8a1b 100644
--- a/examples/extractors/questionsAnsweredExtractor.ts
+++ b/examples/extractors/questionsAnsweredExtractor.ts
@@ -2,13 +2,13 @@ import {
   Document,
   OpenAI,
   QuestionsAnsweredExtractor,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({
diff --git a/examples/extractors/summaryExtractor.ts b/examples/extractors/summaryExtractor.ts
index 1bb2c903a..f7e38c1f4 100644
--- a/examples/extractors/summaryExtractor.ts
+++ b/examples/extractors/summaryExtractor.ts
@@ -1,14 +1,14 @@
 import {
   Document,
   OpenAI,
-  SimpleNodeParser,
+  SentenceSplitter,
   SummaryExtractor,
 } from "llamaindex";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({
diff --git a/examples/extractors/titleExtractor.ts b/examples/extractors/titleExtractor.ts
index 0843e9208..cdd22d930 100644
--- a/examples/extractors/titleExtractor.ts
+++ b/examples/extractors/titleExtractor.ts
@@ -1,11 +1,11 @@
-import { Document, OpenAI, SimpleNodeParser, TitleExtractor } from "llamaindex";
+import { Document, OpenAI, SentenceSplitter, TitleExtractor } from "llamaindex";
 
 import essay from "../essay";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo-0125", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser({});
+  const nodeParser = new SentenceSplitter({});
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({
diff --git a/examples/jupyter/nodeparser.ipynb b/examples/jupyter/nodeparser.ipynb
index 14df45b97..ffb48331a 100644
--- a/examples/jupyter/nodeparser.ipynb
+++ b/examples/jupyter/nodeparser.ipynb
@@ -7,10 +7,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import {\n",
-    "  Document,\n",
-    "  SimpleNodeParser\n",
-    "} from \"npm:llamaindex\";"
+    "import { Document, SentenceSplitter } from \"npm:llamaindex\";"
    ]
   },
   {
@@ -45,7 +42,7 @@
     }
    ],
    "source": [
-    "const nodeParser = new SimpleNodeParser();\n",
+    "const nodeParser = new SentenceSplitter();\n",
     "const nodes = nodeParser.getNodesFromDocuments([\n",
     "  new Document({ text: \"I am 10 years old. John is 20 years old.\" }),\n",
     "]);\n",
diff --git a/examples/lowlevel.ts b/examples/lowlevel.ts
index 5db9b71dc..5f96692f3 100644
--- a/examples/lowlevel.ts
+++ b/examples/lowlevel.ts
@@ -2,12 +2,12 @@ import {
   Document,
   NodeWithScore,
   ResponseSynthesizer,
-  SimpleNodeParser,
+  SentenceSplitter,
   TextNode,
 } from "llamaindex";
 
 (async () => {
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({ text: "I am 10 years old. John is 20 years old." }),
   ]);
diff --git a/examples/pipeline/ingestion.ts b/examples/pipeline/ingestion.ts
index 341bfea77..d3a851843 100644
--- a/examples/pipeline/ingestion.ts
+++ b/examples/pipeline/ingestion.ts
@@ -5,7 +5,7 @@ import {
   IngestionPipeline,
   MetadataMode,
   OpenAIEmbedding,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 async function main() {
@@ -18,7 +18,7 @@ async function main() {
   const document = new Document({ text: essay, id_: path });
   const pipeline = new IngestionPipeline({
     transformations: [
-      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 }),
       new OpenAIEmbedding(),
     ],
   });
diff --git a/examples/routerQueryEngine.ts b/examples/routerQueryEngine.ts
index 5bd169008..b1081d5c3 100644
--- a/examples/routerQueryEngine.ts
+++ b/examples/routerQueryEngine.ts
@@ -1,9 +1,9 @@
 import {
   OpenAI,
   RouterQueryEngine,
+  SentenceSplitter,
   Settings,
   SimpleDirectoryReader,
-  SimpleNodeParser,
   SummaryIndex,
   VectorStoreIndex,
 } from "llamaindex";
@@ -12,7 +12,7 @@ import {
 Settings.llm = new OpenAI();
 
 // Update node parser
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 1024,
 });
 
diff --git a/examples/summaryIndex.ts b/examples/summaryIndex.ts
index 31710d41c..0813cf760 100644
--- a/examples/summaryIndex.ts
+++ b/examples/summaryIndex.ts
@@ -1,7 +1,7 @@
 import {
   Document,
+  SentenceSplitter,
   Settings,
-  SimpleNodeParser,
   SummaryIndex,
   SummaryRetrieverMode,
 } from "llamaindex";
@@ -9,7 +9,7 @@ import {
 import essay from "./essay";
 
 // Update node parser
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 40,
 });
 
diff --git a/packages/llamaindex/tests/MetadataExtractors.test.ts b/packages/llamaindex/tests/MetadataExtractors.test.ts
index 13ef65132..76eb487ca 100644
--- a/packages/llamaindex/tests/MetadataExtractors.test.ts
+++ b/packages/llamaindex/tests/MetadataExtractors.test.ts
@@ -10,7 +10,7 @@ import {
   TitleExtractor,
 } from "llamaindex/extractors/index";
 import { OpenAI } from "llamaindex/llm/openai";
-import { SimpleNodeParser } from "llamaindex/nodeParsers/index";
+import { SentenceSplitter } from "llamaindex/nodeParsers/index";
 import { afterAll, beforeAll, describe, expect, test, vi } from "vitest";
 import {
   DEFAULT_LLM_TEXT_OUTPUT,
@@ -45,7 +45,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] KeywordExtractor returns excerptKeywords metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
@@ -64,7 +64,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] TitleExtractor returns documentTitle metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
@@ -83,7 +83,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] QuestionsAnsweredExtractor returns questionsThisExcerptCanAnswer metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
@@ -103,7 +103,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] SummaryExtractor returns sectionSummary metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
diff --git a/packages/llamaindex/tests/ingestion/IngestionCache.test.ts b/packages/llamaindex/tests/ingestion/IngestionCache.test.ts
index 6d6f63db3..e4614db58 100644
--- a/packages/llamaindex/tests/ingestion/IngestionCache.test.ts
+++ b/packages/llamaindex/tests/ingestion/IngestionCache.test.ts
@@ -5,7 +5,7 @@ import {
   IngestionCache,
   getTransformationHash,
 } from "llamaindex/ingestion/IngestionCache";
-import { SimpleNodeParser } from "llamaindex/nodeParsers/index";
+import { SentenceSplitter } from "llamaindex/nodeParsers/index";
 import { beforeAll, describe, expect, test } from "vitest";
 
 describe("IngestionCache", () => {
@@ -32,7 +32,7 @@ describe("getTransformationHash", () => {
   beforeAll(() => {
     nodes = [new TextNode({ text: "some text", id_: "some id" })];
 
-    transform = new SimpleNodeParser({
+    transform = new SentenceSplitter({
       chunkOverlap: 10,
       chunkSize: 1024,
     });
@@ -66,7 +66,7 @@ describe("getTransformationHash", () => {
     const result1 = getTransformationHash(nodes, transform);
     const result2 = getTransformationHash(
       nodes,
-      new SimpleNodeParser({
+      new SentenceSplitter({
         chunkOverlap: 10,
         chunkSize: 512,
       }),
-- 
GitLab
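
For reference, the rename this patch applies is mechanical: as the hunks above show, `SentenceSplitter` is the drop-in replacement for `SimpleNodeParser`, taking the same `chunkSize`/`chunkOverlap` options and exposing the same `getNodesFromDocuments` method. A minimal sketch of the updated usage, assuming only the `llamaindex` exports that appear in the patched files:

```ts
import { Document, SentenceSplitter } from "llamaindex";

// SentenceSplitter accepts the same chunking options SimpleNodeParser did.
const nodeParser = new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 });

// Split a document into nodes, as the updated docs and examples do.
const nodes = nodeParser.getNodesFromDocuments([
  new Document({ text: "I am 10 years old. John is 20 years old." }),
]);

console.log(nodes);
```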