diff --git a/apps/docs/docs/modules/ingestion_pipeline/index.md b/apps/docs/docs/modules/ingestion_pipeline/index.md
index adb4d8bf93328f150bbb80184b2b80263e3e9007..6bf5c29954f6f66abcf0c8acc7800b3d4279cb9a 100644
--- a/apps/docs/docs/modules/ingestion_pipeline/index.md
+++ b/apps/docs/docs/modules/ingestion_pipeline/index.md
@@ -16,7 +16,7 @@ import {
   MetadataMode,
   OpenAIEmbedding,
   TitleExtractor,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 async function main() {
@@ -29,7 +29,7 @@ async function main() {
   const document = new Document({ text: essay, id_: path });
   const pipeline = new IngestionPipeline({
     transformations: [
-      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 }),
       new TitleExtractor(),
       new OpenAIEmbedding(),
     ],
@@ -62,7 +62,7 @@ import {
   MetadataMode,
   OpenAIEmbedding,
   TitleExtractor,
-  SimpleNodeParser,
+  SentenceSplitter,
   QdrantVectorStore,
   VectorStoreIndex,
 } from "llamaindex";
@@ -81,7 +81,7 @@ async function main() {
   const document = new Document({ text: essay, id_: path });
   const pipeline = new IngestionPipeline({
     transformations: [
-      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 }),
       new TitleExtractor(),
       new OpenAIEmbedding(),
     ],
diff --git a/apps/docs/docs/modules/ingestion_pipeline/transformations.md b/apps/docs/docs/modules/ingestion_pipeline/transformations.md
index d1e82b7430e6ce2bad5faa103fbc51fda7f92dac..8cc35c54a137e522db43e82ae3a3669c0082cf2e 100644
--- a/apps/docs/docs/modules/ingestion_pipeline/transformations.md
+++ b/apps/docs/docs/modules/ingestion_pipeline/transformations.md
@@ -4,7 +4,7 @@ A transformation is something that takes a list of nodes as an input, and return
 
 Currently, the following components are Transformation objects:
 
-- [SimpleNodeParser](../../api/classes/SimpleNodeParser.md)
+- [SentenceSplitter](../../api/classes/SentenceSplitter.md)
 - [MetadataExtractor](../documents_and_nodes/metadata_extraction.md)
 - [Embeddings](../embeddings/index.md)
 
@@ -13,10 +13,10 @@ Currently, the following components are Transformation objects:
 While transformations are best used with with an IngestionPipeline, they can also be used directly.
 
 ```ts
-import { SimpleNodeParser, TitleExtractor, Document } from "llamaindex";
+import { SentenceSplitter, TitleExtractor, Document } from "llamaindex";
 
 async function main() {
-  let nodes = new SimpleNodeParser().getNodesFromDocuments([
+  let nodes = new SentenceSplitter().getNodesFromDocuments([
     new Document({ text: "I am 10 years old. John is 20 years old." }),
   ]);
 
@@ -34,15 +34,15 @@ main().catch(console.error);
 
 ## Custom Transformations
 
-You can implement any transformation yourself by implementing the `TransformerComponent`.
+You can implement any transformation yourself by implementing the `TransformComponent`.
 
-The following custom transformation will remove any special characters or punctutaion in text.
+The following custom transformation will remove any special characters or punctuation in text.
 
 ```ts
-import { TransformerComponent, Node } from "llamaindex";
+import { TransformComponent, TextNode } from "llamaindex";
 
-class RemoveSpecialCharacters extends TransformerComponent {
-  async transform(nodes: Node[]): Promise<Node[]> {
+export class RemoveSpecialCharacters extends TransformComponent {
+  async transform(nodes: TextNode[]): Promise<TextNode[]> {
     for (const node of nodes) {
       node.text = node.text.replace(/[^\w\s]/gi, "");
     }
diff --git a/apps/docs/docs/modules/node_parser.md b/apps/docs/docs/modules/node_parser.md
index 68a024903efc33b72e0eac5308f15ede8b9216d3..6efaca84809f63142a38d7f006b261abe9b81934 100644
--- a/apps/docs/docs/modules/node_parser.md
+++ b/apps/docs/docs/modules/node_parser.md
@@ -7,9 +7,9 @@ sidebar_position: 4
 The `NodeParser` in LlamaIndex is responsible for splitting `Document` objects into more manageable `Node` objects. When you call `.fromDocuments()`, the `NodeParser` from the `Settings` is used to do this automatically for you. Alternatively, you can use it to split documents ahead of time.
 
 ```typescript
-import { Document, SimpleNodeParser } from "llamaindex";
+import { Document, SentenceSplitter } from "llamaindex";
 
-const nodeParser = new SimpleNodeParser();
+const nodeParser = new SentenceSplitter();
 
 Settings.nodeParser = nodeParser;
 ```
@@ -93,6 +93,5 @@ The output metadata will be something like:
 
 ## API Reference
 
-- [SimpleNodeParser](../api/classes/SimpleNodeParser.md)
 - [SentenceSplitter](../api/classes/SentenceSplitter.md)
 - [MarkdownNodeParser](../api/classes/MarkdownNodeParser.md)
diff --git a/apps/docs/docs/modules/query_engines/router_query_engine.md b/apps/docs/docs/modules/query_engines/router_query_engine.md
index be7c46e67378e25b17e1e62897a92793f285d64b..1bc78272db7a0157bbde06a54509a04cf528df17 100644
--- a/apps/docs/docs/modules/query_engines/router_query_engine.md
+++ b/apps/docs/docs/modules/query_engines/router_query_engine.md
@@ -15,7 +15,7 @@ import {
   OpenAI,
   RouterQueryEngine,
   SimpleDirectoryReader,
-  SimpleNodeParser,
+  SentenceSplitter,
   SummaryIndex,
   VectorStoreIndex,
   Settings,
@@ -34,11 +34,11 @@ const documents = await new SimpleDirectoryReader().loadData({
 
 ## Service Context
 
-Next, we need to define some basic rules and parse the documents into nodes. We will use the `SimpleNodeParser` to parse the documents into nodes and `Settings` to define the rules (eg. LLM API key, chunk size, etc.):
+Next, we need to define some basic rules and parse the documents into nodes. We will use the `SentenceSplitter` to parse the documents into nodes and `Settings` to define the rules (e.g. LLM API key, chunk size, etc.):
 
 ```ts
 Settings.llm = new OpenAI();
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 1024,
 });
 ```
@@ -104,14 +104,14 @@ import {
   OpenAI,
   RouterQueryEngine,
   SimpleDirectoryReader,
-  SimpleNodeParser,
+  SentenceSplitter,
   SummaryIndex,
   VectorStoreIndex,
   Settings,
 } from "llamaindex";
 
 Settings.llm = new OpenAI();
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 1024,
 });
 
diff --git a/examples/agent/multi_document_agent.ts b/examples/agent/multi_document_agent.ts
index 8a7861a15efdda0d2eab87a459cf62717cfa3c37..f5aa159e454561505ebdbd0aaeb0fe7cdceb89cc 100644
--- a/examples/agent/multi_document_agent.ts
+++ b/examples/agent/multi_document_agent.ts
@@ -6,8 +6,8 @@ import {
   OpenAI,
   OpenAIAgent,
   QueryEngineTool,
+  SentenceSplitter,
   Settings,
-  SimpleNodeParser,
   SimpleToolNodeMapping,
   SummaryIndex,
   VectorStoreIndex,
@@ -43,7 +43,7 @@ async function main() {
   for (const title of wikiTitles) {
     console.log(`Processing ${title}`);
 
-    const nodes = new SimpleNodeParser({
+    const nodes = new SentenceSplitter({
       chunkSize: 200,
       chunkOverlap: 20,
     }).getNodesFromDocuments([countryDocs[title]]);
diff --git a/examples/extractors/keywordExtractor.ts b/examples/extractors/keywordExtractor.ts
index 8669c00048b7ba1b2a7a3bb8415a912427ee6ef6..78095cefb5db8f2bbec1c2f28f73bc773090af78 100644
--- a/examples/extractors/keywordExtractor.ts
+++ b/examples/extractors/keywordExtractor.ts
@@ -2,13 +2,13 @@ import {
   Document,
   KeywordExtractor,
   OpenAI,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({ text: "banana apple orange pear peach watermelon" }),
diff --git a/examples/extractors/questionsAnsweredExtractor.ts b/examples/extractors/questionsAnsweredExtractor.ts
index 30c55d98d0e6f27b9210f5b1485e0d1173656a5c..d9a4f8a1b95ab12091e71309e2d8196f432852c0 100644
--- a/examples/extractors/questionsAnsweredExtractor.ts
+++ b/examples/extractors/questionsAnsweredExtractor.ts
@@ -2,13 +2,13 @@ import {
   Document,
   OpenAI,
   QuestionsAnsweredExtractor,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({
diff --git a/examples/extractors/summaryExtractor.ts b/examples/extractors/summaryExtractor.ts
index 1bb2c903af27baea7848a3e0f81502f9bd50d863..f7e38c1f4e655c2616a3ce7594c135b878a4927e 100644
--- a/examples/extractors/summaryExtractor.ts
+++ b/examples/extractors/summaryExtractor.ts
@@ -1,14 +1,14 @@
 import {
   Document,
   OpenAI,
-  SimpleNodeParser,
+  SentenceSplitter,
   SummaryExtractor,
 } from "llamaindex";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({
diff --git a/examples/extractors/titleExtractor.ts b/examples/extractors/titleExtractor.ts
index 0843e92087248d4e6d9cabd649c4cc88585ee064..cdd22d9303a9611613f1b30f4dc2fdcfd512905c 100644
--- a/examples/extractors/titleExtractor.ts
+++ b/examples/extractors/titleExtractor.ts
@@ -1,11 +1,11 @@
-import { Document, OpenAI, SimpleNodeParser, TitleExtractor } from "llamaindex";
+import { Document, OpenAI, SentenceSplitter, TitleExtractor } from "llamaindex";
 
 import essay from "../essay";
 
 (async () => {
   const openaiLLM = new OpenAI({ model: "gpt-3.5-turbo-0125", temperature: 0 });
 
-  const nodeParser = new SimpleNodeParser({});
+  const nodeParser = new SentenceSplitter({});
 
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({
diff --git a/examples/jupyter/nodeparser.ipynb b/examples/jupyter/nodeparser.ipynb
index 14df45b97a9bb3f192a6a6f7a6f183b91b801603..ffb48331acbf8f1b9c11e27d364f25ccb0eea9b2 100644
--- a/examples/jupyter/nodeparser.ipynb
+++ b/examples/jupyter/nodeparser.ipynb
@@ -7,10 +7,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import {\n",
-    "  Document,\n",
-    "  SimpleNodeParser\n",
-    "} from \"npm:llamaindex\";"
+    "import { Document, SentenceSplitter } from \"npm:llamaindex\";"
    ]
   },
   {
@@ -45,7 +42,7 @@
     }
    ],
    "source": [
-    "const nodeParser = new SimpleNodeParser();\n",
+    "const nodeParser = new SentenceSplitter();\n",
     "const nodes = nodeParser.getNodesFromDocuments([\n",
     "  new Document({ text: \"I am 10 years old. John is 20 years old.\" }),\n",
     "]);\n",
diff --git a/examples/lowlevel.ts b/examples/lowlevel.ts
index 5db9b71dcb42d26130f2f46344303d6ebff58faa..5f96692f3a3952cda8ad3c98755b7dcea7eb8f38 100644
--- a/examples/lowlevel.ts
+++ b/examples/lowlevel.ts
@@ -2,12 +2,12 @@ import {
   Document,
   NodeWithScore,
   ResponseSynthesizer,
-  SimpleNodeParser,
+  SentenceSplitter,
   TextNode,
 } from "llamaindex";
 
 (async () => {
-  const nodeParser = new SimpleNodeParser();
+  const nodeParser = new SentenceSplitter();
   const nodes = nodeParser.getNodesFromDocuments([
     new Document({ text: "I am 10 years old. John is 20 years old." }),
   ]);
diff --git a/examples/pipeline/ingestion.ts b/examples/pipeline/ingestion.ts
index 341bfea778f6521a3e481a8eb23365167f7d4400..d3a851843207f4f833407c46c6b6910dcbcfbd3f 100644
--- a/examples/pipeline/ingestion.ts
+++ b/examples/pipeline/ingestion.ts
@@ -5,7 +5,7 @@ import {
   IngestionPipeline,
   MetadataMode,
   OpenAIEmbedding,
-  SimpleNodeParser,
+  SentenceSplitter,
 } from "llamaindex";
 
 async function main() {
@@ -18,7 +18,7 @@ async function main() {
   const document = new Document({ text: essay, id_: path });
   const pipeline = new IngestionPipeline({
     transformations: [
-      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 }),
       new OpenAIEmbedding(),
     ],
   });
diff --git a/examples/routerQueryEngine.ts b/examples/routerQueryEngine.ts
index 5bd169008905ea35f173821b314ccddbe8dfd258..b1081d5c3d9e7ebd540c26545f0e33ed7375bce2 100644
--- a/examples/routerQueryEngine.ts
+++ b/examples/routerQueryEngine.ts
@@ -1,9 +1,9 @@
 import {
   OpenAI,
   RouterQueryEngine,
+  SentenceSplitter,
   Settings,
   SimpleDirectoryReader,
-  SimpleNodeParser,
   SummaryIndex,
   VectorStoreIndex,
 } from "llamaindex";
@@ -12,7 +12,7 @@ import {
 Settings.llm = new OpenAI();
 
 // Update node parser
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 1024,
 });
 
diff --git a/examples/summaryIndex.ts b/examples/summaryIndex.ts
index 31710d41cd5ae3253faff0e1a213f8ba4644864d..0813cf760830f13fa5d82567468f20c48bada00c 100644
--- a/examples/summaryIndex.ts
+++ b/examples/summaryIndex.ts
@@ -1,7 +1,7 @@
 import {
   Document,
+  SentenceSplitter,
   Settings,
-  SimpleNodeParser,
   SummaryIndex,
   SummaryRetrieverMode,
 } from "llamaindex";
@@ -9,7 +9,7 @@ import {
 import essay from "./essay";
 
 // Update node parser
-Settings.nodeParser = new SimpleNodeParser({
+Settings.nodeParser = new SentenceSplitter({
   chunkSize: 40,
 });
 
diff --git a/packages/llamaindex/tests/MetadataExtractors.test.ts b/packages/llamaindex/tests/MetadataExtractors.test.ts
index 13ef651327d0a2b6ae06b5088dd4750cf69d3847..76eb487cae11dc3009c712cf1d1d826ac153100c 100644
--- a/packages/llamaindex/tests/MetadataExtractors.test.ts
+++ b/packages/llamaindex/tests/MetadataExtractors.test.ts
@@ -10,7 +10,7 @@ import {
   TitleExtractor,
 } from "llamaindex/extractors/index";
 import { OpenAI } from "llamaindex/llm/openai";
-import { SimpleNodeParser } from "llamaindex/nodeParsers/index";
+import { SentenceSplitter } from "llamaindex/nodeParsers/index";
 import { afterAll, beforeAll, describe, expect, test, vi } from "vitest";
 import {
   DEFAULT_LLM_TEXT_OUTPUT,
@@ -45,7 +45,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] KeywordExtractor returns excerptKeywords metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
@@ -64,7 +64,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] TitleExtractor returns documentTitle metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
@@ -83,7 +83,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] QuestionsAnsweredExtractor returns questionsThisExcerptCanAnswer metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
@@ -103,7 +103,7 @@ describe("[MetadataExtractor]: Extractors should populate the metadata", () => {
   });
 
   test("[MetadataExtractor] SumamryExtractor returns sectionSummary metadata", async () => {
-    const nodeParser = new SimpleNodeParser();
+    const nodeParser = new SentenceSplitter();
 
     const nodes = nodeParser.getNodesFromDocuments([
       new Document({ text: DEFAULT_LLM_TEXT_OUTPUT }),
diff --git a/packages/llamaindex/tests/ingestion/IngestionCache.test.ts b/packages/llamaindex/tests/ingestion/IngestionCache.test.ts
index 6d6f63db38c15aedfca3b8d2cd90b0e9c2c9e7f8..e4614db58c3fee96340484a322319a6c981a9ace 100644
--- a/packages/llamaindex/tests/ingestion/IngestionCache.test.ts
+++ b/packages/llamaindex/tests/ingestion/IngestionCache.test.ts
@@ -5,7 +5,7 @@ import {
   IngestionCache,
   getTransformationHash,
 } from "llamaindex/ingestion/IngestionCache";
-import { SimpleNodeParser } from "llamaindex/nodeParsers/index";
+import { SentenceSplitter } from "llamaindex/nodeParsers/index";
 import { beforeAll, describe, expect, test } from "vitest";
 
 describe("IngestionCache", () => {
@@ -32,7 +32,7 @@ describe("getTransformationHash", () => {
   beforeAll(() => {
     nodes = [new TextNode({ text: "some text", id_: "some id" })];
 
-    transform = new SimpleNodeParser({
+    transform = new SentenceSplitter({
       chunkOverlap: 10,
       chunkSize: 1024,
     });
@@ -66,7 +66,7 @@ describe("getTransformationHash", () => {
     const result1 = getTransformationHash(nodes, transform);
     const result2 = getTransformationHash(
       nodes,
-      new SimpleNodeParser({
+      new SentenceSplitter({
        chunkOverlap: 10,
         chunkSize: 512,
       }),
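
For a quick migration reference, a minimal sketch of the renamed parser in use. It is assembled only from calls and options that appear in the hunks above (`SentenceSplitter`, `getNodesFromDocuments`, `chunkSize`, `chunkOverlap`); the sample text and chunk sizes are illustrative:

```ts
// Sketch of post-rename usage: SentenceSplitter takes the place of
// SimpleNodeParser and accepts the same { chunkSize, chunkOverlap } options.
import { Document, SentenceSplitter } from "llamaindex";

const nodeParser = new SentenceSplitter({ chunkSize: 1024, chunkOverlap: 20 });

// Split a document into text nodes, exactly as SimpleNodeParser did.
const nodes = nodeParser.getNodesFromDocuments([
  new Document({ text: "I am 10 years old. John is 20 years old." }),
]);

for (const node of nodes) {
  console.log(node.text);
}
```

Since the patch touches no call sites beyond the constructor name, migrating user code amounts to replacing `SimpleNodeParser` with `SentenceSplitter` wherever it appears, including `Settings.nodeParser` assignments and `IngestionPipeline` transformation lists.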