From f90f7fee641b36aa7b5a61d88d87e508921b1849 Mon Sep 17 00:00:00 2001 From: Emanuel Ferreira <contatoferreirads@gmail.com> Date: Sat, 27 Jan 2024 11:53:52 -0300 Subject: [PATCH] docs: ingestion pipeline, transformations (#464) --- apps/docs/docs/modules/data_index.md | 2 +- apps/docs/docs/modules/data_loader.md | 2 +- apps/docs/docs/modules/embedding.md | 2 +- .../modules/ingestion_pipeline/_category_.yml | 2 + .../docs/modules/ingestion_pipeline/index.md | 99 +++++++++++++++++++ .../ingestion_pipeline/transformations.md | 77 +++++++++++++++ apps/docs/docs/modules/llm.md | 2 +- apps/docs/docs/modules/query_engine.md | 2 +- 8 files changed, 183 insertions(+), 5 deletions(-) create mode 100644 apps/docs/docs/modules/ingestion_pipeline/_category_.yml create mode 100644 apps/docs/docs/modules/ingestion_pipeline/index.md create mode 100644 apps/docs/docs/modules/ingestion_pipeline/transformations.md diff --git a/apps/docs/docs/modules/data_index.md b/apps/docs/docs/modules/data_index.md index 2855e4e40..ad5a4dcd8 100644 --- a/apps/docs/docs/modules/data_index.md +++ b/apps/docs/docs/modules/data_index.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # Index diff --git a/apps/docs/docs/modules/data_loader.md b/apps/docs/docs/modules/data_loader.md index f1b1aa97a..1e3043b60 100644 --- a/apps/docs/docs/modules/data_loader.md +++ b/apps/docs/docs/modules/data_loader.md @@ -1,5 +1,5 @@ --- -sidebar_position: 2 +sidebar_position: 3 --- # Reader / Loader diff --git a/apps/docs/docs/modules/embedding.md b/apps/docs/docs/modules/embedding.md index bf8e0bacc..b3ca243f9 100644 --- a/apps/docs/docs/modules/embedding.md +++ b/apps/docs/docs/modules/embedding.md @@ -1,5 +1,5 @@ --- -sidebar_position: 2 +sidebar_position: 3 --- # Embedding diff --git a/apps/docs/docs/modules/ingestion_pipeline/_category_.yml b/apps/docs/docs/modules/ingestion_pipeline/_category_.yml new file mode 100644 index 000000000..1152ca723 --- /dev/null +++ 
b/apps/docs/docs/modules/ingestion_pipeline/_category_.yml @@ -0,0 +1,2 @@ +label: "Ingestion Pipeline" +position: 2 diff --git a/apps/docs/docs/modules/ingestion_pipeline/index.md b/apps/docs/docs/modules/ingestion_pipeline/index.md new file mode 100644 index 000000000..4de6ee5fb --- /dev/null +++ b/apps/docs/docs/modules/ingestion_pipeline/index.md @@ -0,0 +1,99 @@ +# Ingestion Pipeline + +An `IngestionPipeline` uses a concept of `Transformations` that are applied to input data. +These `Transformations` are applied to your input data, and the resulting nodes are either returned or inserted into a vector database (if given). + +## Usage Pattern + +The simplest usage is to instantiate an IngestionPipeline like so: + +```ts +import fs from "node:fs/promises"; + +import { + Document, + IngestionPipeline, + MetadataMode, + OpenAIEmbedding, + TitleExtractor, + SimpleNodeParser, +} from "llamaindex"; + +async function main() { + // Load essay from abramov.txt in Node + const path = "node_modules/llamaindex/examples/abramov.txt"; + +  const essay = await fs.readFile(path, "utf-8"); + + // Create Document object with essay + const document = new Document({ text: essay, id_: path }); + const pipeline = new IngestionPipeline({ + transformations: [ + new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }), + new TitleExtractor(), + new OpenAIEmbedding(), + ], + }); + + // run the pipeline + const nodes = await pipeline.run({ documents: [document] }); + + // print out the result of the pipeline run + for (const node of nodes) { + console.log(node.getContent(MetadataMode.NONE)); + } +} + +main().catch(console.error); +``` + +## Connecting to Vector Databases + +When running an ingestion pipeline, you can also choose to automatically insert the resulting nodes into a remote vector store. + +Then, you can construct an index from that vector store later on. 
+ +```ts +import fs from "node:fs/promises"; + +import { + Document, + IngestionPipeline, + MetadataMode, + OpenAIEmbedding, + TitleExtractor, + SimpleNodeParser, + QdrantVectorStore, + VectorStoreIndex, +} from "llamaindex"; + +async function main() { + // Load essay from abramov.txt in Node + const path = "node_modules/llamaindex/examples/abramov.txt"; + + const essay = await fs.readFile(path, "utf-8"); + + const vectorStore = new QdrantVectorStore({ + host: "http://localhost:6333", + }); + + // Create Document object with essay + const document = new Document({ text: essay, id_: path }); + const pipeline = new IngestionPipeline({ + transformations: [ + new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }), + new TitleExtractor(), + new OpenAIEmbedding(), + ], + vectorStore, + }); + + // run the pipeline + const nodes = await pipeline.run({ documents: [document] }); + + // create an index + const index = VectorStoreIndex.fromVectorStore(vectorStore); +} + +main().catch(console.error); +``` diff --git a/apps/docs/docs/modules/ingestion_pipeline/transformations.md b/apps/docs/docs/modules/ingestion_pipeline/transformations.md new file mode 100644 index 000000000..3d947628e --- /dev/null +++ b/apps/docs/docs/modules/ingestion_pipeline/transformations.md @@ -0,0 +1,77 @@ +# Transformations + +A transformation is something that takes a list of nodes as an input, and returns a list of nodes. Each component that implements the Transformation class has a `transform` definition responsible for transforming the nodes. + +Currently, the following components are Transformation objects: + +- [SimpleNodeParser](../../api/classes/SimpleNodeParser.md) +- [MetadataExtractor](../documents_and_nodes/metadata_extraction.md) +- Embeddings + +## Usage Pattern + +While transformations are best used with an IngestionPipeline, they can also be used directly. 
+ +```ts +import { SimpleNodeParser, TitleExtractor, Document, MetadataMode } from "llamaindex"; + +async function main() { + let nodes = new SimpleNodeParser().getNodesFromDocuments([ + new Document({ text: "I am 10 years old. John is 20 years old." }), + ]); + + const titleExtractor = new TitleExtractor(); + + nodes = await titleExtractor.transform(nodes); + + for (const node of nodes) { + console.log(node.getContent(MetadataMode.NONE)); + } +} + +main().catch(console.error); +``` + +## Custom Transformations + +You can implement any transformation yourself by implementing the `TransformerComponent`. + +The following custom transformation will remove any special characters or punctuation in text. + +```ts +import { TransformerComponent, Node } from "llamaindex"; + +class RemoveSpecialCharacters extends TransformerComponent { + async transform(nodes: Node[]): Promise<Node[]> { + for (const node of nodes) { + node.text = node.text.replace(/[^\w\s]/gi, ""); + } + + return nodes; + } +} +``` + +These can then be used directly or in any IngestionPipeline. + +```ts +import { IngestionPipeline, Document, MetadataMode } from "llamaindex"; + +async function main() { + const pipeline = new IngestionPipeline({ + transformations: [new RemoveSpecialCharacters()], + }); + + const nodes = await pipeline.run({ + documents: [ + new Document({ text: "I am 10 years old. John is 20 years old." 
}), + ], + }); + + for (const node of nodes) { + console.log(node.getContent(MetadataMode.NONE)); + } +} + +main().catch(console.error); +``` diff --git a/apps/docs/docs/modules/llm.md b/apps/docs/docs/modules/llm.md index 7f69f13db..92bae09ca 100644 --- a/apps/docs/docs/modules/llm.md +++ b/apps/docs/docs/modules/llm.md @@ -1,5 +1,5 @@ --- -sidebar_position: 2 +sidebar_position: 3 --- # LLM diff --git a/apps/docs/docs/modules/query_engine.md b/apps/docs/docs/modules/query_engine.md index 853641a17..65cc742f0 100644 --- a/apps/docs/docs/modules/query_engine.md +++ b/apps/docs/docs/modules/query_engine.md @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 4 --- # QueryEngine -- GitLab