From f90f7fee641b36aa7b5a61d88d87e508921b1849 Mon Sep 17 00:00:00 2001
From: Emanuel Ferreira <contatoferreirads@gmail.com>
Date: Sat, 27 Jan 2024 11:53:52 -0300
Subject: [PATCH] docs: ingestion pipeline, transformations (#464)

---
 apps/docs/docs/modules/data_index.md          |  2 +-
 apps/docs/docs/modules/data_loader.md         |  2 +-
 apps/docs/docs/modules/embedding.md           |  2 +-
 .../modules/ingestion_pipeline/_category_.yml |  2 +
 .../docs/modules/ingestion_pipeline/index.md  | 99 +++++++++++++++++++
 .../ingestion_pipeline/transformations.md     | 77 +++++++++++++++
 apps/docs/docs/modules/llm.md                 |  2 +-
 apps/docs/docs/modules/query_engine.md        |  2 +-
 8 files changed, 183 insertions(+), 5 deletions(-)
 create mode 100644 apps/docs/docs/modules/ingestion_pipeline/_category_.yml
 create mode 100644 apps/docs/docs/modules/ingestion_pipeline/index.md
 create mode 100644 apps/docs/docs/modules/ingestion_pipeline/transformations.md

diff --git a/apps/docs/docs/modules/data_index.md b/apps/docs/docs/modules/data_index.md
index 2855e4e40..ad5a4dcd8 100644
--- a/apps/docs/docs/modules/data_index.md
+++ b/apps/docs/docs/modules/data_index.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 3
+sidebar_position: 4
 ---
 
 # Index
diff --git a/apps/docs/docs/modules/data_loader.md b/apps/docs/docs/modules/data_loader.md
index f1b1aa97a..1e3043b60 100644
--- a/apps/docs/docs/modules/data_loader.md
+++ b/apps/docs/docs/modules/data_loader.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 2
+sidebar_position: 3
 ---
 
 # Reader / Loader
diff --git a/apps/docs/docs/modules/embedding.md b/apps/docs/docs/modules/embedding.md
index bf8e0bacc..b3ca243f9 100644
--- a/apps/docs/docs/modules/embedding.md
+++ b/apps/docs/docs/modules/embedding.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 2
+sidebar_position: 3
 ---
 
 # Embedding
diff --git a/apps/docs/docs/modules/ingestion_pipeline/_category_.yml b/apps/docs/docs/modules/ingestion_pipeline/_category_.yml
new file mode 100644
index 000000000..1152ca723
--- /dev/null
+++ b/apps/docs/docs/modules/ingestion_pipeline/_category_.yml
@@ -0,0 +1,2 @@
+label: "Ingestion Pipeline"
+position: 2
diff --git a/apps/docs/docs/modules/ingestion_pipeline/index.md b/apps/docs/docs/modules/ingestion_pipeline/index.md
new file mode 100644
index 000000000..4de6ee5fb
--- /dev/null
+++ b/apps/docs/docs/modules/ingestion_pipeline/index.md
@@ -0,0 +1,99 @@
+# Ingestion Pipeline
+
+An `IngestionPipeline` uses a concept of `Transformations` that are applied to input data.
+These `Transformations` are applied to your input data, and the resulting nodes are either returned or inserted into a vector database (if given).
+
+## Usage Pattern
+
+The simplest usage is to instantiate an IngestionPipeline like so:
+
+```ts
+import fs from "node:fs/promises";
+
+import {
+  Document,
+  IngestionPipeline,
+  MetadataMode,
+  OpenAIEmbedding,
+  TitleExtractor,
+  SimpleNodeParser,
+} from "llamaindex";
+
+async function main() {
+  // Load essay from abramov.txt in Node
+  const path = "node_modules/llamaindex/examples/abramov.txt";
+
+  const essay = await fs.readFile(path, "utf-8");
+
+  // Create Document object with essay
+  const document = new Document({ text: essay, id_: path });
+  const pipeline = new IngestionPipeline({
+    transformations: [
+      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new TitleExtractor(),
+      new OpenAIEmbedding(),
+    ],
+  });
+
+  // run the pipeline
+  const nodes = await pipeline.run({ documents: [document] });
+
+  // print out the result of the pipeline run
+  for (const node of nodes) {
+    console.log(node.getContent(MetadataMode.NONE));
+  }
+}
+
+main().catch(console.error);
+```
+
+## Connecting to Vector Databases
+
+When running an ingestion pipeline, you can also choose to automatically insert the resulting nodes into a remote vector store.
+
+Then, you can construct an index from that vector store later on.
+
+```ts
+import fs from "node:fs/promises";
+
+import {
+  Document,
+  IngestionPipeline,
+  MetadataMode,
+  OpenAIEmbedding,
+  TitleExtractor,
+  SimpleNodeParser,
+  QdrantVectorStore,
+  VectorStoreIndex,
+} from "llamaindex";
+
+async function main() {
+  // Load essay from abramov.txt in Node
+  const path = "node_modules/llamaindex/examples/abramov.txt";
+
+  const essay = await fs.readFile(path, "utf-8");
+
+  const vectorStore = new QdrantVectorStore({
+    host: "http://localhost:6333",
+  });
+
+  // Create Document object with essay
+  const document = new Document({ text: essay, id_: path });
+  const pipeline = new IngestionPipeline({
+    transformations: [
+      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
+      new TitleExtractor(),
+      new OpenAIEmbedding(),
+    ],
+    vectorStore,
+  });
+
+  // run the pipeline
+  const nodes = await pipeline.run({ documents: [document] });
+
+  // create an index
+  const index = VectorStoreIndex.fromVectorStore(vectorStore);
+}
+
+main().catch(console.error);
+```
diff --git a/apps/docs/docs/modules/ingestion_pipeline/transformations.md b/apps/docs/docs/modules/ingestion_pipeline/transformations.md
new file mode 100644
index 000000000..3d947628e
--- /dev/null
+++ b/apps/docs/docs/modules/ingestion_pipeline/transformations.md
@@ -0,0 +1,77 @@
+# Transformations
+
+A transformation is something that takes a list of nodes as an input, and returns a list of nodes. Each component that implements the Transformation class has a `transform` method responsible for transforming the nodes.
+
+Currently, the following components are Transformation objects:
+
+- [SimpleNodeParser](../../api/classes/SimpleNodeParser.md)
+- [MetadataExtractor](../documents_and_nodes/metadata_extraction.md)
+- Embeddings
+
+## Usage Pattern
+
+While transformations are best used with an IngestionPipeline, they can also be used directly.
+
+```ts
+import { SimpleNodeParser, TitleExtractor, Document, MetadataMode } from "llamaindex";
+
+async function main() {
+  let nodes = new SimpleNodeParser().getNodesFromDocuments([
+    new Document({ text: "I am 10 years old. John is 20 years old." }),
+  ]);
+
+  const titleExtractor = new TitleExtractor();
+
+  nodes = await titleExtractor.transform(nodes);
+
+  for (const node of nodes) {
+    console.log(node.getContent(MetadataMode.NONE));
+  }
+}
+
+main().catch(console.error);
+```
+
+## Custom Transformations
+
+You can implement any transformation yourself by extending the `TransformerComponent` class.
+
+The following custom transformation will remove any special characters or punctuation in text.
+
+```ts
+import { TransformerComponent, Node } from "llamaindex";
+
+class RemoveSpecialCharacters extends TransformerComponent {
+  async transform(nodes: Node[]): Promise<Node[]> {
+    for (const node of nodes) {
+      node.text = node.text.replace(/[^\w\s]/gi, "");
+    }
+
+    return nodes;
+  }
+}
+```
+
+These can then be used directly or in any IngestionPipeline.
+
+```ts
+import { IngestionPipeline, Document, MetadataMode } from "llamaindex";
+
+async function main() {
+  const pipeline = new IngestionPipeline({
+    transformations: [new RemoveSpecialCharacters()],
+  });
+
+  const nodes = await pipeline.run({
+    documents: [
+      new Document({ text: "I am 10 years old. John is 20 years old." }),
+    ],
+  });
+
+  for (const node of nodes) {
+    console.log(node.getContent(MetadataMode.NONE));
+  }
+}
+
+main().catch(console.error);
+```
diff --git a/apps/docs/docs/modules/llm.md b/apps/docs/docs/modules/llm.md
index 7f69f13db..92bae09ca 100644
--- a/apps/docs/docs/modules/llm.md
+++ b/apps/docs/docs/modules/llm.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 2
+sidebar_position: 3
 ---
 
 # LLM
diff --git a/apps/docs/docs/modules/query_engine.md b/apps/docs/docs/modules/query_engine.md
index 853641a17..65cc742f0 100644
--- a/apps/docs/docs/modules/query_engine.md
+++ b/apps/docs/docs/modules/query_engine.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 3
+sidebar_position: 4
 ---
 
 # QueryEngine
-- 
GitLab