Skip to content
Snippets Groups Projects
Unverified Commit 3cbfa98e authored by Emanuel Ferreira's avatar Emanuel Ferreira Committed by GitHub
Browse files

feat: LlamaCloudIndex from documents (#677)

parent d256cbe0
Branches
Tags
No related merge requests found
---
"llamaindex": patch
---
feat: llamacloud index from documents
import fs from "node:fs/promises";
import { stdin as input, stdout as output } from "node:process";
import readline from "node:readline/promises";
import { Document, LlamaCloudIndex } from "llamaindex";
async function main() {
const path = "node_modules/llamaindex/examples/abramov.txt";
const essay = await fs.readFile(path, "utf-8");
// Create Document object with essay
const document = new Document({ text: essay, id_: path });
const index = await LlamaCloudIndex.fromDocuments({
documents: [document],
name: "test",
projectName: "default",
apiKey: process.env.LLAMA_CLOUD_API_KEY,
baseUrl: process.env.LLAMA_CLOUD_BASE_URL,
});
const queryEngine = index.asQueryEngine({
denseSimilarityTopK: 5,
});
const rl = readline.createInterface({ input, output });
while (true) {
const query = await rl.question("Query: ");
const stream = await queryEngine.query({
query,
stream: true,
});
console.log();
for await (const chunk of stream) {
process.stdout.write(chunk.response);
}
}
}
main().catch(console.error);
......@@ -9,7 +9,7 @@
"@aws-crypto/sha256-js": "^5.2.0",
"@datastax/astra-db-ts": "^0.1.4",
"@grpc/grpc-js": "^1.10.2",
"@llamaindex/cloud": "0.0.4",
"@llamaindex/cloud": "0.0.5",
"@llamaindex/env": "workspace:*",
"@mistralai/mistralai": "^0.0.10",
"@notionhq/client": "^2.2.14",
......
......@@ -66,8 +66,9 @@ export const defaultParagraphSeparator = EOL + EOL + EOL;
* One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together.
*/
export class SentenceSplitter {
private chunkSize: number;
private chunkOverlap: number;
public chunkSize: number;
public chunkOverlap: number;
private tokenizer: any;
private tokenizerDecoder: any;
private paragraphSeparator: string;
......
import { PlatformApi } from "@llamaindex/cloud";
import type { Document } from "../Node.js";
import type { BaseRetriever } from "../Retriever.js";
import { RetrieverQueryEngine } from "../engines/query/RetrieverQueryEngine.js";
import type { TransformComponent } from "../ingestion/types.js";
import type { BaseNodePostprocessor } from "../postprocessors/types.js";
import type { BaseSynthesizer } from "../synthesizers/types.js";
import type { BaseQueryEngine } from "../types.js";
import type { CloudRetrieveParams } from "./LlamaCloudRetriever.js";
import { LlamaCloudRetriever } from "./LlamaCloudRetriever.js";
import { getPipelineCreate } from "./config.js";
import type { CloudConstructorParams } from "./types.js";
import { getAppBaseUrl, getClient } from "./utils.js";
import { getEnv } from "@llamaindex/env";
import { OpenAIEmbedding } from "../embeddings/OpenAIEmbedding.js";
import { SimpleNodeParser } from "../nodeParsers/SimpleNodeParser.js";
export class LlamaCloudIndex {
params: CloudConstructorParams;
......@@ -14,6 +23,151 @@ export class LlamaCloudIndex {
this.params = params;
}
static async fromDocuments(
params: {
documents: Document[];
transformations?: TransformComponent[];
verbose?: boolean;
} & CloudConstructorParams,
): Promise<LlamaCloudIndex> {
const defaultTransformations: TransformComponent[] = [
new OpenAIEmbedding({
apiKey: getEnv("OPENAI_API_KEY"),
}),
new SimpleNodeParser(),
];
const appUrl = getAppBaseUrl(params.baseUrl);
const client = await getClient({ ...params, baseUrl: appUrl });
const pipelineCreateParams = await getPipelineCreate({
pipelineName: params.name,
pipelineType: "MANAGED",
inputNodes: params.documents,
transformations: params.transformations ?? defaultTransformations,
});
const project = await client.project.upsertProject({
name: params.projectName ?? "default",
});
if (!project.id) {
throw new Error("Project ID should be defined");
}
const pipeline = await client.project.upsertPipelineForProject(
project.id,
pipelineCreateParams,
);
if (!pipeline.id) {
throw new Error("Pipeline ID must be defined");
}
if (params.verbose) {
console.log(`Created pipeline ${pipeline.id} with name ${params.name}`);
}
const executionsIds: {
exectionId: string;
dataSourceId: string;
}[] = [];
for (const dataSource of pipeline.dataSources) {
const dataSourceExection =
await client.dataSource.createDataSourceExecution(dataSource.id);
if (!dataSourceExection.id) {
throw new Error("Data Source Execution ID must be defined");
}
executionsIds.push({
exectionId: dataSourceExection.id,
dataSourceId: dataSource.id,
});
}
let isDone = false;
while (!isDone) {
const statuses = [];
for await (const execution of executionsIds) {
const dataSourceExecution =
await client.dataSource.getDataSourceExecution(
execution.dataSourceId,
execution.exectionId,
);
statuses.push(dataSourceExecution.status);
if (
statuses.every((status) => status === PlatformApi.StatusEnum.Success)
) {
isDone = true;
if (params.verbose) {
console.info("Data Source Execution completed");
}
break;
} else if (
statuses.some((status) => status === PlatformApi.StatusEnum.Error)
) {
throw new Error("Data Source Execution failed");
} else {
await new Promise((resolve) => setTimeout(resolve, 1000));
if (params.verbose) {
process.stdout.write(".");
}
}
}
}
isDone = false;
const execution = await client.pipeline.runManagedPipelineIngestion(
pipeline.id,
);
const ingestionId = execution.id;
if (!ingestionId) {
throw new Error("Ingestion ID must be defined");
}
while (!isDone) {
const pipelineStatus = await client.pipeline.getManagedIngestionExecution(
pipeline.id,
ingestionId,
);
if (pipelineStatus.status === PlatformApi.StatusEnum.Success) {
isDone = true;
if (params.verbose) {
console.info("Ingestion completed");
}
break;
} else if (pipelineStatus.status === PlatformApi.StatusEnum.Error) {
throw new Error("Ingestion failed");
} else {
await new Promise((resolve) => setTimeout(resolve, 1000));
if (params.verbose) {
process.stdout.write(".");
}
}
}
if (params.verbose) {
console.info(
`Ingestion completed, find your index at ${appUrl}/project/${project.id}/deploy/${pipeline.id}`,
);
}
return new LlamaCloudIndex({ ...params });
}
asRetriever(params: CloudRetrieveParams = {}): BaseRetriever {
return new LlamaCloudRetriever({ ...this.params, ...params });
}
......
......@@ -18,11 +18,11 @@ function getTransformationConfig(
return {
configurableTransformationType: "SENTENCE_AWARE_NODE_PARSER",
component: {
// TODO: API returns 422 if these parameters are included
// chunkSize: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter
// chunkOverlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter
// includeMetadata: transformation.includeMetadata,
// includePrevNextRel: transformation.includePrevNextRel,
// TODO: API doesn't accept camelCase
chunk_size: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter
chunk_overlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter
include_metadata: transformation.includeMetadata,
include_prev_next_rel: transformation.includePrevNextRel,
},
};
}
......@@ -30,9 +30,10 @@ function getTransformationConfig(
return {
configurableTransformationType: "OPENAI_EMBEDDING",
component: {
modelName: transformation.model,
apiKey: transformation.apiKey,
embedBatchSize: transformation.embedBatchSize,
// TODO: API doesn't accept camelCase
model: transformation.model,
api_key: transformation.apiKey,
embed_batch_size: transformation.embedBatchSize,
dimensions: transformation.dimensions,
},
};
......@@ -71,10 +72,12 @@ export async function getPipelineCreate(
inputNodes = [],
} = params;
const dataSources = inputNodes.map(getDataSourceConfig);
return {
name: pipelineName,
configuredTransformations: transformations.map(getTransformationConfig),
dataSources: inputNodes.map(getDataSourceConfig),
dataSources,
dataSinks: [],
pipelineType,
};
......
......@@ -8,7 +8,7 @@
"@aws-crypto/sha256-js": "^5.2.0",
"@datastax/astra-db-ts": "^0.1.4",
"@grpc/grpc-js": "^1.10.2",
"@llamaindex/cloud": "0.0.4",
"@llamaindex/cloud": "0.0.5",
"@llamaindex/env": "workspace:*",
"@mistralai/mistralai": "^0.0.10",
"@notionhq/client": "^2.2.14",
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment