From 39aaafcb6cbfe5340813be412b835e2adf8b4239 Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Wed, 20 Mar 2024 11:25:55 +0700
Subject: [PATCH] Add support for llamaparse using Typescript (#11)

---
 helpers/env-variables.ts                      | 30 ++++++++++-----
 helpers/index.ts                              | 31 +++++++--------
 helpers/typescript.ts                         | 38 +++++++++++++++----
 questions.ts                                  |  5 +--
 .../loaders/typescript/file/loader.mjs        |  9 +++++
 .../loaders/typescript/llama_parse/loader.mjs | 19 ++++++++++
 .../vectordbs/typescript/milvus/generate.mjs  | 12 ++----
 .../vectordbs/typescript/milvus/shared.mjs    |  1 -
 .../vectordbs/typescript/mongo/generate.mjs   |  8 ++--
 .../vectordbs/typescript/mongo/shared.mjs     |  1 -
 .../vectordbs/typescript/none/constants.mjs   |  1 -
 .../vectordbs/typescript/none/generate.mjs    | 13 ++-----
 .../vectordbs/typescript/pg/generate.mjs      | 10 ++---
 .../vectordbs/typescript/pg/shared.mjs        |  2 +-
 .../typescript/pinecone/generate.mjs          |  8 ++--
 .../vectordbs/typescript/pinecone/shared.mjs  |  1 -
 16 files changed, 112 insertions(+), 77 deletions(-)
 create mode 100644 templates/components/loaders/typescript/file/loader.mjs
 create mode 100644 templates/components/loaders/typescript/llama_parse/loader.mjs

diff --git a/helpers/env-variables.ts b/helpers/env-variables.ts
index c5e65442..2b084f3a 100644
--- a/helpers/env-variables.ts
+++ b/helpers/env-variables.ts
@@ -99,7 +99,10 @@ const getVectorDBEnvs = (vectorDb: TemplateVectorDB) => {
   }
 };
 
-const getDataSourceEnvs = (dataSource: TemplateDataSource) => {
+const getDataSourceEnvs = (
+  dataSource: TemplateDataSource,
+  llamaCloudKey?: string,
+) => {
   switch (dataSource.type) {
     case "web":
       return [
@@ -116,6 +119,19 @@ const getDataSourceEnvs = (dataSource: TemplateDataSource) => {
           description: "The maximum depth to scrape.",
         },
       ];
+    case "file":
+    case "folder":
+      return [
+        ...((dataSource?.config as FileSourceConfig).useLlamaParse
+          ? [
+              {
+                name: "LLAMA_CLOUD_API_KEY",
+                description: `The Llama Cloud API key.`,
+                value: llamaCloudKey,
+              },
+            ]
+          : []),
+      ];
     default:
       return [];
   }
@@ -149,10 +165,13 @@ export const createBackendEnvFile = async (
       description: "The OpenAI API key to use.",
       value: opts.openAiKey,
     },
+
     // Add vector database environment variables
     ...(opts.vectorDb ? getVectorDBEnvs(opts.vectorDb) : []),
     // Add data source environment variables
-    ...(opts.dataSource ? getDataSourceEnvs(opts.dataSource) : []),
+    ...(opts.dataSource
+      ? getDataSourceEnvs(opts.dataSource, opts.llamaCloudKey)
+      : []),
   ];
   let envVars: EnvVar[] = [];
   if (opts.framework === "fastapi") {
@@ -204,13 +223,6 @@ We have provided context information below.
 Given this information, please answer the question: {query_str}
 "`,
         },
-        (opts?.dataSource?.config as FileSourceConfig).useLlamaParse
-          ? {
-              name: "LLAMA_CLOUD_API_KEY",
-              description: `The Llama Cloud API key.`,
-              value: opts.llamaCloudKey,
-            }
-          : {},
       ],
     ];
   } else {
diff --git a/helpers/index.ts b/helpers/index.ts
index 3c99cc17..60aec506 100644
--- a/helpers/index.ts
+++ b/helpers/index.ts
@@ -42,30 +42,27 @@ async function generateContextData(
       ? llamaCloudKey || process.env["LLAMA_CLOUD_API_KEY"]
       : true;
     const hasVectorDb = vectorDb && vectorDb !== "none";
-    if (framework === "fastapi") {
-      if (
-        openAiKeyConfigured &&
-        llamaCloudKeyConfigured &&
-        !hasVectorDb &&
-        isHavingPoetryLockFile()
-      ) {
-        console.log(`Running ${runGenerate} to generate the context data.`);
-        const result = tryPoetryRun("python app/engine/generate.py");
-        if (!result) {
-          console.log(`Failed to run ${runGenerate}.`);
-          process.exit(1);
+    if (openAiKeyConfigured && llamaCloudKeyConfigured && !hasVectorDb) {
+      // If all the required environment variables are set, run the generate script
+      if (framework === "fastapi") {
+        if (isHavingPoetryLockFile()) {
+          console.log(`Running ${runGenerate} to generate the context data.`);
+          const result = tryPoetryRun("python app/engine/generate.py");
+          if (!result) {
+            console.log(`Failed to run ${runGenerate}.`);
+            process.exit(1);
+          }
+          console.log(`Generated context data`);
+          return;
         }
-        console.log(`Generated context data`);
-        return;
-      }
-    } else {
-      if (openAiKeyConfigured && vectorDb === "none") {
+      } else {
         console.log(`Running ${runGenerate} to generate the context data.`);
         await callPackageManager(packageManager, true, ["run", "generate"]);
         return;
       }
     }
 
+    // Build the message telling the user how to run the generate script manually
     const settings = [];
     if (!openAiKeyConfigured) settings.push("your OpenAI key");
     if (!llamaCloudKeyConfigured) settings.push("your Llama Cloud key");
diff --git a/helpers/typescript.ts b/helpers/typescript.ts
index 902a7d4d..91dd5b82 100644
--- a/helpers/typescript.ts
+++ b/helpers/typescript.ts
@@ -6,7 +6,7 @@ import { copy } from "../helpers/copy";
 import { callPackageManager } from "../helpers/install";
 import { templatesDir } from "./dir";
 import { PackageManager } from "./get-pkg-manager";
-import { InstallTemplateArgs } from "./types";
+import { FileSourceConfig, InstallTemplateArgs } from "./types";
 
 const rename = (name: string) => {
   switch (name) {
@@ -64,6 +64,7 @@ export const installTSTemplate = async ({
   postInstallAction,
   backend,
   observability,
+  dataSource,
 }: InstallTemplateArgs & { backend: boolean }) => {
   console.log(bold(`Using ${packageManager}.`));
 
@@ -118,6 +119,7 @@ export const installTSTemplate = async ({
     }
   }
 
+  // copy observability component
   if (observability && observability !== "none") {
     const chosenObservabilityPath = path.join(
       templatesDir,
@@ -150,20 +152,40 @@ export const installTSTemplate = async ({
       vectorDBFolder = vectorDb;
     }
 
-    const VectorDBPath = path.join(
+    relativeEngineDestPath =
+      framework === "nextjs"
+        ? path.join("app", "api", "chat")
+        : path.join("src", "controllers");
+
+    const enginePath = path.join(root, relativeEngineDestPath, "engine");
+
+    // copy vector db component
+    const vectorDBPath = path.join(
       compPath,
       "vectordbs",
       "typescript",
       vectorDBFolder,
     );
-    relativeEngineDestPath =
-      framework === "nextjs"
-        ? path.join("app", "api", "chat")
-        : path.join("src", "controllers");
-    await copy("**", path.join(root, relativeEngineDestPath, "engine"), {
+    await copy("**", enginePath, {
       parents: true,
-      cwd: VectorDBPath,
+      cwd: vectorDBPath,
     });
+
+    // copy loader component
+    const dataSourceType = dataSource?.type;
+    if (dataSourceType && dataSourceType !== "none") {
+      let loaderFolder: string;
+      if (dataSourceType === "file" || dataSourceType === "folder") {
+        const dataSourceConfig = dataSource?.config as FileSourceConfig;
+        loaderFolder = dataSourceConfig.useLlamaParse ? "llama_parse" : "file";
+      } else {
+        loaderFolder = dataSourceType;
+      }
+      await copy("**", enginePath, {
+        parents: true,
+        cwd: path.join(compPath, "loaders", "typescript", loaderFolder),
+      });
+    }
   }
 
   /**
diff --git a/questions.ts b/questions.ts
index 5e84c82b..3134c315 100644
--- a/questions.ts
+++ b/questions.ts
@@ -691,9 +691,8 @@ export const askQuestions = async (
   }
 
   if (
-    (program.dataSource?.type === "file" ||
-      program.dataSource?.type === "folder") &&
-    program.framework === "fastapi"
+    program.dataSource?.type === "file" ||
+    program.dataSource?.type === "folder"
   ) {
     if (ciInfo.isCI) {
       program.llamaCloudKey = getPrefOrDefault("llamaCloudKey");
diff --git a/templates/components/loaders/typescript/file/loader.mjs b/templates/components/loaders/typescript/file/loader.mjs
new file mode 100644
index 00000000..3039f34f
--- /dev/null
+++ b/templates/components/loaders/typescript/file/loader.mjs
@@ -0,0 +1,9 @@
+import { SimpleDirectoryReader } from "llamaindex";
+
+export const DATA_DIR = "./data";
+
+export async function getDocuments() {
+  // Load the contents of DATA_DIR as LlamaIndex documents.
+  const directoryReader = new SimpleDirectoryReader();
+  return await directoryReader.loadData({ directoryPath: DATA_DIR });
+}
diff --git a/templates/components/loaders/typescript/llama_parse/loader.mjs b/templates/components/loaders/typescript/llama_parse/loader.mjs
new file mode 100644
index 00000000..f285673f
--- /dev/null
+++ b/templates/components/loaders/typescript/llama_parse/loader.mjs
@@ -0,0 +1,19 @@
+import {
+  FILE_EXT_TO_READER,
+  LlamaParseReader,
+  SimpleDirectoryReader,
+} from "llamaindex";
+
+export const DATA_DIR = "./data";
+
+export async function getDocuments() {
+  const directoryReader = new SimpleDirectoryReader();
+  // PDFs are parsed by LlamaParseReader (markdown output); every other
+  // extension keeps the reader registered in FILE_EXT_TO_READER.
+  const fileExtToReader = {
+    ...FILE_EXT_TO_READER,
+    pdf: new LlamaParseReader({ resultType: "markdown" }),
+  };
+  const loadOptions = { directoryPath: DATA_DIR, fileExtToReader };
+  return await directoryReader.loadData(loadOptions);
+}
diff --git a/templates/components/vectordbs/typescript/milvus/generate.mjs b/templates/components/vectordbs/typescript/milvus/generate.mjs
index 905a066c..11c3184d 100644
--- a/templates/components/vectordbs/typescript/milvus/generate.mjs
+++ b/templates/components/vectordbs/typescript/milvus/generate.mjs
@@ -2,15 +2,11 @@
 import * as dotenv from "dotenv";
 import {
   MilvusVectorStore,
-  SimpleDirectoryReader,
   VectorStoreIndex,
   storageContextFromDefaults,
 } from "llamaindex";
-import {
-  STORAGE_DIR,
-  checkRequiredEnvVars,
-  getMilvusClient,
-} from "./shared.mjs";
+import { getDocuments } from "./loader.mjs";
+import { checkRequiredEnvVars, getMilvusClient } from "./shared.mjs";
 
 dotenv.config();
 
@@ -18,9 +14,7 @@ const collectionName = process.env.MILVUS_COLLECTION;
 
 async function loadAndIndex() {
   // load objects from storage and convert them into LlamaIndex Document objects
-  const documents = await new SimpleDirectoryReader().loadData({
-    directoryPath: STORAGE_DIR,
-  });
+  const documents = await getDocuments();
 
   // Connect to Milvus
   const milvusClient = getMilvusClient();
diff --git a/templates/components/vectordbs/typescript/milvus/shared.mjs b/templates/components/vectordbs/typescript/milvus/shared.mjs
index 0a35d715..a02ea57c 100644
--- a/templates/components/vectordbs/typescript/milvus/shared.mjs
+++ b/templates/components/vectordbs/typescript/milvus/shared.mjs
@@ -1,6 +1,5 @@
 import { MilvusClient } from "@zilliz/milvus2-sdk-node";
 
-export const STORAGE_DIR = "./data";
 export const CHUNK_SIZE = 512;
 export const CHUNK_OVERLAP = 20;
 
diff --git a/templates/components/vectordbs/typescript/mongo/generate.mjs b/templates/components/vectordbs/typescript/mongo/generate.mjs
index 7337d122..618859eb 100644
--- a/templates/components/vectordbs/typescript/mongo/generate.mjs
+++ b/templates/components/vectordbs/typescript/mongo/generate.mjs
@@ -2,12 +2,12 @@
 import * as dotenv from "dotenv";
 import {
   MongoDBAtlasVectorSearch,
-  SimpleDirectoryReader,
   VectorStoreIndex,
   storageContextFromDefaults,
 } from "llamaindex";
 import { MongoClient } from "mongodb";
-import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs";
+import { getDocuments } from "./loader.mjs";
+import { checkRequiredEnvVars } from "./shared.mjs";
 
 dotenv.config();
 
@@ -21,9 +21,7 @@ async function loadAndIndex() {
   const client = new MongoClient(mongoUri);
 
   // load objects from storage and convert them into LlamaIndex Document objects
-  const documents = await new SimpleDirectoryReader().loadData({
-    directoryPath: STORAGE_DIR,
-  });
+  const documents = await getDocuments();
 
   // create Atlas as a vector store
   const vectorStore = new MongoDBAtlasVectorSearch({
diff --git a/templates/components/vectordbs/typescript/mongo/shared.mjs b/templates/components/vectordbs/typescript/mongo/shared.mjs
index 264a82f0..ab467182 100644
--- a/templates/components/vectordbs/typescript/mongo/shared.mjs
+++ b/templates/components/vectordbs/typescript/mongo/shared.mjs
@@ -1,4 +1,3 @@
-export const STORAGE_DIR = "./data";
 export const CHUNK_SIZE = 512;
 export const CHUNK_OVERLAP = 20;
 
diff --git a/templates/components/vectordbs/typescript/none/constants.mjs b/templates/components/vectordbs/typescript/none/constants.mjs
index 8cfb403c..42a8664a 100644
--- a/templates/components/vectordbs/typescript/none/constants.mjs
+++ b/templates/components/vectordbs/typescript/none/constants.mjs
@@ -1,4 +1,3 @@
-export const STORAGE_DIR = "./data";
 export const STORAGE_CACHE_DIR = "./cache";
 export const CHUNK_SIZE = 512;
 export const CHUNK_OVERLAP = 20;
diff --git a/templates/components/vectordbs/typescript/none/generate.mjs b/templates/components/vectordbs/typescript/none/generate.mjs
index 9334f98e..5b3987c1 100644
--- a/templates/components/vectordbs/typescript/none/generate.mjs
+++ b/templates/components/vectordbs/typescript/none/generate.mjs
@@ -1,18 +1,13 @@
 import {
   serviceContextFromDefaults,
-  SimpleDirectoryReader,
   storageContextFromDefaults,
   VectorStoreIndex,
 } from "llamaindex";
 
 import * as dotenv from "dotenv";
 
-import {
-  CHUNK_OVERLAP,
-  CHUNK_SIZE,
-  STORAGE_CACHE_DIR,
-  STORAGE_DIR,
-} from "./constants.mjs";
+import { CHUNK_OVERLAP, CHUNK_SIZE, STORAGE_CACHE_DIR } from "./constants.mjs";
+import { getDocuments } from "./loader.mjs";
 
 // Load environment variables from local .env file
 dotenv.config();
@@ -31,9 +26,7 @@ async function generateDatasource(serviceContext) {
     const storageContext = await storageContextFromDefaults({
       persistDir: STORAGE_CACHE_DIR,
     });
-    const documents = await new SimpleDirectoryReader().loadData({
-      directoryPath: STORAGE_DIR,
-    });
+    const documents = await getDocuments();
     await VectorStoreIndex.fromDocuments(documents, {
       storageContext,
       serviceContext,
diff --git a/templates/components/vectordbs/typescript/pg/generate.mjs b/templates/components/vectordbs/typescript/pg/generate.mjs
index 3d959c69..ca8410bf 100644
--- a/templates/components/vectordbs/typescript/pg/generate.mjs
+++ b/templates/components/vectordbs/typescript/pg/generate.mjs
@@ -2,14 +2,14 @@
 import * as dotenv from "dotenv";
 import {
   PGVectorStore,
-  SimpleDirectoryReader,
   VectorStoreIndex,
   storageContextFromDefaults,
 } from "llamaindex";
+import { getDocuments } from "./loader.mjs";
 import {
+  PGVECTOR_COLLECTION,
   PGVECTOR_SCHEMA,
   PGVECTOR_TABLE,
-  STORAGE_DIR,
   checkRequiredEnvVars,
 } from "./shared.mjs";
 
@@ -17,9 +17,7 @@ dotenv.config();
 
 async function loadAndIndex() {
   // load objects from storage and convert them into LlamaIndex Document objects
-  const documents = await new SimpleDirectoryReader().loadData({
-    directoryPath: STORAGE_DIR,
-  });
+  const documents = await getDocuments();
 
   // create postgres vector store
   const vectorStore = new PGVectorStore({
@@ -27,7 +25,7 @@ async function loadAndIndex() {
     schemaName: PGVECTOR_SCHEMA,
     tableName: PGVECTOR_TABLE,
   });
-  vectorStore.setCollection(STORAGE_DIR);
+  vectorStore.setCollection(PGVECTOR_COLLECTION);
   vectorStore.clearCollection();
 
   // create index from all the Documents
diff --git a/templates/components/vectordbs/typescript/pg/shared.mjs b/templates/components/vectordbs/typescript/pg/shared.mjs
index 8ad729c0..ba747934 100644
--- a/templates/components/vectordbs/typescript/pg/shared.mjs
+++ b/templates/components/vectordbs/typescript/pg/shared.mjs
@@ -1,4 +1,4 @@
-export const STORAGE_DIR = "./data";
+export const PGVECTOR_COLLECTION = "data";
 export const CHUNK_SIZE = 512;
 export const CHUNK_OVERLAP = 20;
 export const PGVECTOR_SCHEMA = "public";
diff --git a/templates/components/vectordbs/typescript/pinecone/generate.mjs b/templates/components/vectordbs/typescript/pinecone/generate.mjs
index b371a639..3e1fcaa0 100644
--- a/templates/components/vectordbs/typescript/pinecone/generate.mjs
+++ b/templates/components/vectordbs/typescript/pinecone/generate.mjs
@@ -2,19 +2,17 @@
 import * as dotenv from "dotenv";
 import {
   PineconeVectorStore,
-  SimpleDirectoryReader,
   VectorStoreIndex,
   storageContextFromDefaults,
 } from "llamaindex";
-import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs";
+import { getDocuments } from "./loader.mjs";
+import { checkRequiredEnvVars } from "./shared.mjs";
 
 dotenv.config();
 
 async function loadAndIndex() {
   // load objects from storage and convert them into LlamaIndex Document objects
-  const documents = await new SimpleDirectoryReader().loadData({
-    directoryPath: STORAGE_DIR,
-  });
+  const documents = await getDocuments();
 
   // create vector store
   const vectorStore = new PineconeVectorStore();
diff --git a/templates/components/vectordbs/typescript/pinecone/shared.mjs b/templates/components/vectordbs/typescript/pinecone/shared.mjs
index f9140261..ae2fd6b1 100644
--- a/templates/components/vectordbs/typescript/pinecone/shared.mjs
+++ b/templates/components/vectordbs/typescript/pinecone/shared.mjs
@@ -1,4 +1,3 @@
-export const STORAGE_DIR = "./data";
 export const CHUNK_SIZE = 512;
 export const CHUNK_OVERLAP = 20;
 
-- 
GitLab