From b7b30a53004ce47531e57c66765eb86b91bb4a05 Mon Sep 17 00:00:00 2001
From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com>
Date: Tue, 30 Jan 2024 14:17:33 +0700
Subject: [PATCH] Feat: add + fix chat with web (#481)

---
 create-app.ts                                 |   4 +-
 helpers/index.ts                              | 110 +++++++++++-------
 helpers/python.ts                             |  13 ++-
 helpers/types.ts                              |  14 ++-
 index.ts                                      |   3 +-
 questions.ts                                  |  80 ++++++++++---
 .../components/loaders/python/file/loader.py  |   8 ++
 .../components/loaders/python/web/loader.py   |  15 +++
 .../vectordbs/python/mongo/generate.py        |   3 +-
 .../vectordbs/python/none/generate.py         |   3 +-
 .../vectordbs/python/pg/generate.py           |   3 +-
 11 files changed, 190 insertions(+), 66 deletions(-)
 create mode 100644 templates/components/loaders/python/file/loader.py
 create mode 100644 templates/components/loaders/python/web/loader.py

diff --git a/create-app.ts b/create-app.ts
index ad3343d3..6c684d9d 100644
--- a/create-app.ts
+++ b/create-app.ts
@@ -36,7 +36,7 @@ export async function createApp({
   vectorDb,
   externalPort,
   postInstallAction,
-  contextFile,
+  dataSource,
 }: InstallAppArgs): Promise<void> {
   const root = path.resolve(appPath);
 
@@ -80,7 +80,7 @@ export async function createApp({
     vectorDb,
     externalPort,
     postInstallAction,
-    contextFile,
+    dataSource,
   };
 
   if (frontend) {
diff --git a/helpers/index.ts b/helpers/index.ts
index 40c1b4eb..51d917f4 100644
--- a/helpers/index.ts
+++ b/helpers/index.ts
@@ -13,9 +13,10 @@ import { installPythonTemplate } from "./python";
 import { downloadAndExtractRepo } from "./repo";
 import {
   InstallTemplateArgs,
-  TemplateEngine,
+  TemplateDataSource,
   TemplateFramework,
   TemplateVectorDB,
+  WebSourceConfig,
 } from "./types";
 import { installTSTemplate } from "./typescript";
 
@@ -26,6 +27,7 @@ const createEnvLocalFile = async (
     vectorDb?: TemplateVectorDB;
     model?: string;
     framework?: TemplateFramework;
+    dataSource?: TemplateDataSource;
   },
 ) => {
   const envFileName = ".env";
@@ -58,48 +60,30 @@ const createEnvLocalFile = async (
     }
   }
 
+  switch (opts?.dataSource?.type) {
+    case "web": {
+      let webConfig = opts?.dataSource.config as WebSourceConfig;
+      content += `# web loader config\n`;
+      content += `BASE_URL=${webConfig.baseUrl}\n`;
+      content += `URL_PREFIX=${webConfig.baseUrl}\n`;
+      content += `MAX_DEPTH=${webConfig.depth}\n`;
+      break;
+    }
+  }
+
   if (content) {
     await fs.writeFile(path.join(root, envFileName), content);
     console.log(`Created '${envFileName}' file. Please check the settings.`);
   }
 };
 
-const copyTestData = async (
-  root: string,
+const installDependencies = async (
   framework: TemplateFramework,
   packageManager?: PackageManager,
-  engine?: TemplateEngine,
   openAiKey?: string,
   vectorDb?: TemplateVectorDB,
-  contextFile?: string,
-  // eslint-disable-next-line max-params
 ) => {
-  if (engine === "context") {
-    const destPath = path.join(root, "data");
-    if (contextFile) {
-      console.log(`\nCopying provided file to ${cyan(destPath)}\n`);
-      await fs.mkdir(destPath, { recursive: true });
-      await fs.copyFile(
-        contextFile,
-        path.join(destPath, path.basename(contextFile)),
-      );
-    } else {
-      const srcPath = path.join(
-        __dirname,
-        "..",
-        "templates",
-        "components",
-        "data",
-      );
-      console.log(`\nCopying test data to ${cyan(destPath)}\n`);
-      await copy("**", destPath, {
-        parents: true,
-        cwd: srcPath,
-      });
-    }
-  }
-
-  if (packageManager && engine === "context") {
+  if (packageManager) {
     const runGenerate = `${cyan(
       framework === "fastapi"
         ? "poetry run python app/engine/generate.py"
@@ -108,9 +92,14 @@ const copyTestData = async (
     const hasOpenAiKey = openAiKey || process.env["OPENAI_API_KEY"];
     const hasVectorDb = vectorDb && vectorDb !== "none";
     if (framework === "fastapi") {
-      if (hasOpenAiKey && vectorDb === "none" && isHavingPoetryLockFile()) {
+      if (hasOpenAiKey && !hasVectorDb && isHavingPoetryLockFile()) {
         console.log(`Running ${runGenerate} to generate the context data.`);
-        tryPoetryRun("python app/engine/generate.py");
+        let result = tryPoetryRun("python app/engine/generate.py");
+        if (!result) {
+          console.log(`Failed to run ${runGenerate}.`);
+          process.exit(1);
+        }
+        console.log(`Generated context data`);
         return;
       }
     } else {
@@ -131,6 +120,31 @@ const copyTestData = async (
   }
 };
 
+/**
+ * Fills `<root>/data` with either a user-provided file or the bundled test data.
+ *
+ * @param root - Root directory of the generated app.
+ * @param contextFile - File to copy into the data directory; when omitted,
+ *   everything under `templates/components/data` is copied instead.
+ */
+const copyContextData = async (root: string, contextFile?: string) => {
+  const destPath = path.join(root, "data");
+  if (!contextFile) {
+    // No user file given: fall back to the test data shipped with the templates.
+    console.log(`\nCopying test data to ${cyan(destPath)}\n`);
+    await copy("**", destPath, {
+      parents: true,
+      cwd: path.join(__dirname, "..", "templates", "components", "data"),
+    });
+    return;
+  }
+  // The provided file keeps its base name inside the data directory.
+  console.log(`\nCopying provided file to ${cyan(destPath)}\n`);
+  await fs.mkdir(destPath, { recursive: true });
+  const target = path.join(destPath, path.basename(contextFile));
+  await fs.copyFile(contextFile, target);
+};
+
 const installCommunityProject = async ({
   root,
   communityProjectPath,
@@ -174,18 +188,26 @@ export const installTemplate = async (
       vectorDb: props.vectorDb,
       model: props.model,
       framework: props.framework,
+      dataSource: props.dataSource,
     });
 
-    // Copy test pdf file
-    await copyTestData(
-      props.root,
-      props.framework,
-      props.packageManager,
-      props.engine,
-      props.openAiKey,
-      props.vectorDb,
-      props.contextFile,
-    );
+    if (props.engine === "context") {
+      if (
+        props.dataSource?.type === "file" &&
+        "contextFile" in props.dataSource.config
+      ) {
+        await copyContextData(props.root, props.dataSource.config.contextFile);
+      } else {
+        await copyContextData(props.root);
+      }
+      await installDependencies(
+        props.framework,
+        props.packageManager,
+        props.openAiKey,
+        props.vectorDb,
+      );
+      console.log("installed Dependencies");
+    }
   } else {
     // this is a frontend for a full-stack app, create .env file with model information
     const content = `MODEL=${props.model}\nNEXT_PUBLIC_MODEL=${props.model}\n`;
diff --git a/helpers/python.ts b/helpers/python.ts
index 186062ec..ca5f344d 100644
--- a/helpers/python.ts
+++ b/helpers/python.ts
@@ -126,6 +126,7 @@ export const installPythonTemplate = async ({
   framework,
   engine,
   vectorDb,
+  dataSource,
   postInstallAction,
 }: Pick<
   InstallTemplateArgs,
@@ -134,6 +135,7 @@ export const installPythonTemplate = async ({
   | "template"
   | "engine"
   | "vectorDb"
+  | "dataSource"
   | "postInstallAction"
 >) => {
   console.log("\nInitializing Python project with template:", template, "\n");
@@ -167,16 +169,25 @@ export const installPythonTemplate = async ({
 
   if (engine === "context") {
     const compPath = path.join(__dirname, "..", "templates", "components");
+    let vectorDbDirName = vectorDb ?? "none";
     const VectorDBPath = path.join(
       compPath,
       "vectordbs",
       "python",
-      vectorDb || "none",
+      vectorDbDirName,
     );
+    const enginePath = path.join(root, "app", "engine");
+
     await copy("**", path.join(root, "app", "engine"), {
       parents: true,
       cwd: VectorDBPath,
     });
+    // Loader templates exist only for "file" and "web"; "none"/undefined use "file".
+    const loaderDir = dataSource?.type === "web" ? "web" : "file";
+    await copy("**", enginePath, {
+      parents: true,
+      cwd: path.join(compPath, "loaders", "python", loaderDir),
+    });
   }
 
   const addOnDependencies = getAdditionalDependencies(vectorDb);
diff --git a/helpers/types.ts b/helpers/types.ts
index d7e6e92a..e2660860 100644
--- a/helpers/types.ts
+++ b/helpers/types.ts
@@ -6,6 +6,18 @@ export type TemplateEngine = "simple" | "context";
 export type TemplateUI = "html" | "shadcn";
 export type TemplateVectorDB = "none" | "mongo" | "pg";
 export type TemplatePostInstallAction = "none" | "dependencies" | "runApp";
+/** Which kind of data source a generated app uses, plus its settings. */
+export type TemplateDataSource = {
+  type: "none" | "file" | "web";
+  config: TemplateDataSourceConfig;
+};
+/** Settings for the "file" data source. */
+export type FileSourceConfig = { contextFile?: string };
+/** Settings for the "web" data source. */
+export type WebSourceConfig = { baseUrl?: string; depth?: number };
+// The union below is not discriminated by `type`, so consumers narrow manually.
+/** Config payload for any data source. */
+export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig;
 
 export interface InstallTemplateArgs {
   appName: string;
@@ -15,8 +27,8 @@ export interface InstallTemplateArgs {
   template: TemplateType;
   framework: TemplateFramework;
   engine: TemplateEngine;
-  contextFile?: string;
   ui: TemplateUI;
+  dataSource?: TemplateDataSource;
   eslint: boolean;
   customApiPath?: string;
   openAiKey?: string;
diff --git a/index.ts b/index.ts
index 5978dd37..935637e6 100644
--- a/index.ts
+++ b/index.ts
@@ -241,7 +241,7 @@ async function run(): Promise<void> {
     vectorDb: program.vectorDb,
     externalPort: program.externalPort,
     postInstallAction: program.postInstallAction,
-    contextFile: program.contextFile,
+    dataSource: program.dataSource,
   });
   conf.set("preferences", preferences);
 
@@ -278,7 +278,6 @@ async function notifyUpdate(): Promise<void> {
           "\n",
       );
     }
-    process.exit();
   } catch {
     // ignore error
   }
diff --git a/questions.ts b/questions.ts
index 531a8642..eff97455 100644
--- a/questions.ts
+++ b/questions.ts
@@ -40,6 +40,10 @@ const defaults: QuestionArgs = {
   communityProjectPath: "",
   llamapack: "",
   postInstallAction: "dependencies",
+  dataSource: {
+    type: "none",
+    config: {},
+  },
 };
 
 const handlers = {
@@ -149,7 +153,8 @@ export const askQuestions = async (
         ];
 
         const hasOpenAiKey = program.openAiKey || process.env["OPENAI_API_KEY"];
-        if (program.vectorDb === "none" && hasOpenAiKey) {
+        const hasVectorDb = program.vectorDb && program.vectorDb !== "none";
+        if (!hasVectorDb && hasOpenAiKey) {
           actionChoices.push({
             title:
               "Generate code, install dependencies, and run the app (~2 min)",
@@ -378,6 +383,9 @@ export const askQuestions = async (
       if (process.platform === "win32" || process.platform === "darwin") {
         choices.push({ title: "Use a local PDF file", value: "localFile" });
       }
+      if (program.framework === "fastapi") {
+        choices.push({ title: "Use website content", value: "web" });
+      }
 
       const { dataSource } = await prompts(
         {
@@ -389,20 +397,66 @@ export const askQuestions = async (
         },
         handlers,
       );
-      switch (dataSource) {
-        case "simple":
-          program.engine = "simple";
-          break;
-        case "exampleFile":
-          program.engine = "context";
-          break;
-        case "localFile":
-          program.engine = "context";
-          // If the user selected the "pdf" option, ask them to select a file
-          program.contextFile = await selectPDFFile();
-          break;
+      // Initialize with default config
+      program.dataSource = getPrefOrDefault("dataSource");
+      if (program.dataSource) {
+        switch (dataSource) {
+          case "simple":
+            program.engine = "simple";
+            break;
+          case "exampleFile":
+            program.engine = "context";
+            // example file is a context app with dataSource.type = file but has no config
+            program.dataSource = { type: "file", config: {} };
+            break;
+          case "localFile":
+            program.engine = "context";
+            // Ask for the PDF; assign a fresh object instead of mutating the default
+            program.dataSource = {
+              type: "file",
+              config: { contextFile: await selectPDFFile() },
+            };
+            break;
+          case "web":
+            program.engine = "context";
+            program.dataSource = { type: "web", config: {} };
+            break;
+        }
+      }
     }
+
+    if (program.dataSource?.type === "web" && program.framework === "fastapi") {
+      let { baseUrl } = await prompts(
+        {
+          type: "text",
+          name: "baseUrl",
+          message: "Please provide base URL of the website:",
+          initial: "https://www.llamaindex.ai",
+        },
+        handlers,
+      );
+      try {
+        if (!baseUrl.includes("://")) {
+          baseUrl = `https://${baseUrl}`;
+        }
+        let checkUrl = new URL(baseUrl);
+        if (checkUrl.protocol !== "https:" && checkUrl.protocol !== "http:") {
+          throw new Error("Invalid protocol");
+        }
+      } catch (error) {
+        console.log(
+          red(
+            "Invalid URL provided! Please provide a valid URL (e.g. https://www.llamaindex.ai)",
+          ),
+        );
+        process.exit(1);
+      }
+      program.dataSource.config = {
+        baseUrl: baseUrl,
+        depth: 1,
+      };
+    }
+
     if (program.engine !== "simple" && !program.vectorDb) {
       if (ciInfo.isCI) {
         program.vectorDb = getPrefOrDefault("vectorDb");
diff --git a/templates/components/loaders/python/file/loader.py b/templates/components/loaders/python/file/loader.py
new file mode 100644
index 00000000..d343cec3
--- /dev/null
+++ b/templates/components/loaders/python/file/loader.py
@@ -0,0 +1,8 @@
+import os
+from app.engine.constants import DATA_DIR
+from llama_index import SimpleDirectoryReader, VectorStoreIndex, download_loader
+
+
+def get_documents():
+    """Read DATA_DIR with SimpleDirectoryReader and return the loaded documents."""
+    return SimpleDirectoryReader(DATA_DIR).load_data()
diff --git a/templates/components/loaders/python/web/loader.py b/templates/components/loaders/python/web/loader.py
new file mode 100644
index 00000000..026dd101
--- /dev/null
+++ b/templates/components/loaders/python/web/loader.py
@@ -0,0 +1,15 @@
+import os
+from llama_index import VectorStoreIndex, download_loader
+
+
+def get_documents():
+    """Scrape the site at BASE_URL and return the loaded documents."""
+    WholeSiteReader = download_loader("WholeSiteReader")
+    # BASE_URL is required (KeyError if unset). URL_PREFIX falls back to it and
+    # MAX_DEPTH to 1, matching what the generator writes to .env.
+    base_url = os.environ["BASE_URL"]
+    scraper = WholeSiteReader(
+        prefix=os.environ.get("URL_PREFIX", base_url),
+        max_depth=int(os.environ.get("MAX_DEPTH", "1")),
+    )
+    return scraper.load_data(base_url=base_url)
diff --git a/templates/components/vectordbs/python/mongo/generate.py b/templates/components/vectordbs/python/mongo/generate.py
index fe0ee9aa..f52b3b48 100644
--- a/templates/components/vectordbs/python/mongo/generate.py
+++ b/templates/components/vectordbs/python/mongo/generate.py
@@ -7,6 +7,7 @@ from llama_index.vector_stores import MongoDBAtlasVectorSearch
 
 from app.engine.constants import DATA_DIR
 from app.engine.context import create_service_context
+from app.engine.loader import get_documents
 
 
 from llama_index import (
@@ -22,7 +23,7 @@ logger = logging.getLogger()
 def generate_datasource(service_context):
     logger.info("Creating new index")
     # load the documents and create the index
-    documents = SimpleDirectoryReader(DATA_DIR).load_data()
+    documents = get_documents()
     store = MongoDBAtlasVectorSearch(
         db_name=os.environ["MONGODB_DATABASE"],
         collection_name=os.environ["MONGODB_VECTORS"],
diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py
index 3c4cd6a9..7ff20012 100644
--- a/templates/components/vectordbs/python/none/generate.py
+++ b/templates/components/vectordbs/python/none/generate.py
@@ -4,6 +4,7 @@ from dotenv import load_dotenv
 
 from app.engine.constants import DATA_DIR, STORAGE_DIR
 from app.engine.context import create_service_context
+from app.engine.loader import get_documents
 
 load_dotenv()
 
@@ -19,7 +20,7 @@ logger = logging.getLogger()
 def generate_datasource(service_context):
     logger.info("Creating new index")
     # load the documents and create the index
-    documents = SimpleDirectoryReader(DATA_DIR).load_data()
+    documents = get_documents()
     index = VectorStoreIndex.from_documents(documents, service_context=service_context)
     # store it for later
     index.storage_context.persist(STORAGE_DIR)
diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py
index ee07e7a4..5c77ee07 100644
--- a/templates/components/vectordbs/python/pg/generate.py
+++ b/templates/components/vectordbs/python/pg/generate.py
@@ -6,6 +6,7 @@ import logging
 from app.engine.constants import DATA_DIR
 from app.engine.context import create_service_context
 from app.engine.utils import init_pg_vector_store_from_env
+from app.engine.loader import get_documents
 
 from llama_index import (
     SimpleDirectoryReader,
@@ -20,7 +21,7 @@ logger = logging.getLogger()
 def generate_datasource(service_context):
     logger.info("Creating new index")
     # load the documents and create the index
-    documents = SimpleDirectoryReader(DATA_DIR).load_data()
+    documents = get_documents()
     store = init_pg_vector_store_from_env()
     storage_context = StorageContext.from_defaults(vector_store=store)
     VectorStoreIndex.from_documents(
-- 
GitLab