Unverified commit e8db041d authored by Huu Le (Lee), committed by GitHub

feat: Add multiple URLs reader (#18)

parent b3f26856
---
"create-llama": patch
---
Add multiple URLs for web data source
@@ -130,8 +130,8 @@ export async function createApp({
   console.log(
     yellow(
       `You have selected tools that require configuration. Please configure them in the ${terminalLink(
-        "tools_config.json",
-        `file://${root}/tools_config.json`,
+        "config/tools.json",
+        `file://${root}/config/tools.json`,
       )} file.`,
     ),
   );
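For orientation: `config/tools.json` maps each selected tool name to its configuration object (it is written by `installPythonTemplate` further down). A hypothetical sketch of its shape — the tool names and fields are illustrative, not taken from this commit:

```ts
// Hypothetical contents of a generated config/tools.json; tool names
// and config fields are examples only.
const toolsConfig: Record<string, any> = {
  wikipedia: {},
  google_search: { api_key: "<your-api-key>", num_results: 3 },
};

console.log(JSON.stringify(toolsConfig, null, 2));
```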
......
import fs from "fs/promises";
import path from "path";
import {
-  FileSourceConfig,
  TemplateDataSource,
  TemplateFramework,
  TemplateVectorDB,
-  WebSourceConfig,
} from "./types";

type EnvVar = {
@@ -100,48 +98,6 @@ const getVectorDBEnvs = (vectorDb: TemplateVectorDB) => {
   }
 };

-const getDataSourceEnvs = (
-  dataSource: TemplateDataSource,
-  llamaCloudKey?: string,
-) => {
-  switch (dataSource.type) {
-    case "web":
-      const config = dataSource.config as WebSourceConfig;
-      return [
-        {
-          name: "BASE_URL",
-          description: "The base URL to start web scraping.",
-          value: config.baseUrl,
-        },
-        {
-          name: "URL_PREFIX",
-          description: "The prefix of the URL to start web scraping.",
-          value: config.baseUrl,
-        },
-        {
-          name: "MAX_DEPTH",
-          description: "The maximum depth to scrape.",
-          value: config.depth?.toString(),
-        },
-      ];
-    case "file":
-    case "folder":
-      return [
-        ...((dataSource?.config as FileSourceConfig).useLlamaParse
-          ? [
-              {
-                name: "LLAMA_CLOUD_API_KEY",
-                description: `The Llama Cloud API key.`,
-                value: llamaCloudKey,
-              },
-            ]
-          : []),
-      ];
-    default:
-      return [];
-  }
-};
export const createBackendEnvFile = async (
root: string,
opts: {
@@ -173,9 +129,15 @@ export const createBackendEnvFile = async (
     // Add vector database environment variables
     ...(opts.vectorDb ? getVectorDBEnvs(opts.vectorDb) : []),
-    // Add data source environment variables
-    ...(opts.dataSource
-      ? getDataSourceEnvs(opts.dataSource, opts.llamaCloudKey)
+    // Add LlamaCloud API key
+    ...(opts.llamaCloudKey
+      ? [
+          {
+            name: "LLAMA_CLOUD_API_KEY",
+            description: `The Llama Cloud API key.`,
+            value: opts.llamaCloudKey,
+          },
+        ]
       : []),
   ];
let envVars: EnvVar[] = [];
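The `EnvVar` entries collected above are eventually rendered into the backend `.env` file. A minimal sketch of that rendering (not the actual create-llama serializer, whose formatting is not shown in this hunk), assuming a `# description` comment followed by `NAME=value`:

```ts
type EnvVar = { name?: string; description?: string; value?: string };

// Minimal sketch: render each EnvVar as a "# description" comment
// followed by NAME=value. The real formatting may differ.
function renderEnvFile(envVars: EnvVar[]): string {
  return envVars
    .filter((v) => v.name)
    .map((v) => `# ${v.description ?? ""}\n${v.name}=${v.value ?? ""}`)
    .join("\n\n");
}

console.log(
  renderEnvFile([
    {
      name: "LLAMA_CLOUD_API_KEY",
      description: "The Llama Cloud API key.",
      value: "llx-...",
    },
  ]),
);
```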
......
@@ -12,6 +12,7 @@ import {
   InstallTemplateArgs,
   TemplateDataSource,
   TemplateVectorDB,
+  WebSourceConfig,
 } from "./types";
interface Dependency {
@@ -237,12 +238,13 @@ export const installPythonTemplate = async ({
       parents: true,
       cwd: path.join(compPath, "engines", "python", "agent"),
     });
-    // Write tools_config.json
+    // Write tool configs
     const configContent: Record<string, any> = {};
     tools.forEach((tool) => {
      configContent[tool.name] = tool.config ?? {};
     });
-    const configFilePath = path.join(root, "tools_config.json");
+    const configFilePath = path.join(root, "config/tools.json");
+    await fs.mkdir(path.join(root, "config"), { recursive: true });
     await fs.writeFile(
       configFilePath,
       JSON.stringify(configContent, null, 2),
@@ -254,6 +256,30 @@
     });
   }
+    // Write loader configs
+    if (dataSource?.type === "web") {
+      const config = dataSource.config as WebSourceConfig[];
+      const webLoaderConfig = config.map((c) => {
+        return {
+          base_url: c.baseUrl,
+          prefix: c.prefix || c.baseUrl,
+          depth: c.depth || 1,
+        };
+      });
+      const loaderConfigPath = path.join(root, "config/loaders.json");
+      await fs.mkdir(path.join(root, "config"), { recursive: true });
+      await fs.writeFile(
+        loaderConfigPath,
+        JSON.stringify(
+          {
+            web: webLoaderConfig,
+          },
+          null,
+          2,
+        ),
+      );
+    }
+
     const dataSourceType = dataSource?.type;
     if (dataSourceType !== undefined && dataSourceType !== "none") {
       let loaderFolder: string;
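To make the output concrete: a sketch of the `config/loaders.json` this block writes, assuming the user added two example websites with the defaults (prefix falls back to the base URL, depth to 1):

```ts
// Sketch of the config/loaders.json produced by the block above for a
// project configured with two web sources (URLs are examples).
const loadersJson = {
  web: [
    {
      base_url: "https://www.llamaindex.ai",
      prefix: "https://www.llamaindex.ai",
      depth: 1,
    },
    {
      base_url: "https://docs.llamaindex.ai",
      prefix: "https://docs.llamaindex.ai",
      depth: 1,
    },
  ],
};

console.log(JSON.stringify(loadersJson, null, 2));
```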
......
@@ -24,9 +24,11 @@ export type FileSourceConfig = {
 };

 export type WebSourceConfig = {
   baseUrl?: string;
+  prefix?: string;
   depth?: number;
 };

-export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig;
+export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig[];

 export type CommunityProjectConfig = {
   owner: string;
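Since `TemplateDataSourceConfig` now accepts an array, a web data source carries one `WebSourceConfig` per URL. A small sketch (types restated locally; URLs are example values):

```ts
type WebSourceConfig = {
  baseUrl?: string;
  prefix?: string;
  depth?: number;
};

// One entry per website the user added during setup (example values).
const webConfig: WebSourceConfig[] = [
  { baseUrl: "https://www.llamaindex.ai", prefix: "https://www.llamaindex.ai", depth: 1 },
  { baseUrl: "https://blog.llamaindex.ai", prefix: "https://blog.llamaindex.ai", depth: 1 },
];
```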
......
......@@ -11,6 +11,7 @@ import {
FileSourceConfig,
TemplateDataSourceType,
TemplateFramework,
WebSourceConfig,
} from "./helpers";
import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant";
import { templatesDir } from "./helpers/dir";
@@ -755,35 +756,53 @@ export const askQuestions = async (
}
if (program.dataSource?.type === "web" && program.framework === "fastapi") {
-    let { baseUrl } = await prompts(
-      {
-        type: "text",
-        name: "baseUrl",
-        message: "Please provide base URL of the website:",
-        initial: "https://www.llamaindex.ai",
-      },
-      handlers,
-    );
-    try {
-      if (!baseUrl.includes("://")) {
-        baseUrl = `https://${baseUrl}`;
-      }
-      const checkUrl = new URL(baseUrl);
-      if (checkUrl.protocol !== "https:" && checkUrl.protocol !== "http:") {
-        throw new Error("Invalid protocol");
-      }
-    } catch (error) {
-      console.log(
-        red(
-          "Invalid URL provided! Please provide a valid URL (e.g. https://www.llamaindex.ai)",
-        ),
-      );
-      process.exit(1);
-    }
-    program.dataSource.config = {
-      baseUrl: baseUrl,
-      depth: 1,
-    };
+    program.dataSource.config = [];
+    while (true) {
+      const questions: any[] = [
+        {
+          type: "text",
+          name: "baseUrl",
+          message: "Please provide base URL of the website: ",
+          initial: "https://www.llamaindex.ai",
+          validate: (value: string) => {
+            if (!value.includes("://")) {
+              value = `https://${value}`;
+            }
+            const urlObj = new URL(value);
+            if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") {
+              return `URL=${value} has invalid protocol, only allow http or https`;
+            }
+            // Check duplicated URL
+            if (
+              (program.dataSource?.config as WebSourceConfig[]).some(
+                (c) => c.baseUrl === value,
+              )
+            ) {
+              return `URL=${value} is already added. Please provide a different URL.`;
+            }
+            return true;
+          },
+        },
+        {
+          type: "toggle",
+          name: "shouldContinue",
+          message: "Would you like to add another website?",
+          initial: false,
+          active: "Yes",
+          inactive: "No",
+        },
+      ];
+      let { shouldContinue, baseUrl } = await prompts(questions, handlers);
+      program.dataSource.config.push({
+        baseUrl: baseUrl,
+        prefix: baseUrl,
+        depth: 1,
+      });
+      if (!shouldContinue) {
+        break;
+      }
+    }
}
if (program.engine !== "simple" && !program.vectorDb) {
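The prompt loop above repeats until the user toggles No, validating each URL before it is pushed. A standalone sketch of the same validation logic against a plain array (names and sample URLs are illustrative):

```ts
const existing: { baseUrl?: string }[] = [
  { baseUrl: "https://www.llamaindex.ai" },
];

// Mirrors the validate callback: normalize the scheme, allow only
// http/https, and reject URLs that are already in the list.
function validateBaseUrl(value: string): true | string {
  if (!value.includes("://")) {
    value = `https://${value}`;
  }
  const urlObj = new URL(value); // throws on unparseable input, as in the original
  if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") {
    return `URL=${value} has invalid protocol, only allow http or https`;
  }
  if (existing.some((c) => c.baseUrl === value)) {
    return `URL=${value} is already added. Please provide a different URL.`;
  }
  return true;
}

console.log(validateBaseUrl("www.llamaindex.ai")); // duplicate after normalization
console.log(validateBaseUrl("https://docs.llamaindex.ai")); // true
```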
......
@@ -26,7 +26,7 @@ class ToolFactory:
     @staticmethod
     def from_env() -> list[FunctionTool]:
         tools = []
-        with open("tools_config.json", "r") as f:
+        with open("config/tools.json", "r") as f:
            tool_configs = json.load(f)
        for name, config in tool_configs.items():
            tools += ToolFactory.create_tool(name, **config)
......
-import os
+import json
+
+from pydantic import BaseModel, Field
 from llama_index.readers.web import WholeSiteReader


-def get_documents():
-    # Initialize the scraper with a prefix URL and maximum depth
-    scraper = WholeSiteReader(
-        prefix=os.environ.get("URL_PREFIX"), max_depth=int(os.environ.get("MAX_DEPTH"))
-    )
-    # Start scraping from a base URL
-    documents = scraper.load_data(base_url=os.environ.get("BASE_URL"))
-    return documents
+class WebLoaderConfig(BaseModel):
+    base_url: str
+    prefix: str
+    max_depth: int = Field(default=1, ge=0)
+
+
+def load_configs():
+    with open("config/loaders.json") as f:
+        configs = json.load(f)
+    web_config = configs.get("web", None)
+    if web_config is None:
+        raise ValueError("No web config found in loaders.json")
+    return [WebLoaderConfig(**config) for config in web_config]
+
+
+def get_documents():
+    web_config = load_configs()
+    documents = []
+    for entry in web_config:
+        scraper = WholeSiteReader(
+            prefix=entry.prefix,
+            max_depth=entry.max_depth,
+        )
+        documents.extend(scraper.load_data(entry.base_url))
+    return documents
@@ -19,6 +19,8 @@ Example `backend/.env` file:
 OPENAI_API_KEY=<openai_api_key>
 ```

+If you are using any tools or data sources, you can update their config files in the `config` folder.
+
 Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):

 ```
......
@@ -19,6 +19,8 @@ Example `.env` file:
 OPENAI_API_KEY=<openai_api_key>
 ```

+If you are using any tools or data sources, you can update their config files in the `config` folder.
+
 Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):

 ```
......