From e8db041d5867571cb4ab84c50d97d2ee6dd3a755 Mon Sep 17 00:00:00 2001 From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:43:50 +0700 Subject: [PATCH] feat: Add multiple URLs reader (#18) --- .changeset/dry-seals-sniff.md | 5 ++ create-app.ts | 4 +- helpers/env-variables.ts | 56 +++----------- helpers/python.ts | 30 +++++++- helpers/types.ts | 4 +- questions.ts | 73 ++++++++++++------- .../components/engines/python/agent/tools.py | 2 +- .../components/loaders/python/web/loader.py | 32 ++++++-- .../types/simple/fastapi/README-template.md | 2 + .../streaming/fastapi/README-template.md | 2 + 10 files changed, 123 insertions(+), 87 deletions(-) create mode 100644 .changeset/dry-seals-sniff.md diff --git a/.changeset/dry-seals-sniff.md b/.changeset/dry-seals-sniff.md new file mode 100644 index 00000000..459a860c --- /dev/null +++ b/.changeset/dry-seals-sniff.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Add multiple URLs for web data source diff --git a/create-app.ts b/create-app.ts index e453deb1..3379ceec 100644 --- a/create-app.ts +++ b/create-app.ts @@ -130,8 +130,8 @@ export async function createApp({ console.log( yellow( `You have selected tools that require configuration. Please configure them in the ${terminalLink( - "tools_config.json", - `file://${root}/tools_config.json`, + "config/tools.json", + `file://${root}/config/tools.json`, )} file.`, ), ); diff --git a/helpers/env-variables.ts b/helpers/env-variables.ts index b65b0df2..06d27197 100644 --- a/helpers/env-variables.ts +++ b/helpers/env-variables.ts @@ -1,11 +1,9 @@ import fs from "fs/promises"; import path from "path"; import { - FileSourceConfig, TemplateDataSource, TemplateFramework, TemplateVectorDB, - WebSourceConfig, } from "./types"; type EnvVar = { @@ -100,48 +98,6 @@ const getVectorDBEnvs = (vectorDb: TemplateVectorDB) => { } }; -const getDataSourceEnvs = ( - dataSource: TemplateDataSource, - llamaCloudKey?: string, -) => { - switch (dataSource.type) { - case "web": - const config = dataSource.config as WebSourceConfig; - return [ - { - name: "BASE_URL", - description: "The base URL to start web scraping.", - value: config.baseUrl, - }, - { - name: "URL_PREFIX", - description: "The prefix of the URL to start web scraping.", - value: config.baseUrl, - }, - { - name: "MAX_DEPTH", - description: "The maximum depth to scrape.", - value: config.depth?.toString(), - }, - ]; - case "file": - case "folder": - return [ - ...((dataSource?.config as FileSourceConfig).useLlamaParse - ? [ - { - name: "LLAMA_CLOUD_API_KEY", - description: `The Llama Cloud API key.`, - value: llamaCloudKey, - }, - ] - : []), - ]; - default: - return []; - } -}; - export const createBackendEnvFile = async ( root: string, opts: { @@ -173,9 +129,15 @@ export const createBackendEnvFile = async ( // Add vector database environment variables ...(opts.vectorDb ? getVectorDBEnvs(opts.vectorDb) : []), - // Add data source environment variables - ...(opts.dataSource - ? getDataSourceEnvs(opts.dataSource, opts.llamaCloudKey) + // Add LlamaCloud API key + ...(opts.llamaCloudKey + ? [ + { + name: "LLAMA_CLOUD_API_KEY", + description: `The Llama Cloud API key.`, + value: opts.llamaCloudKey, + }, + ] : []), ]; let envVars: EnvVar[] = []; diff --git a/helpers/python.ts b/helpers/python.ts index 60b11980..9687f371 100644 --- a/helpers/python.ts +++ b/helpers/python.ts @@ -12,6 +12,7 @@ import { InstallTemplateArgs, TemplateDataSource, TemplateVectorDB, + WebSourceConfig, } from "./types"; interface Dependency { @@ -237,12 +238,13 @@ export const installPythonTemplate = async ({ parents: true, cwd: path.join(compPath, "engines", "python", "agent"), }); - // Write tools_config.json + // Write tool configs const configContent: Record<string, any> = {}; tools.forEach((tool) => { configContent[tool.name] = tool.config ?? {}; }); - const configFilePath = path.join(root, "tools_config.json"); + const configFilePath = path.join(root, "config/tools.json"); + await fs.mkdir(path.join(root, "config"), { recursive: true }); await fs.writeFile( configFilePath, JSON.stringify(configContent, null, 2), @@ -254,6 +256,30 @@ export const installPythonTemplate = async ({ }); } + // Write loader configs + if (dataSource?.type === "web") { + const config = dataSource.config as WebSourceConfig[]; + const webLoaderConfig = config.map((c) => { + return { + base_url: c.baseUrl, + prefix: c.prefix || c.baseUrl, + depth: c.depth || 1, + }; + }); + const loaderConfigPath = path.join(root, "config/loaders.json"); + await fs.mkdir(path.join(root, "config"), { recursive: true }); + await fs.writeFile( + loaderConfigPath, + JSON.stringify( + { + web: webLoaderConfig, + }, + null, + 2, + ), + ); + } + const dataSourceType = dataSource?.type; if (dataSourceType !== undefined && dataSourceType !== "none") { let loaderFolder: string; diff --git a/helpers/types.ts b/helpers/types.ts index f8db9a28..d093c099 100644 --- a/helpers/types.ts +++ b/helpers/types.ts @@ -24,9 +24,11 @@ export type FileSourceConfig = { }; export type WebSourceConfig = { baseUrl?: string; + prefix?: string; depth?: number; }; -export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig; + +export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig[]; export type CommunityProjectConfig = { owner: string; diff --git a/questions.ts b/questions.ts index cb97fae7..0651f3d1 100644 --- a/questions.ts +++ b/questions.ts @@ -11,6 +11,7 @@ import { FileSourceConfig, TemplateDataSourceType, TemplateFramework, + WebSourceConfig, } from "./helpers"; import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant"; import { templatesDir } from "./helpers/dir"; @@ -755,35 +756,53 @@ export const askQuestions = async ( } if (program.dataSource?.type === "web" && program.framework === "fastapi") { - let { baseUrl } = await prompts( - { - type: "text", - name: "baseUrl", - message: "Please provide base URL of the website:", - initial: "https://www.llamaindex.ai", - }, - handlers, - ); - try { - if (!baseUrl.includes("://")) { - baseUrl = `https://${baseUrl}`; - } - const checkUrl = new URL(baseUrl); - if (checkUrl.protocol !== "https:" && checkUrl.protocol !== "http:") { - throw new Error("Invalid protocol"); + program.dataSource.config = []; + + while (true) { + const questions: any[] = [ + { + type: "text", + name: "baseUrl", + message: "Please provide base URL of the website: ", + initial: "https://www.llamaindex.ai", + validate: (value: string) => { + if (!value.includes("://")) { + value = `https://${value}`; + } + const urlObj = new URL(value); + if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") { + return `URL=${value} has invalid protocol, only allow http or https`; + } + // Check duplicated URL + if ( + (program.dataSource?.config as WebSourceConfig[]).some( + (c) => c.baseUrl === value, + ) + ) { + return `URL=${value} is already added. Please provide a different URL.`; + } + return true; + }, + }, + { + type: "toggle", + name: "shouldContinue", + message: "Would you like to add another website?", + initial: false, + active: "Yes", + inactive: "No", + }, + ]; + let { shouldContinue, baseUrl } = await prompts(questions, handlers); + program.dataSource.config.push({ + baseUrl: baseUrl, + prefix: baseUrl, + depth: 1, + }); + if (!shouldContinue) { + break; } - } catch (error) { - console.log( - red( - "Invalid URL provided! Please provide a valid URL (e.g. https://www.llamaindex.ai)", - ), - ); - process.exit(1); } - program.dataSource.config = { - baseUrl: baseUrl, - depth: 1, - }; } if (program.engine !== "simple" && !program.vectorDb) { diff --git a/templates/components/engines/python/agent/tools.py b/templates/components/engines/python/agent/tools.py index fafc1fcc..a2ad3b96 100644 --- a/templates/components/engines/python/agent/tools.py +++ b/templates/components/engines/python/agent/tools.py @@ -26,7 +26,7 @@ class ToolFactory: @staticmethod def from_env() -> list[FunctionTool]: tools = [] - with open("tools_config.json", "r") as f: + with open("config/tools.json", "r") as f: tool_configs = json.load(f) for name, config in tool_configs.items(): tools += ToolFactory.create_tool(name, **config) diff --git a/templates/components/loaders/python/web/loader.py b/templates/components/loaders/python/web/loader.py index bc6d0496..096e3c97 100644 --- a/templates/components/loaders/python/web/loader.py +++ b/templates/components/loaders/python/web/loader.py @@ -1,13 +1,31 @@ import os +import json +from pydantic import BaseModel, Field from llama_index.readers.web import WholeSiteReader -def get_documents(): - # Initialize the scraper with a prefix URL and maximum depth - scraper = WholeSiteReader( - prefix=os.environ.get("URL_PREFIX"), max_depth=int(os.environ.get("MAX_DEPTH")) - ) - # Start scraping from a base URL - documents = scraper.load_data(base_url=os.environ.get("BASE_URL")) +class WebLoaderConfig(BaseModel): + base_url: str + prefix: str + max_depth: int = Field(default=1, ge=0) + + +def load_configs(): + with open("config/loaders.json") as f: + configs = json.load(f) + web_config = configs.get("web", None) + if web_config is None: + raise ValueError("No web config found in loaders.json") + return [WebLoaderConfig(**config) for config in web_config] + +def get_documents(): + web_config = load_configs() + documents = [] + for entry in web_config: + scraper = WholeSiteReader( + prefix=entry.prefix, + max_depth=entry.max_depth, + ) + documents.extend(scraper.load_data(entry.base_url)) return documents diff --git a/templates/types/simple/fastapi/README-template.md b/templates/types/simple/fastapi/README-template.md index b1a35c42..69ff766c 100644 --- a/templates/types/simple/fastapi/README-template.md +++ b/templates/types/simple/fastapi/README-template.md @@ -19,6 +19,8 @@ Example `backend/.env` file: OPENAI_API_KEY=<openai_api_key> ``` +If you are using any tools or data sources, you can update their config files in the `config` folder. + Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step): ``` diff --git a/templates/types/streaming/fastapi/README-template.md b/templates/types/streaming/fastapi/README-template.md index 35ef1125..2bea7683 100644 --- a/templates/types/streaming/fastapi/README-template.md +++ b/templates/types/streaming/fastapi/README-template.md @@ -19,6 +19,8 @@ Example `.env` file: OPENAI_API_KEY=<openai_api_key> ``` +If you are using any tools or data sources, you can update their config files in the `config` folder. + Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step): ``` -- GitLab