diff --git a/.changeset/dry-seals-sniff.md b/.changeset/dry-seals-sniff.md
new file mode 100644
index 0000000000000000000000000000000000000000..459a860c59ac1326bd93515cae5cf6979d66dc93
--- /dev/null
+++ b/.changeset/dry-seals-sniff.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Add multiple URLs for web data source
diff --git a/create-app.ts b/create-app.ts
index e453deb1456b9401b01ab46d57cc94c51eb7ebd5..3379ceec7c2ac24e5d96ea58b797cdfd14e0de82 100644
--- a/create-app.ts
+++ b/create-app.ts
@@ -130,8 +130,8 @@ export async function createApp({
     console.log(
       yellow(
         `You have selected tools that require configuration. Please configure them in the ${terminalLink(
-          "tools_config.json",
-          `file://${root}/tools_config.json`,
+          "config/tools.json",
+          `file://${root}/config/tools.json`,
         )} file.`,
       ),
     );
diff --git a/helpers/env-variables.ts b/helpers/env-variables.ts
index b65b0df29d1e986abd9574ec8ba73490314e2761..06d27197efbc6354bfc76555f4278b184166125c 100644
--- a/helpers/env-variables.ts
+++ b/helpers/env-variables.ts
@@ -1,11 +1,9 @@
 import fs from "fs/promises";
 import path from "path";
 import {
-  FileSourceConfig,
   TemplateDataSource,
   TemplateFramework,
   TemplateVectorDB,
-  WebSourceConfig,
 } from "./types";
 
 type EnvVar = {
@@ -100,48 +98,6 @@ const getVectorDBEnvs = (vectorDb: TemplateVectorDB) => {
   }
 };
 
-const getDataSourceEnvs = (
-  dataSource: TemplateDataSource,
-  llamaCloudKey?: string,
-) => {
-  switch (dataSource.type) {
-    case "web":
-      const config = dataSource.config as WebSourceConfig;
-      return [
-        {
-          name: "BASE_URL",
-          description: "The base URL to start web scraping.",
-          value: config.baseUrl,
-        },
-        {
-          name: "URL_PREFIX",
-          description: "The prefix of the URL to start web scraping.",
-          value: config.baseUrl,
-        },
-        {
-          name: "MAX_DEPTH",
-          description: "The maximum depth to scrape.",
-          value: config.depth?.toString(),
-        },
-      ];
-    case "file":
-    case "folder":
-      return [
-        ...((dataSource?.config as FileSourceConfig).useLlamaParse
-          ? [
-              {
-                name: "LLAMA_CLOUD_API_KEY",
-                description: `The Llama Cloud API key.`,
-                value: llamaCloudKey,
-              },
-            ]
-          : []),
-      ];
-    default:
-      return [];
-  }
-};
-
 export const createBackendEnvFile = async (
   root: string,
   opts: {
@@ -173,9 +129,15 @@ export const createBackendEnvFile = async (
 
     // Add vector database environment variables
     ...(opts.vectorDb ? getVectorDBEnvs(opts.vectorDb) : []),
-    // Add data source environment variables
-    ...(opts.dataSource
-      ? getDataSourceEnvs(opts.dataSource, opts.llamaCloudKey)
+    // Add LlamaCloud API key
+    ...(opts.llamaCloudKey
+      ? [
+          {
+            name: "LLAMA_CLOUD_API_KEY",
+            description: `The Llama Cloud API key.`,
+            value: opts.llamaCloudKey,
+          },
+        ]
       : []),
   ];
   let envVars: EnvVar[] = [];
diff --git a/helpers/python.ts b/helpers/python.ts
index 60b119800a3a8bd5f5f254ac592600e8ebee8db0..9687f3716810b77699bf6ef4c8f67620a28401f7 100644
--- a/helpers/python.ts
+++ b/helpers/python.ts
@@ -12,6 +12,7 @@ import {
   InstallTemplateArgs,
   TemplateDataSource,
   TemplateVectorDB,
+  WebSourceConfig,
 } from "./types";
 
 interface Dependency {
@@ -237,12 +238,13 @@ export const installPythonTemplate = async ({
       parents: true,
       cwd: path.join(compPath, "engines", "python", "agent"),
     });
-    // Write tools_config.json
+    // Write tool configs
     const configContent: Record<string, any> = {};
     tools.forEach((tool) => {
       configContent[tool.name] = tool.config ?? {};
     });
-    const configFilePath = path.join(root, "tools_config.json");
+    const configFilePath = path.join(root, "config/tools.json");
+    await fs.mkdir(path.join(root, "config"), { recursive: true });
     await fs.writeFile(
       configFilePath,
       JSON.stringify(configContent, null, 2),
@@ -254,6 +256,30 @@ export const installPythonTemplate = async ({
     });
   }
 
+  // Write loader configs (keys must match the WebLoaderConfig model in loader.py)
+  if (dataSource?.type === "web") {
+    const config = dataSource.config as WebSourceConfig[];
+    const webLoaderConfig = config.map((c) => {
+      return {
+        base_url: c.baseUrl,
+        prefix: c.prefix || c.baseUrl,
+        max_depth: c.depth ?? 1,
+      };
+    });
+    const loaderConfigPath = path.join(root, "config/loaders.json");
+    await fs.mkdir(path.join(root, "config"), { recursive: true });
+    await fs.writeFile(
+      loaderConfigPath,
+      JSON.stringify(
+        {
+          web: webLoaderConfig,
+        },
+        null,
+        2,
+      ),
+    );
+  }
+
   const dataSourceType = dataSource?.type;
   if (dataSourceType !== undefined && dataSourceType !== "none") {
     let loaderFolder: string;
diff --git a/helpers/types.ts b/helpers/types.ts
index f8db9a28d76e29356bdbaa4ac43099d25cf77aa3..d093c09910f6a1289d3a2a7aa040ae85fbb8125d 100644
--- a/helpers/types.ts
+++ b/helpers/types.ts
@@ -24,9 +24,11 @@ export type FileSourceConfig = {
 };
 export type WebSourceConfig = {
   baseUrl?: string;
+  prefix?: string;
   depth?: number;
 };
-export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig;
+
+export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig[];
 
 export type CommunityProjectConfig = {
   owner: string;
diff --git a/questions.ts b/questions.ts
index cb97fae743ffe0fa46f2c3e7aa3fbdc63304b710..0651f3d116fac6927840286c5b63c4a2a5bedfef 100644
--- a/questions.ts
+++ b/questions.ts
@@ -11,6 +11,7 @@ import {
   FileSourceConfig,
   TemplateDataSourceType,
   TemplateFramework,
+  WebSourceConfig,
 } from "./helpers";
 import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant";
 import { templatesDir } from "./helpers/dir";
@@ -755,35 +756,64 @@ export const askQuestions = async (
   }
 
   if (program.dataSource?.type === "web" && program.framework === "fastapi") {
-    let { baseUrl } = await prompts(
-      {
-        type: "text",
-        name: "baseUrl",
-        message: "Please provide base URL of the website:",
-        initial: "https://www.llamaindex.ai",
-      },
-      handlers,
-    );
-    try {
-      if (!baseUrl.includes("://")) {
-        baseUrl = `https://${baseUrl}`;
-      }
-      const checkUrl = new URL(baseUrl);
-      if (checkUrl.protocol !== "https:" && checkUrl.protocol !== "http:") {
-        throw new Error("Invalid protocol");
+    program.dataSource.config = [];
+    // Trim the input and default to https when the user omits the protocol
+    const normalizeUrl = (url: string) => {
+      const trimmed = url.trim();
+      return trimmed.includes("://") ? trimmed : `https://${trimmed}`;
+    };
+
+    while (true) {
+      const questions: any[] = [
+        {
+          type: "text",
+          name: "baseUrl",
+          message: "Please provide base URL of the website: ",
+          initial: "https://www.llamaindex.ai",
+          validate: (value: string) => {
+            const candidate = normalizeUrl(value);
+            let urlObj: URL;
+            try {
+              urlObj = new URL(candidate);
+            } catch {
+              // new URL() throws on malformed input; re-prompt instead of crashing
+              return `URL=${candidate} is invalid. Please provide a valid URL (e.g. https://www.llamaindex.ai)`;
+            }
+            if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") {
+              return `URL=${candidate} has invalid protocol, only allow http or https`;
+            }
+            // Check duplicated URL
+            if (
+              (program.dataSource?.config as WebSourceConfig[]).some(
+                (c) => c.baseUrl === candidate,
+              )
+            ) {
+              return `URL=${candidate} is already added. Please provide a different URL.`;
+            }
+            return true;
+          },
+        },
+        {
+          type: "toggle",
+          name: "shouldContinue",
+          message: "Would you like to add another website?",
+          initial: false,
+          active: "Yes",
+          inactive: "No",
+        },
+      ];
+      const { shouldContinue, baseUrl } = await prompts(questions, handlers);
+      // Store the normalized form so it matches what was validated and de-duplicated
+      const normalizedUrl = normalizeUrl(baseUrl);
+      program.dataSource.config.push({
+        baseUrl: normalizedUrl,
+        prefix: normalizedUrl,
+        depth: 1,
+      });
+      if (!shouldContinue) {
+        break;
       }
-    } catch (error) {
-      console.log(
-        red(
-          "Invalid URL provided! Please provide a valid URL (e.g. https://www.llamaindex.ai)",
-        ),
-      );
-      process.exit(1);
     }
-    program.dataSource.config = {
-      baseUrl: baseUrl,
-      depth: 1,
-    };
   }
 
   if (program.engine !== "simple" && !program.vectorDb) {
diff --git a/templates/components/engines/python/agent/tools.py b/templates/components/engines/python/agent/tools.py
index fafc1fcc874477893e7ecc4ae9add4bcb5e062b1..a2ad3b9697c994ca56c13cb9f72c843054eef6cf 100644
--- a/templates/components/engines/python/agent/tools.py
+++ b/templates/components/engines/python/agent/tools.py
@@ -26,7 +26,7 @@ class ToolFactory:
     @staticmethod
     def from_env() -> list[FunctionTool]:
         tools = []
-        with open("tools_config.json", "r") as f:
+        with open("config/tools.json", "r") as f:
             tool_configs = json.load(f)
             for name, config in tool_configs.items():
                 tools += ToolFactory.create_tool(name, **config)
diff --git a/templates/components/loaders/python/web/loader.py b/templates/components/loaders/python/web/loader.py
index bc6d0496dda8fc7a9e0b1a79e1a4084fec6a6cb6..096e3c9701fcb1b1ce4f584e19addd2aac7bb6dc 100644
--- a/templates/components/loaders/python/web/loader.py
+++ b/templates/components/loaders/python/web/loader.py
@@ -1,13 +1,31 @@
 import os
+import json
+from pydantic import BaseModel, Field
 from llama_index.readers.web import WholeSiteReader
 
 
-def get_documents():
-    # Initialize the scraper with a prefix URL and maximum depth
-    scraper = WholeSiteReader(
-        prefix=os.environ.get("URL_PREFIX"), max_depth=int(os.environ.get("MAX_DEPTH"))
-    )
-    # Start scraping from a base URL
-    documents = scraper.load_data(base_url=os.environ.get("BASE_URL"))
+class WebLoaderConfig(BaseModel):
+    base_url: str
+    prefix: str
+    max_depth: int = Field(default=1, ge=0)
+
+
+def load_configs():
+    with open("config/loaders.json") as f:
+        configs = json.load(f)
+    web_config = configs.get("web", None)
+    if web_config is None:
+        raise ValueError("No web config found in loaders.json")
+    return [WebLoaderConfig(**config) for config in web_config]
+
+def get_documents():
+    web_config = load_configs()
+    documents = []
+    for entry in web_config:
+        scraper = WholeSiteReader(
+            prefix=entry.prefix,
+            max_depth=entry.max_depth,
+        )
+        documents.extend(scraper.load_data(entry.base_url))
 
     return documents
diff --git a/templates/types/simple/fastapi/README-template.md b/templates/types/simple/fastapi/README-template.md
index b1a35c42e80331465066fcf5d7200ad2fdb7f92d..69ff766c0aaeae4f6dda0999c6bec81d71b8a061 100644
--- a/templates/types/simple/fastapi/README-template.md
+++ b/templates/types/simple/fastapi/README-template.md
@@ -19,6 +19,8 @@ Example `backend/.env` file:
 OPENAI_API_KEY=<openai_api_key>
 ```
 
+If you are using any tools or data sources, you can update their config files in the `config` folder.
+
 Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):
 
 ```
diff --git a/templates/types/streaming/fastapi/README-template.md b/templates/types/streaming/fastapi/README-template.md
index 35ef1125adf9044ee229105362e93fe07766d73a..2bea76831042acb84aacd4a9c68285046d73f3bd 100644
--- a/templates/types/streaming/fastapi/README-template.md
+++ b/templates/types/streaming/fastapi/README-template.md
@@ -19,6 +19,8 @@ Example `.env` file:
 OPENAI_API_KEY=<openai_api_key>
 ```
 
+If you are using any tools or data sources, you can update their config files in the `config` folder.
+
 Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):
 
 ```