From b7b30a53004ce47531e57c66765eb86b91bb4a05 Mon Sep 17 00:00:00 2001 From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com> Date: Tue, 30 Jan 2024 14:17:33 +0700 Subject: [PATCH] Feat: add + fix chat with web (#481) --- create-app.ts | 4 +- helpers/index.ts | 110 +++++++++++------- helpers/python.ts | 13 ++- helpers/types.ts | 14 ++- index.ts | 3 +- questions.ts | 80 ++++++++++--- .../components/loaders/python/file/loader.py | 8 ++ .../components/loaders/python/web/loader.py | 15 +++ .../vectordbs/python/mongo/generate.py | 3 +- .../vectordbs/python/none/generate.py | 3 +- .../vectordbs/python/pg/generate.py | 3 +- 11 files changed, 190 insertions(+), 66 deletions(-) create mode 100644 templates/components/loaders/python/file/loader.py create mode 100644 templates/components/loaders/python/web/loader.py diff --git a/create-app.ts b/create-app.ts index ad3343d3..6c684d9d 100644 --- a/create-app.ts +++ b/create-app.ts @@ -36,7 +36,7 @@ export async function createApp({ vectorDb, externalPort, postInstallAction, - contextFile, + dataSource, }: InstallAppArgs): Promise<void> { const root = path.resolve(appPath); @@ -80,7 +80,7 @@ export async function createApp({ vectorDb, externalPort, postInstallAction, - contextFile, + dataSource, }; if (frontend) { diff --git a/helpers/index.ts b/helpers/index.ts index 40c1b4eb..51d917f4 100644 --- a/helpers/index.ts +++ b/helpers/index.ts @@ -13,9 +13,10 @@ import { installPythonTemplate } from "./python"; import { downloadAndExtractRepo } from "./repo"; import { InstallTemplateArgs, - TemplateEngine, + TemplateDataSource, TemplateFramework, TemplateVectorDB, + WebSourceConfig, } from "./types"; import { installTSTemplate } from "./typescript"; @@ -26,6 +27,7 @@ const createEnvLocalFile = async ( vectorDb?: TemplateVectorDB; model?: string; framework?: TemplateFramework; + dataSource?: TemplateDataSource; }, ) => { const envFileName = ".env"; @@ -58,48 +60,30 @@ const createEnvLocalFile = async ( } } + switch (opts?.dataSource?.type) { + case "web": { + let webConfig = opts?.dataSource.config as WebSourceConfig; + content += `# web loader config\n`; + content += `BASE_URL=${webConfig.baseUrl}\n`; + content += `URL_PREFIX=${webConfig.baseUrl}\n`; + content += `MAX_DEPTH=${webConfig.depth}\n`; + break; + } + } + if (content) { await fs.writeFile(path.join(root, envFileName), content); console.log(`Created '${envFileName}' file. Please check the settings.`); } }; -const copyTestData = async ( - root: string, +const installDependencies = async ( framework: TemplateFramework, packageManager?: PackageManager, - engine?: TemplateEngine, openAiKey?: string, vectorDb?: TemplateVectorDB, - contextFile?: string, - // eslint-disable-next-line max-params ) => { - if (engine === "context") { - const destPath = path.join(root, "data"); - if (contextFile) { - console.log(`\nCopying provided file to ${cyan(destPath)}\n`); - await fs.mkdir(destPath, { recursive: true }); - await fs.copyFile( - contextFile, - path.join(destPath, path.basename(contextFile)), - ); - } else { - const srcPath = path.join( - __dirname, - "..", - "templates", - "components", - "data", - ); - console.log(`\nCopying test data to ${cyan(destPath)}\n`); - await copy("**", destPath, { - parents: true, - cwd: srcPath, - }); - } - } - - if (packageManager && engine === "context") { + if (packageManager) { const runGenerate = `${cyan( framework === "fastapi" ? "poetry run python app/engine/generate.py" @@ -108,9 +92,14 @@ const copyTestData = async ( const hasOpenAiKey = openAiKey || process.env["OPENAI_API_KEY"]; const hasVectorDb = vectorDb && vectorDb !== "none"; if (framework === "fastapi") { - if (hasOpenAiKey && vectorDb === "none" && isHavingPoetryLockFile()) { + if (hasOpenAiKey && !hasVectorDb && isHavingPoetryLockFile()) { console.log(`Running ${runGenerate} to generate the context data.`); - tryPoetryRun("python app/engine/generate.py"); + let result = tryPoetryRun("python app/engine/generate.py"); + if (!result) { + console.log(`Failed to run ${runGenerate}.`); + process.exit(1); + } + console.log(`Generated context data`); return; } } else { @@ -131,6 +120,31 @@ const copyTestData = async ( } }; +const copyContextData = async (root: string, contextFile?: string) => { + const destPath = path.join(root, "data"); + if (contextFile) { + console.log(`\nCopying provided file to ${cyan(destPath)}\n`); + await fs.mkdir(destPath, { recursive: true }); + await fs.copyFile( + contextFile, + path.join(destPath, path.basename(contextFile)), + ); + } else { + const srcPath = path.join( + __dirname, + "..", + "templates", + "components", + "data", + ); + console.log(`\nCopying test data to ${cyan(destPath)}\n`); + await copy("**", destPath, { + parents: true, + cwd: srcPath, + }); + } +}; + const installCommunityProject = async ({ root, communityProjectPath, @@ -174,18 +188,26 @@ export const installTemplate = async ( vectorDb: props.vectorDb, model: props.model, framework: props.framework, + dataSource: props.dataSource, }); - // Copy test pdf file - await copyTestData( - props.root, - props.framework, - props.packageManager, - props.engine, - props.openAiKey, - props.vectorDb, - props.contextFile, - ); + if (props.engine === "context") { + if ( + props.dataSource?.type === "file" && + "contextFile" in props.dataSource.config + ) { + await copyContextData(props.root, props.dataSource.config.contextFile); + } else { + await copyContextData(props.root); + } + await installDependencies( + props.framework, + props.packageManager, + props.openAiKey, + props.vectorDb, + ); + console.log("installed Dependencies"); + } } else { // this is a frontend for a full-stack app, create .env file with model information const content = `MODEL=${props.model}\nNEXT_PUBLIC_MODEL=${props.model}\n`; diff --git a/helpers/python.ts b/helpers/python.ts index 186062ec..ca5f344d 100644 --- a/helpers/python.ts +++ b/helpers/python.ts @@ -126,6 +126,7 @@ export const installPythonTemplate = async ({ framework, engine, vectorDb, + dataSource, postInstallAction, }: Pick< InstallTemplateArgs, @@ -134,6 +135,7 @@ export const installPythonTemplate = async ({ | "template" | "engine" | "vectorDb" + | "dataSource" | "postInstallAction" >) => { console.log("\nInitializing Python project with template:", template, "\n"); @@ -167,16 +169,25 @@ export const installPythonTemplate = async ({ if (engine === "context") { const compPath = path.join(__dirname, "..", "templates", "components"); + let vectorDbDirName = vectorDb ?? "none"; const VectorDBPath = path.join( compPath, "vectordbs", "python", - vectorDb || "none", + vectorDbDirName, ); + const enginePath = path.join(root, "app", "engine"); + await copy("**", path.join(root, "app", "engine"), { parents: true, cwd: VectorDBPath, }); + let dataSourceDir = dataSource?.type ?? "file"; + const loaderPath = path.join(compPath, "loaders", "python", dataSourceDir); + await copy("**", enginePath, { + parents: true, + cwd: loaderPath, + }); } const addOnDependencies = getAdditionalDependencies(vectorDb); diff --git a/helpers/types.ts b/helpers/types.ts index d7e6e92a..e2660860 100644 --- a/helpers/types.ts +++ b/helpers/types.ts @@ -6,6 +6,18 @@ export type TemplateEngine = "simple" | "context"; export type TemplateUI = "html" | "shadcn"; export type TemplateVectorDB = "none" | "mongo" | "pg"; export type TemplatePostInstallAction = "none" | "dependencies" | "runApp"; +export type TemplateDataSource = { + type: "none" | "file" | "web"; + config: TemplateDataSourceConfig; +}; +export type FileSourceConfig = { + contextFile?: string; +}; +export type WebSourceConfig = { + baseUrl?: string; + depth?: number; +}; +export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig; export interface InstallTemplateArgs { appName: string; @@ -15,8 +27,8 @@ export interface InstallTemplateArgs { template: TemplateType; framework: TemplateFramework; engine: TemplateEngine; - contextFile?: string; ui: TemplateUI; + dataSource?: TemplateDataSource; eslint: boolean; customApiPath?: string; openAiKey?: string; diff --git a/index.ts b/index.ts index 5978dd37..935637e6 100644 --- a/index.ts +++ b/index.ts @@ -241,7 +241,7 @@ async function run(): Promise<void> { vectorDb: program.vectorDb, externalPort: program.externalPort, postInstallAction: program.postInstallAction, - contextFile: program.contextFile, + dataSource: program.dataSource, }); conf.set("preferences", preferences); @@ -278,7 +278,6 @@ async function notifyUpdate(): Promise<void> { "\n", ); } - process.exit(); } catch { // ignore error } diff --git a/questions.ts b/questions.ts index 531a8642..eff97455 100644 --- a/questions.ts +++ b/questions.ts @@ -40,6 +40,10 @@ const defaults: QuestionArgs = { communityProjectPath: "", llamapack: "", postInstallAction: "dependencies", + dataSource: { + type: "none", + config: {}, + }, }; const handlers = { @@ -149,7 +153,8 @@ export const askQuestions = async ( ]; const hasOpenAiKey = program.openAiKey || process.env["OPENAI_API_KEY"]; - if (program.vectorDb === "none" && hasOpenAiKey) { + const hasVectorDb = program.vectorDb && program.vectorDb !== "none"; + if (!hasVectorDb && hasOpenAiKey) { actionChoices.push({ title: "Generate code, install dependencies, and run the app (~2 min)", @@ -378,6 +383,9 @@ export const askQuestions = async ( if (process.platform === "win32" || process.platform === "darwin") { choices.push({ title: "Use a local PDF file", value: "localFile" }); } + if (program.framework === "fastapi") { + choices.push({ title: "Use website content", value: "web" }); + } const { dataSource } = await prompts( { @@ -389,20 +397,66 @@ export const askQuestions = async ( }, handlers, ); - switch (dataSource) { - case "simple": - program.engine = "simple"; - break; - case "exampleFile": - program.engine = "context"; - break; - case "localFile": - program.engine = "context"; - // If the user selected the "pdf" option, ask them to select a file - program.contextFile = await selectPDFFile(); - break; + // Initialize with default config + program.dataSource = getPrefOrDefault("dataSource"); + if (program.dataSource) { + switch (dataSource) { + case "simple": + program.engine = "simple"; + break; + case "exampleFile": + program.engine = "context"; + // example file is a context app with dataSource.type = file but has no config + program.dataSource = { type: "file", config: {} }; + break; + case "localFile": + program.engine = "context"; + program.dataSource.type = "file"; + // If the user selected the "pdf" option, ask them to select a file + program.dataSource.config = { + contextFile: await selectPDFFile(), + }; + break; + case "web": + program.engine = "context"; + program.dataSource.type = "web"; + break; + } } } + + if (program.dataSource?.type === "web" && program.framework === "fastapi") { + let { baseUrl } = await prompts( + { + type: "text", + name: "baseUrl", + message: "Please provide base URL of the website:", + initial: "https://www.llamaindex.ai", + }, + handlers, + ); + try { + if (!baseUrl.includes("://")) { + baseUrl = `https://${baseUrl}`; + } + let checkUrl = new URL(baseUrl); + if (checkUrl.protocol !== "https:" && checkUrl.protocol !== "http:") { + throw new Error("Invalid protocol"); + } + } catch (error) { + console.log( + red( + "Invalid URL provided! Please provide a valid URL (e.g. https://www.llamaindex.ai)", + ), + ); + process.exit(1); + } + program.dataSource.config = { + baseUrl: baseUrl, + depth: 1, + }; + } + if (program.engine !== "simple" && !program.vectorDb) { if (ciInfo.isCI) { program.vectorDb = getPrefOrDefault("vectorDb"); diff --git a/templates/components/loaders/python/file/loader.py b/templates/components/loaders/python/file/loader.py new file mode 100644 index 00000000..d343cec3 --- /dev/null +++ b/templates/components/loaders/python/file/loader.py @@ -0,0 +1,8 @@ +import os +from app.engine.constants import DATA_DIR +from llama_index import VectorStoreIndex, download_loader +from llama_index import SimpleDirectoryReader + + +def get_documents(): + return SimpleDirectoryReader(DATA_DIR).load_data() diff --git a/templates/components/loaders/python/web/loader.py b/templates/components/loaders/python/web/loader.py new file mode 100644 index 00000000..026dd101 --- /dev/null +++ b/templates/components/loaders/python/web/loader.py @@ -0,0 +1,15 @@ +import os +from llama_index import VectorStoreIndex, download_loader + + +def get_documents(): + WholeSiteReader = download_loader("WholeSiteReader") + + # Initialize the scraper with a prefix URL and maximum depth + scraper = WholeSiteReader( + prefix=os.environ.get("URL_PREFIX"), max_depth=int(os.environ.get("MAX_DEPTH")) + ) + # Start scraping from a base URL + documents = scraper.load_data(base_url=os.environ.get("BASE_URL")) + + return documents diff --git a/templates/components/vectordbs/python/mongo/generate.py b/templates/components/vectordbs/python/mongo/generate.py index fe0ee9aa..f52b3b48 100644 --- a/templates/components/vectordbs/python/mongo/generate.py +++ b/templates/components/vectordbs/python/mongo/generate.py @@ -7,6 +7,7 @@ from llama_index.vector_stores import MongoDBAtlasVectorSearch from app.engine.constants import DATA_DIR from app.engine.context import create_service_context +from app.engine.loader import get_documents from llama_index import ( @@ -22,7 +23,7 @@ logger = logging.getLogger() def generate_datasource(service_context): logger.info("Creating new index") # load the documents and create the index - documents = SimpleDirectoryReader(DATA_DIR).load_data() + documents = get_documents() store = MongoDBAtlasVectorSearch( db_name=os.environ["MONGODB_DATABASE"], collection_name=os.environ["MONGODB_VECTORS"], diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py index 3c4cd6a9..7ff20012 100644 --- a/templates/components/vectordbs/python/none/generate.py +++ b/templates/components/vectordbs/python/none/generate.py @@ -4,6 +4,7 @@ from dotenv import load_dotenv from app.engine.constants import DATA_DIR, STORAGE_DIR from app.engine.context import create_service_context +from app.engine.loader import get_documents load_dotenv() @@ -19,7 +20,7 @@ logger = logging.getLogger() def generate_datasource(service_context): logger.info("Creating new index") # load the documents and create the index - documents = SimpleDirectoryReader(DATA_DIR).load_data() + documents = get_documents() index = VectorStoreIndex.from_documents(documents, service_context=service_context) # store it for later index.storage_context.persist(STORAGE_DIR) diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py index ee07e7a4..5c77ee07 100644 --- a/templates/components/vectordbs/python/pg/generate.py +++ b/templates/components/vectordbs/python/pg/generate.py @@ -6,6 +6,7 @@ import logging from app.engine.constants import DATA_DIR from app.engine.context import create_service_context from app.engine.utils import init_pg_vector_store_from_env +from app.engine.loader import get_documents from llama_index import ( SimpleDirectoryReader, @@ -20,7 +21,7 @@ logger = logging.getLogger() def generate_datasource(service_context): logger.info("Creating new index") # load the documents and create the index - documents = SimpleDirectoryReader(DATA_DIR).load_data() + documents = get_documents() store = init_pg_vector_store_from_env() storage_context = StorageContext.from_defaults(vector_store=store) VectorStoreIndex.from_documents( -- GitLab