diff --git a/create-app.ts b/create-app.ts index ad3343d30f3ce5ed5e8644bd2c4b35932453320e..6c684d9decf6718a91970f7cbedf333545af6f65 100644 --- a/create-app.ts +++ b/create-app.ts @@ -36,7 +36,7 @@ export async function createApp({ vectorDb, externalPort, postInstallAction, - contextFile, + dataSource, }: InstallAppArgs): Promise<void> { const root = path.resolve(appPath); @@ -80,7 +80,7 @@ export async function createApp({ vectorDb, externalPort, postInstallAction, - contextFile, + dataSource, }; if (frontend) { diff --git a/helpers/index.ts b/helpers/index.ts index 40c1b4eb92fe9356b51474618eb6383f678527ae..ee3d5a94cfacfd1c05317f6d2440988b1556ed4f 100644 --- a/helpers/index.ts +++ b/helpers/index.ts @@ -13,9 +13,10 @@ import { installPythonTemplate } from "./python"; import { downloadAndExtractRepo } from "./repo"; import { InstallTemplateArgs, - TemplateEngine, + TemplateDataSource, TemplateFramework, TemplateVectorDB, + WebSourceConfig, } from "./types"; import { installTSTemplate } from "./typescript"; @@ -26,6 +27,7 @@ const createEnvLocalFile = async ( vectorDb?: TemplateVectorDB; model?: string; framework?: TemplateFramework; + dataSource?: TemplateDataSource; }, ) => { const envFileName = ".env"; @@ -58,48 +60,30 @@ const createEnvLocalFile = async ( } } + switch (opts?.dataSource?.type) { + case "web": { + let webConfig = opts?.dataSource.config as WebSourceConfig; + content += `# web loader config\n`; + content += `BASE_URL=${webConfig.baseUrl}\n`; + content += `URL_PREFIX=${webConfig.baseUrl}\n`; + content += `MAX_DEPTH=${webConfig.depth}\n`; + break; + } + } + if (content) { await fs.writeFile(path.join(root, envFileName), content); console.log(`Created '${envFileName}' file. 
/**
 * Populates the `data` directory of the generated project.
 *
 * When `contextFile` is given, that single file is copied into
 * `<root>/data` under its original base name. Otherwise the data files
 * bundled under `templates/components/data` are copied instead.
 *
 * @param root - Root directory of the generated project.
 * @param contextFile - Optional path to a user-provided file to copy.
 */
const copyTestData = async (root: string, contextFile?: string) => {
  const dataDir = path.join(root, "data");

  // No user file supplied: fall back to the bundled example data.
  if (!contextFile) {
    const bundledDataDir = path.join(
      __dirname,
      "..",
      "templates",
      "components",
      "data",
    );
    console.log(`\nCopying test data to ${cyan(dataDir)}\n`);
    await copy("**", dataDir, { parents: true, cwd: bundledDataDir });
    return;
  }

  console.log(`\nCopying provided file to ${cyan(dataDir)}\n`);
  // The destination directory may not exist yet, so create it first.
  await fs.mkdir(dataDir, { recursive: true });
  const targetFile = path.join(dataDir, path.basename(contextFile));
  await fs.copyFile(contextFile, targetFile);
};
"vectorDb" + | "dataSource" | "postInstallAction" >) => { console.log("\nInitializing Python project with template:", template, "\n"); @@ -173,10 +175,24 @@ export const installPythonTemplate = async ({ "python", vectorDb || "none", ); + const enginePath = path.join(root, "app", "engine"); + await copy("**", path.join(root, "app", "engine"), { parents: true, cwd: VectorDBPath, }); + if (dataSource?.type !== "none" && dataSource?.type !== undefined) { + const loaderPath = path.join( + compPath, + "loaders", + "python", + dataSource.type, + ); + await copy("**", enginePath, { + parents: true, + cwd: loaderPath, + }); + } } const addOnDependencies = getAdditionalDependencies(vectorDb); diff --git a/helpers/types.ts b/helpers/types.ts index d7e6e92adc6b0fc7d8dd703f80c711a496d1172f..e26608609e3be1402f4454f00640bc127e133b0d 100644 --- a/helpers/types.ts +++ b/helpers/types.ts @@ -6,6 +6,18 @@ export type TemplateEngine = "simple" | "context"; export type TemplateUI = "html" | "shadcn"; export type TemplateVectorDB = "none" | "mongo" | "pg"; export type TemplatePostInstallAction = "none" | "dependencies" | "runApp"; +export type TemplateDataSource = { + type: "none" | "file" | "web"; + config: TemplateDataSourceConfig; +}; +export type FileSourceConfig = { + contextFile?: string; +}; +export type WebSourceConfig = { + baseUrl?: string; + depth?: number; +}; +export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig; export interface InstallTemplateArgs { appName: string; @@ -15,8 +27,8 @@ export interface InstallTemplateArgs { template: TemplateType; framework: TemplateFramework; engine: TemplateEngine; - contextFile?: string; ui: TemplateUI; + dataSource?: TemplateDataSource; eslint: boolean; customApiPath?: string; openAiKey?: string; diff --git a/index.ts b/index.ts index 5978dd37af7daa6b8412481904add75165776e13..81b64d62239edd3248409944fa8ad352e6a801f3 100644 --- a/index.ts +++ b/index.ts @@ -241,7 +241,7 @@ async function run(): Promise<void> { 
def get_documents():
    """Scrape a website and return the loaded documents.

    The scraper is configured from environment variables:

    * ``BASE_URL``   - URL where scraping starts (required).
    * ``URL_PREFIX`` - prefix passed to the scraper; falls back to
      ``BASE_URL`` when unset or empty.
    * ``MAX_DEPTH``  - maximum depth passed to the scraper; must be a
      non-negative integer (required).

    Returns:
        Whatever ``WholeSiteReader.load_data`` returns for the base URL.

    Raises:
        ValueError: if ``BASE_URL`` is missing, or ``MAX_DEPTH`` is missing,
            not an integer, or negative.
    """
    base_url = os.environ.get("BASE_URL")
    if not base_url:
        raise ValueError(
            "BASE_URL environment variable must be set for the web loader"
        )

    # An unset or empty prefix falls back to the base URL itself.
    url_prefix = os.environ.get("URL_PREFIX") or base_url

    raw_depth = os.environ.get("MAX_DEPTH")
    if raw_depth is None or not raw_depth.strip():
        raise ValueError(
            "MAX_DEPTH environment variable must be set for the web loader"
        )
    try:
        max_depth = int(raw_depth)
    except ValueError as exc:
        # Catches values such as the literal string "undefined".
        raise ValueError(
            f"MAX_DEPTH must be an integer, got {raw_depth!r}"
        ) from exc
    if max_depth < 0:
        raise ValueError(f"MAX_DEPTH must be non-negative, got {max_depth}")

    # Validate the configuration before downloading the loader, so a bad
    # .env file fails fast without any network access.
    WholeSiteReader = download_loader("WholeSiteReader")

    # Initialize the scraper with a prefix URL and maximum depth
    scraper = WholeSiteReader(prefix=url_prefix, max_depth=max_depth)

    # Start scraping from the base URL
    return scraper.load_data(base_url=base_url)
a/templates/components/vectordbs/python/mongo/generate.py +++ b/templates/components/vectordbs/python/mongo/generate.py @@ -7,6 +7,7 @@ from llama_index.vector_stores import MongoDBAtlasVectorSearch from app.engine.constants import DATA_DIR from app.engine.context import create_service_context +from app.engine.loader import get_documents from llama_index import ( @@ -22,7 +23,7 @@ logger = logging.getLogger() def generate_datasource(service_context): logger.info("Creating new index") # load the documents and create the index - documents = SimpleDirectoryReader(DATA_DIR).load_data() + documents = get_documents() store = MongoDBAtlasVectorSearch( db_name=os.environ["MONGODB_DATABASE"], collection_name=os.environ["MONGODB_VECTORS"], diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py index 3c4cd6a9e310f3f2e2f7e4709e94b9073282f151..7ff20012e9fbd1d3189ccf9527ef928ec1a85a12 100644 --- a/templates/components/vectordbs/python/none/generate.py +++ b/templates/components/vectordbs/python/none/generate.py @@ -4,6 +4,7 @@ from dotenv import load_dotenv from app.engine.constants import DATA_DIR, STORAGE_DIR from app.engine.context import create_service_context +from app.engine.loader import get_documents load_dotenv() @@ -19,7 +20,7 @@ logger = logging.getLogger() def generate_datasource(service_context): logger.info("Creating new index") # load the documents and create the index - documents = SimpleDirectoryReader(DATA_DIR).load_data() + documents = get_documents() index = VectorStoreIndex.from_documents(documents, service_context=service_context) # store it for later index.storage_context.persist(STORAGE_DIR) diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py index ee07e7a4ec3f19a83e731a265e4451b68915330d..5c77ee07e910349c19bd6320c359d3885aff879b 100644 --- a/templates/components/vectordbs/python/pg/generate.py +++ 
b/templates/components/vectordbs/python/pg/generate.py @@ -6,6 +6,7 @@ import logging from app.engine.constants import DATA_DIR from app.engine.context import create_service_context from app.engine.utils import init_pg_vector_store_from_env +from app.engine.loader import get_documents from llama_index import ( SimpleDirectoryReader, @@ -20,7 +21,7 @@ logger = logging.getLogger() def generate_datasource(service_context): logger.info("Creating new index") # load the documents and create the index - documents = SimpleDirectoryReader(DATA_DIR).load_data() + documents = get_documents() store = init_pg_vector_store_from_env() storage_context = StorageContext.from_defaults(vector_store=store) VectorStoreIndex.from_documents(