Unverified commit e8db041d authored by Huu Le (Lee), committed by GitHub

feat: Add multiple URLs reader (#18)

parent b3f26856
---
"create-llama": patch
---
Add multiple URLs for web data source
@@ -130,8 +130,8 @@ export async function createApp({
   console.log(
     yellow(
       `You have selected tools that require configuration. Please configure them in the ${terminalLink(
-        "tools_config.json",
-        `file://${root}/tools_config.json`,
+        "config/tools.json",
+        `file://${root}/config/tools.json`,
       )} file.`,
     ),
   );
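For orientation: `config/tools.json` maps each selected tool name to its configuration object (it is written by `installPythonTemplate` further down). A hypothetical sketch of its shape — the tool names and fields are illustrative, not taken from this commit:

```ts
// Hypothetical contents of a generated config/tools.json; tool names
// and config fields are examples only.
const toolsConfig: Record<string, any> = {
  wikipedia: {},
  google_search: { api_key: "<your-api-key>", num_results: 3 },
};

console.log(JSON.stringify(toolsConfig, null, 2));
```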
......
import fs from "fs/promises";
import path from "path";
import {
-  FileSourceConfig,
  TemplateDataSource,
  TemplateFramework,
  TemplateVectorDB,
-  WebSourceConfig,
} from "./types";

type EnvVar = {
@@ -100,48 +98,6 @@ const getVectorDBEnvs = (vectorDb: TemplateVectorDB) => {
   }
 };

-const getDataSourceEnvs = (
-  dataSource: TemplateDataSource,
-  llamaCloudKey?: string,
-) => {
-  switch (dataSource.type) {
-    case "web":
-      const config = dataSource.config as WebSourceConfig;
-      return [
-        {
-          name: "BASE_URL",
-          description: "The base URL to start web scraping.",
-          value: config.baseUrl,
-        },
-        {
-          name: "URL_PREFIX",
-          description: "The prefix of the URL to start web scraping.",
-          value: config.baseUrl,
-        },
-        {
-          name: "MAX_DEPTH",
-          description: "The maximum depth to scrape.",
-          value: config.depth?.toString(),
-        },
-      ];
-    case "file":
-    case "folder":
-      return [
-        ...((dataSource?.config as FileSourceConfig).useLlamaParse
-          ? [
-              {
-                name: "LLAMA_CLOUD_API_KEY",
-                description: `The Llama Cloud API key.`,
-                value: llamaCloudKey,
-              },
-            ]
-          : []),
-      ];
-    default:
-      return [];
-  }
-};
export const createBackendEnvFile = async (
root: string,
opts: {
@@ -173,9 +129,15 @@ export const createBackendEnvFile = async (
     // Add vector database environment variables
     ...(opts.vectorDb ? getVectorDBEnvs(opts.vectorDb) : []),
-    // Add data source environment variables
-    ...(opts.dataSource
-      ? getDataSourceEnvs(opts.dataSource, opts.llamaCloudKey)
+    // Add LlamaCloud API key
+    ...(opts.llamaCloudKey
+      ? [
+          {
+            name: "LLAMA_CLOUD_API_KEY",
+            description: `The Llama Cloud API key.`,
+            value: opts.llamaCloudKey,
+          },
+        ]
       : []),
   ];
let envVars: EnvVar[] = [];
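The `EnvVar` entries collected above are eventually rendered into the backend `.env` file. A minimal sketch of that rendering (not the actual create-llama serializer, whose formatting is not shown in this hunk), assuming a `# description` comment followed by `NAME=value`:

```ts
type EnvVar = { name?: string; description?: string; value?: string };

// Minimal sketch: render each EnvVar as a "# description" comment
// followed by NAME=value. The real formatting may differ.
function renderEnvFile(envVars: EnvVar[]): string {
  return envVars
    .filter((v) => v.name)
    .map((v) => `# ${v.description ?? ""}\n${v.name}=${v.value ?? ""}`)
    .join("\n\n");
}

console.log(
  renderEnvFile([
    {
      name: "LLAMA_CLOUD_API_KEY",
      description: "The Llama Cloud API key.",
      value: "llx-...",
    },
  ]),
);
```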
......
@@ -12,6 +12,7 @@ import {
   InstallTemplateArgs,
   TemplateDataSource,
   TemplateVectorDB,
+  WebSourceConfig,
 } from "./types";
interface Dependency {
@@ -237,12 +238,13 @@ export const installPythonTemplate = async ({
       parents: true,
       cwd: path.join(compPath, "engines", "python", "agent"),
     });
-    // Write tools_config.json
+    // Write tool configs
     const configContent: Record<string, any> = {};
     tools.forEach((tool) => {
      configContent[tool.name] = tool.config ?? {};
     });
-    const configFilePath = path.join(root, "tools_config.json");
+    const configFilePath = path.join(root, "config/tools.json");
+    await fs.mkdir(path.join(root, "config"), { recursive: true });
     await fs.writeFile(
       configFilePath,
       JSON.stringify(configContent, null, 2),
@@ -254,6 +256,30 @@
     });
   }
+    // Write loader configs
+    if (dataSource?.type === "web") {
+      const config = dataSource.config as WebSourceConfig[];
+      const webLoaderConfig = config.map((c) => {
+        return {
+          base_url: c.baseUrl,
+          prefix: c.prefix || c.baseUrl,
+          depth: c.depth || 1,
+        };
+      });
+      const loaderConfigPath = path.join(root, "config/loaders.json");
+      await fs.mkdir(path.join(root, "config"), { recursive: true });
+      await fs.writeFile(
+        loaderConfigPath,
+        JSON.stringify(
+          {
+            web: webLoaderConfig,
+          },
+          null,
+          2,
+        ),
+      );
+    }
+
     const dataSourceType = dataSource?.type;
     if (dataSourceType !== undefined && dataSourceType !== "none") {
       let loaderFolder: string;
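To make the output concrete: a sketch of the `config/loaders.json` this block writes, assuming the user added two example websites with the defaults (prefix falls back to the base URL, depth to 1):

```ts
// Sketch of the config/loaders.json produced by the block above for a
// project configured with two web sources (URLs are examples).
const loadersJson = {
  web: [
    {
      base_url: "https://www.llamaindex.ai",
      prefix: "https://www.llamaindex.ai",
      depth: 1,
    },
    {
      base_url: "https://docs.llamaindex.ai",
      prefix: "https://docs.llamaindex.ai",
      depth: 1,
    },
  ],
};

console.log(JSON.stringify(loadersJson, null, 2));
```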
......
@@ -24,9 +24,11 @@ export type FileSourceConfig = {
 };

 export type WebSourceConfig = {
   baseUrl?: string;
+  prefix?: string;
   depth?: number;
 };

-export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig;
+export type TemplateDataSourceConfig = FileSourceConfig | WebSourceConfig[];

 export type CommunityProjectConfig = {
   owner: string;
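Since `TemplateDataSourceConfig` now accepts an array, a web data source carries one `WebSourceConfig` per URL. A small sketch (types restated locally; URLs are example values):

```ts
type WebSourceConfig = {
  baseUrl?: string;
  prefix?: string;
  depth?: number;
};

// One entry per website the user added during setup (example values).
const webConfig: WebSourceConfig[] = [
  { baseUrl: "https://www.llamaindex.ai", prefix: "https://www.llamaindex.ai", depth: 1 },
  { baseUrl: "https://blog.llamaindex.ai", prefix: "https://blog.llamaindex.ai", depth: 1 },
];
```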
......
......@@ -11,6 +11,7 @@ import {
FileSourceConfig,
TemplateDataSourceType,
TemplateFramework,
WebSourceConfig,
} from "./helpers";
import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant";
import { templatesDir } from "./helpers/dir";
@@ -755,35 +756,53 @@ export const askQuestions = async (
}
if (program.dataSource?.type === "web" && program.framework === "fastapi") {
-    let { baseUrl } = await prompts(
-      {
-        type: "text",
-        name: "baseUrl",
-        message: "Please provide base URL of the website:",
-        initial: "https://www.llamaindex.ai",
-      },
-      handlers,
-    );
-    try {
-      if (!baseUrl.includes("://")) {
-        baseUrl = `https://${baseUrl}`;
-      }
-      const checkUrl = new URL(baseUrl);
-      if (checkUrl.protocol !== "https:" && checkUrl.protocol !== "http:") {
-        throw new Error("Invalid protocol");
-      }
-    } catch (error) {
-      console.log(
-        red(
-          "Invalid URL provided! Please provide a valid URL (e.g. https://www.llamaindex.ai)",
-        ),
-      );
-      process.exit(1);
-    }
-    program.dataSource.config = {
-      baseUrl: baseUrl,
-      depth: 1,
-    };
+    program.dataSource.config = [];
+    while (true) {
+      const questions: any[] = [
+        {
+          type: "text",
+          name: "baseUrl",
+          message: "Please provide base URL of the website: ",
+          initial: "https://www.llamaindex.ai",
+          validate: (value: string) => {
+            if (!value.includes("://")) {
+              value = `https://${value}`;
+            }
+            const urlObj = new URL(value);
+            if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") {
+              return `URL=${value} has invalid protocol, only allow http or https`;
+            }
+            // Check duplicated URL
+            if (
+              (program.dataSource?.config as WebSourceConfig[]).some(
+                (c) => c.baseUrl === value,
+              )
+            ) {
+              return `URL=${value} is already added. Please provide a different URL.`;
+            }
+            return true;
+          },
+        },
+        {
+          type: "toggle",
+          name: "shouldContinue",
+          message: "Would you like to add another website?",
+          initial: false,
+          active: "Yes",
+          inactive: "No",
+        },
+      ];
+      let { shouldContinue, baseUrl } = await prompts(questions, handlers);
+      program.dataSource.config.push({
+        baseUrl: baseUrl,
+        prefix: baseUrl,
+        depth: 1,
+      });
+      if (!shouldContinue) {
+        break;
+      }
+    }
}
if (program.engine !== "simple" && !program.vectorDb) {
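The prompt loop above repeats until the user toggles No, validating each URL before it is pushed. A standalone sketch of the same validation logic against a plain array (names and sample URLs are illustrative):

```ts
const existing: { baseUrl?: string }[] = [
  { baseUrl: "https://www.llamaindex.ai" },
];

// Mirrors the validate callback: normalize the scheme, allow only
// http/https, and reject URLs that are already in the list.
function validateBaseUrl(value: string): true | string {
  if (!value.includes("://")) {
    value = `https://${value}`;
  }
  const urlObj = new URL(value); // throws on unparseable input, as in the original
  if (urlObj.protocol !== "https:" && urlObj.protocol !== "http:") {
    return `URL=${value} has invalid protocol, only allow http or https`;
  }
  if (existing.some((c) => c.baseUrl === value)) {
    return `URL=${value} is already added. Please provide a different URL.`;
  }
  return true;
}

console.log(validateBaseUrl("www.llamaindex.ai")); // duplicate after normalization
console.log(validateBaseUrl("https://docs.llamaindex.ai")); // true
```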
......
@@ -26,7 +26,7 @@ class ToolFactory:
     @staticmethod
     def from_env() -> list[FunctionTool]:
         tools = []
-        with open("tools_config.json", "r") as f:
+        with open("config/tools.json", "r") as f:
            tool_configs = json.load(f)
        for name, config in tool_configs.items():
            tools += ToolFactory.create_tool(name, **config)
......
-import os
+import json
+
+from pydantic import BaseModel, Field
 from llama_index.readers.web import WholeSiteReader


-def get_documents():
-    # Initialize the scraper with a prefix URL and maximum depth
-    scraper = WholeSiteReader(
-        prefix=os.environ.get("URL_PREFIX"), max_depth=int(os.environ.get("MAX_DEPTH"))
-    )
-    # Start scraping from a base URL
-    documents = scraper.load_data(base_url=os.environ.get("BASE_URL"))
-    return documents
+class WebLoaderConfig(BaseModel):
+    base_url: str
+    prefix: str
+    max_depth: int = Field(default=1, ge=0)
+
+
+def load_configs():
+    with open("config/loaders.json") as f:
+        configs = json.load(f)
+    web_config = configs.get("web", None)
+    if web_config is None:
+        raise ValueError("No web config found in loaders.json")
+    return [WebLoaderConfig(**config) for config in web_config]
+
+
+def get_documents():
+    web_config = load_configs()
+    documents = []
+    for entry in web_config:
+        scraper = WholeSiteReader(
+            prefix=entry.prefix,
+            max_depth=entry.max_depth,
+        )
+        documents.extend(scraper.load_data(entry.base_url))
+    return documents
@@ -19,6 +19,8 @@ Example `backend/.env` file:
 OPENAI_API_KEY=<openai_api_key>
 ```

+If you are using any tools or data sources, you can update their config files in the `config` folder.
+
 Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):

 ```
......
@@ -19,6 +19,8 @@ Example `.env` file:
 OPENAI_API_KEY=<openai_api_key>
 ```

+If you are using any tools or data sources, you can update their config files in the `config` folder.
+
 Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):

 ```
......