From 78ded9e242547999f74b9eebdec2b23ce85dbeb0 Mon Sep 17 00:00:00 2001 From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com> Date: Thu, 28 Mar 2024 12:35:27 +0700 Subject: [PATCH] feat: Add Dockerfile template (#27) --- .changeset/healthy-insects-check.md | 5 ++ helpers/python.ts | 32 ++++++++-- helpers/typescript.ts | 5 ++ .../components/deployments/python/Dockerfile | 26 ++++++++ .../deployments/typescript/Dockerfile | 22 +++++++ .../components/loaders/python/__init__.py | 5 +- templates/components/loaders/python/web.py | 29 +++++++-- .../types/simple/fastapi/README-template.md | 60 ------------------- .../streaming/express/README-template.md | 32 ++++++++++ .../streaming/fastapi/README-template.md | 33 ++++++++++ .../types/streaming/nextjs/README-template.md | 35 +++++++++++ 11 files changed, 209 insertions(+), 75 deletions(-) create mode 100644 .changeset/healthy-insects-check.md create mode 100644 templates/components/deployments/python/Dockerfile create mode 100644 templates/components/deployments/typescript/Dockerfile delete mode 100644 templates/types/simple/fastapi/README-template.md diff --git a/.changeset/healthy-insects-check.md b/.changeset/healthy-insects-check.md new file mode 100644 index 00000000..00b948e1 --- /dev/null +++ b/.changeset/healthy-insects-check.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Add Dockerfile template diff --git a/helpers/python.ts b/helpers/python.ts index be9f03cc..047c4876 100644 --- a/helpers/python.ts +++ b/helpers/python.ts @@ -216,9 +216,10 @@ export const installPythonTemplate = async ({ }, }); + const compPath = path.join(templatesDir, "components"); + if (dataSources.length > 0) { const enginePath = path.join(root, "app", "engine"); - const compPath = path.join(templatesDir, "components"); const vectorDbDirName = vectorDb ?? 
"none"; const VectorDBPath = path.join( @@ -265,7 +266,19 @@ export const installPythonTemplate = async ({ // Generate loaders config // Web loader config if (dataSources.some((ds) => ds.type === "web")) { - const webLoaderConfig = dataSources + const webLoaderConfig = new Document({}); + + // Create config for browser driver arguments + const driverArgNodeValue = webLoaderConfig.createNode([ + "--no-sandbox", + "--disable-dev-shm-usage", + ]); + driverArgNodeValue.commentBefore = + " The arguments to pass to the webdriver. E.g.: add --headless to run in headless mode"; + webLoaderConfig.set("driver_arguments", driverArgNodeValue); + + // Create config for urls + const urlConfigs = dataSources .filter((ds) => ds.type === "web") .map((ds) => { const dsConfig = ds.config as WebSourceConfig; @@ -275,13 +288,15 @@ export const installPythonTemplate = async ({ depth: dsConfig.depth, }; }); - // Add documentation to web loader config - const node = loaderConfig.createNode(webLoaderConfig); - node.commentBefore = ` base_url: The URL to start crawling with + const urlConfigNode = webLoaderConfig.createNode(urlConfigs); + urlConfigNode.commentBefore = ` base_url: The URL to start crawling with prefix: Only crawl URLs matching the specified prefix depth: The maximum depth for BFS traversal You can add more websites by adding more entries (don't forget the - prefix from YAML)`; - loaderConfig.set("web", node); + webLoaderConfig.set("urls", urlConfigNode); + + // Add web config to the loaders config + loaderConfig.set("web", webLoaderConfig); } // File loader config if (dataSources.some((ds) => ds.type === "file")) { @@ -308,4 +323,9 @@ export const installPythonTemplate = async ({ if (postInstallAction === "runApp" || postInstallAction === "dependencies") { installPythonDependencies(); } + + // Copy deployment files for python + await copy("**", root, { + cwd: path.join(compPath, "deployments", "python"), + }); }; diff --git a/helpers/typescript.ts b/helpers/typescript.ts 
index d965106d..7dcf3ed4 100644 --- a/helpers/typescript.ts +++ b/helpers/typescript.ts @@ -295,4 +295,9 @@ export const installTSTemplate = async ({ if (postInstallAction === "runApp" || postInstallAction === "dependencies") { await installTSDependencies(packageJson, packageManager, isOnline); } + + // Copy deployment files for typescript + await copy("**", root, { + cwd: path.join(compPath, "deployments", "typescript"), + }); }; diff --git a/templates/components/deployments/python/Dockerfile b/templates/components/deployments/python/Dockerfile new file mode 100644 index 00000000..624364b6 --- /dev/null +++ b/templates/components/deployments/python/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.11 as build + +WORKDIR /app + +ENV PYTHONPATH=/app + +# Install Poetry +RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \ + cd /usr/local/bin && \ + ln -s /opt/poetry/bin/poetry && \ + poetry config virtualenvs.create false + +# Install Chromium for web loader +# Can disable this if you don't use the web loader to reduce the image size +RUN apt update && apt install -y chromium chromium-driver + +# Install dependencies +COPY ./pyproject.toml ./poetry.lock* /app/ +RUN poetry install --no-root --no-cache --only main + +# ==================================== +FROM build as release + +COPY . . + +CMD ["python", "main.py"] \ No newline at end of file diff --git a/templates/components/deployments/typescript/Dockerfile b/templates/components/deployments/typescript/Dockerfile new file mode 100644 index 00000000..b9b28f2d --- /dev/null +++ b/templates/components/deployments/typescript/Dockerfile @@ -0,0 +1,22 @@ +FROM node:20-alpine as build + +WORKDIR /app + +# Install dependencies +COPY package.json pnpm-lock.yaml* /app/ +RUN npm install + +# Build the application +COPY . . 
+RUN npm run build + +# ==================================== +FROM build as release + +# Copy built output from the previous stage +COPY --from=build /app/.next* ./.next +COPY --from=build /app/public* ./public +COPY --from=build /app/package.json ./package.json +COPY --from=build /app/node_modules ./node_modules + +CMD ["npm", "start"] \ No newline at end of file diff --git a/templates/components/loaders/python/__init__.py b/templates/components/loaders/python/__init__.py index 710144fb..662c65a9 100644 --- a/templates/components/loaders/python/__init__.py +++ b/templates/components/loaders/python/__init__.py @@ -26,8 +26,7 @@ def get_documents(): document = get_file_documents(FileLoaderConfig(**loader_config)) documents.extend(document) elif loader_type == "web": - for entry in loader_config: - document = get_web_documents(WebLoaderConfig(**entry)) - documents.extend(document) + document = get_web_documents(WebLoaderConfig(**loader_config)) + documents.extend(document) return documents diff --git a/templates/components/loaders/python/web.py b/templates/components/loaders/python/web.py index bca9aaf7..563e51b5 100644 --- a/templates/components/loaders/python/web.py +++ b/templates/components/loaders/python/web.py @@ -3,17 +3,34 @@ import json from pydantic import BaseModel, Field -class WebLoaderConfig(BaseModel): +class CrawlUrl(BaseModel): base_url: str prefix: str max_depth: int = Field(default=1, ge=0) +class WebLoaderConfig(BaseModel): + driver_arguments: list[str] = Field(default=None) + urls: list[CrawlUrl] + + def get_web_documents(config: WebLoaderConfig): from llama_index.readers.web import WholeSiteReader + from selenium import webdriver + from selenium.webdriver.chrome.options import Options + + options = Options() + driver_arguments = config.driver_arguments or [] + for arg in driver_arguments: + options.add_argument(arg) + + docs = [] + for url in config.urls: + scraper = WholeSiteReader( + prefix=url.prefix, + max_depth=url.max_depth, + 
driver=webdriver.Chrome(options=options), + ) + docs.extend(scraper.load_data(url.base_url)) - scraper = WholeSiteReader( - prefix=config.prefix, - max_depth=config.max_depth, - ) - return scraper.load_data(config.base_url) + return docs diff --git a/templates/types/simple/fastapi/README-template.md b/templates/types/simple/fastapi/README-template.md deleted file mode 100644 index 69ff766c..00000000 --- a/templates/types/simple/fastapi/README-template.md +++ /dev/null @@ -1,60 +0,0 @@ -This is a [LlamaIndex](https://www.llamaindex.ai/) project using [FastAPI](https://fastapi.tiangolo.com/) bootstrapped with [`create-llama`](https://github.com/run-llama/LlamaIndexTS/tree/main/packages/create-llama). - -## Getting Started - -First, setup the environment with poetry: - -> **_Note:_** This step is not needed if you are using the dev-container. - -``` -poetry install -poetry shell -``` - -By default, we use the OpenAI LLM (though you can customize, see app/api/routers/chat.py). As a result you need to specify an `OPENAI_API_KEY` in an .env file in this directory. - -Example `backend/.env` file: - -``` -OPENAI_API_KEY=<openai_api_key> -``` - -If you are using any tools or data sources, you can update their config files in the `config` folder. - -Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step): - -``` -python app/engine/generate.py -``` - -Third, run the development server: - -``` -python main.py -``` - -Then call the API endpoint `/api/chat` to see the result: - -``` -curl --location 'localhost:8000/api/chat' \ ---header 'Content-Type: application/json' \ ---data '{ "messages": [{ "role": "user", "content": "Hello" }] }' -``` - -You can start editing the API by modifying `app/api/routers/chat.py`. The endpoint auto-updates as you save the file. - -Open [http://localhost:8000/docs](http://localhost:8000/docs) with your browser to see the Swagger UI of the API. 
- -The API allows CORS for all origins to simplify development. You can change this behavior by setting the `ENVIRONMENT` environment variable to `prod`: - -``` -ENVIRONMENT=prod python main.py -``` - -## Learn More - -To learn more about LlamaIndex, take a look at the following resources: - -- [LlamaIndex Documentation](https://docs.llamaindex.ai) - learn about LlamaIndex. - -You can check out [the LlamaIndex GitHub repository](https://github.com/run-llama/llama_index) - your feedback and contributions are welcome! diff --git a/templates/types/streaming/express/README-template.md b/templates/types/streaming/express/README-template.md index 62a5f248..61e164bf 100644 --- a/templates/types/streaming/express/README-template.md +++ b/templates/types/streaming/express/README-template.md @@ -60,6 +60,38 @@ NODE_ENV=production npm run start > Note that the `NODE_ENV` environment variable is set to `production`. This disables CORS for all origins. +## Using Docker + +1. Build an image for Express app: + +``` +docker build -t <your_backend_image_name> . +``` + +2. 
Start the app: + +- Generate index data: + +``` +docker run --rm \ + -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system + -v $(pwd)/config:/app/config \ + -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database + <your_backend_image_name> \ + npm run generate +``` + +- Start the API: + +``` +docker run \ + -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system + -v $(pwd)/config:/app/config \ + -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database + -p 8000:8000 \ + <your_backend_image_name> +``` + ## Learn More To learn more about LlamaIndex, take a look at the following resources: diff --git a/templates/types/streaming/fastapi/README-template.md b/templates/types/streaming/fastapi/README-template.md index ca7e0b33..33dbeba6 100644 --- a/templates/types/streaming/fastapi/README-template.md +++ b/templates/types/streaming/fastapi/README-template.md @@ -64,6 +64,39 @@ The API allows CORS for all origins to simplify development. You can change this ENVIRONMENT=prod python main.py ``` + +## Using Docker + +1. Build an image for FastAPI app: + +``` +docker build -t <your_backend_image_name> . +``` + +2. 
Start the app: + +- Generate embedding for index data: + +``` +docker run \ + --rm \ + -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system + -v $(pwd)/config:/app/config \ + -v $(pwd)/storage:/app/storage \ # Use your file system to store the vector database + <your_backend_image_name> \ + python app/engine/generate.py +``` + +- Start the API: + +``` +docker run \ + -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system + -v $(pwd)/config:/app/config \ + -v $(pwd)/storage:/app/storage \ # Use your file system to store the vector database + -p 8000:8000 \ + <your_backend_image_name> +``` + ## Learn More To learn more about LlamaIndex, take a look at the following resources: diff --git a/templates/types/streaming/nextjs/README-template.md b/templates/types/streaming/nextjs/README-template.md index 81b3b93f..c522d311 100644 --- a/templates/types/streaming/nextjs/README-template.md +++ b/templates/types/streaming/nextjs/README-template.md @@ -26,6 +26,41 @@ You can start editing the page by modifying `app/page.tsx`. The page auto-update This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font. + +## Using Docker + +1. Build an image for the Next.js app: + +``` +docker build -t <your_app_image_name> . +``` + +2. Generate embeddings: + +Parse the data and generate the vector embeddings if the `./data` folder exists - otherwise, skip this step: + +``` +docker run \ + --rm \ + -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system + -v $(pwd)/config:/app/config \ + -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database + -p 3000:3000 \ + <your_app_image_name> \ + npm run generate +``` + +3. 
Start the API + +``` +docker run \ + --rm \ + -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system + -v $(pwd)/config:/app/config \ + -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database + -p 3000:3000 \ + <your_app_image_name> +``` + ## Learn More + To learn more about LlamaIndex, take a look at the following resources: -- GitLab