Skip to content
Snippets Groups Projects
Unverified Commit 78ded9e2 authored by Huu Le (Lee)'s avatar Huu Le (Lee) Committed by GitHub
Browse files

feat: Add Dockerfile template (#27)

parent 4f10840f
No related branches found
No related tags found
No related merge requests found
---
"create-llama": patch
---
Add Dockerfile template
...@@ -216,9 +216,10 @@ export const installPythonTemplate = async ({ ...@@ -216,9 +216,10 @@ export const installPythonTemplate = async ({
}, },
}); });
const compPath = path.join(templatesDir, "components");
if (dataSources.length > 0) { if (dataSources.length > 0) {
const enginePath = path.join(root, "app", "engine"); const enginePath = path.join(root, "app", "engine");
const compPath = path.join(templatesDir, "components");
const vectorDbDirName = vectorDb ?? "none"; const vectorDbDirName = vectorDb ?? "none";
const VectorDBPath = path.join( const VectorDBPath = path.join(
...@@ -265,7 +266,19 @@ export const installPythonTemplate = async ({ ...@@ -265,7 +266,19 @@ export const installPythonTemplate = async ({
// Generate loaders config // Generate loaders config
// Web loader config // Web loader config
if (dataSources.some((ds) => ds.type === "web")) { if (dataSources.some((ds) => ds.type === "web")) {
const webLoaderConfig = dataSources const webLoaderConfig = new Document({});
// Create config for browser driver arguments
const driverArgNodeValue = webLoaderConfig.createNode([
"--no-sandbox",
"--disable-dev-shm-usage",
]);
driverArgNodeValue.commentBefore =
" The arguments to pass to the webdriver. E.g.: add --headless to run in headless mode";
webLoaderConfig.set("driver_arguments", driverArgNodeValue);
// Create config for urls
const urlConfigs = dataSources
.filter((ds) => ds.type === "web") .filter((ds) => ds.type === "web")
.map((ds) => { .map((ds) => {
const dsConfig = ds.config as WebSourceConfig; const dsConfig = ds.config as WebSourceConfig;
...@@ -275,13 +288,15 @@ export const installPythonTemplate = async ({ ...@@ -275,13 +288,15 @@ export const installPythonTemplate = async ({
depth: dsConfig.depth, depth: dsConfig.depth,
}; };
}); });
// Add documentation to web loader config const urlConfigNode = webLoaderConfig.createNode(urlConfigs);
const node = loaderConfig.createNode(webLoaderConfig); urlConfigNode.commentBefore = ` base_url: The URL to start crawling with
node.commentBefore = ` base_url: The URL to start crawling with
prefix: Only crawl URLs matching the specified prefix prefix: Only crawl URLs matching the specified prefix
depth: The maximum depth for BFS traversal depth: The maximum depth for BFS traversal
You can add more websites by adding more entries (don't forget the - prefix from YAML)`; You can add more websites by adding more entries (don't forget the - prefix from YAML)`;
loaderConfig.set("web", node); webLoaderConfig.set("urls", urlConfigNode);
// Add web config to the loaders config
loaderConfig.set("web", webLoaderConfig);
} }
// File loader config // File loader config
if (dataSources.some((ds) => ds.type === "file")) { if (dataSources.some((ds) => ds.type === "file")) {
...@@ -308,4 +323,9 @@ export const installPythonTemplate = async ({ ...@@ -308,4 +323,9 @@ export const installPythonTemplate = async ({
if (postInstallAction === "runApp" || postInstallAction === "dependencies") { if (postInstallAction === "runApp" || postInstallAction === "dependencies") {
installPythonDependencies(); installPythonDependencies();
} }
// Copy deployment files for python
await copy("**", root, {
cwd: path.join(compPath, "deployments", "python"),
});
}; };
...@@ -295,4 +295,9 @@ export const installTSTemplate = async ({ ...@@ -295,4 +295,9 @@ export const installTSTemplate = async ({
if (postInstallAction === "runApp" || postInstallAction === "dependencies") { if (postInstallAction === "runApp" || postInstallAction === "dependencies") {
await installTSDependencies(packageJson, packageManager, isOnline); await installTSDependencies(packageJson, packageManager, isOnline);
} }
// Copy deployment files for typescript
await copy("**", root, {
cwd: path.join(compPath, "deployments", "typescript"),
});
}; };
FROM python:3.11 AS build

WORKDIR /app

ENV PYTHONPATH=/app

# Install Poetry into /opt/poetry and expose it on PATH via a symlink.
# virtualenvs.create=false installs dependencies into the system interpreter,
# which is fine inside a container.
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
    cd /usr/local/bin && \
    ln -s /opt/poetry/bin/poetry && \
    poetry config virtualenvs.create false

# Install Chromium for the web loader.
# You can remove this step if you don't use the web loader, to reduce the image size.
# Use apt-get (stable CLI for scripts) and remove the package lists afterwards
# so they don't bloat the layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends chromium chromium-driver && \
    rm -rf /var/lib/apt/lists/*

# Install dependencies first so this layer is cached independently of app code.
COPY ./pyproject.toml ./poetry.lock* /app/
RUN poetry install --no-root --no-cache --only main

# ====================================
FROM build AS release

COPY . .

CMD ["python", "main.py"]
\ No newline at end of file
FROM node:20-alpine AS build

WORKDIR /app

# Install dependencies.
# NOTE(review): the original copied only pnpm-lock.yaml while running
# `npm install`, so no lockfile was ever honored. package-lock.json* is
# included (optional glob) so npm installs are reproducible when present.
COPY package.json package-lock.json* pnpm-lock.yaml* /app/
RUN npm install

# Build the application
COPY . .
RUN npm run build

# ====================================
FROM build AS release

# Copy built output from the previous stage.
# NOTE(review): since `release` is based on `build`, it already inherits the
# whole build filesystem and these copies are effectively no-ops; basing
# `release` on a clean node:20-alpine would actually slim the image — confirm
# the runtime needs (e.g. next.config.js) before changing.
COPY --from=build /app/.next* ./.next
COPY --from=build /app/public* ./public
COPY --from=build /app/package.json ./package.json
COPY --from=build /app/node_modules ./node_modules

CMD ["npm", "start"]
\ No newline at end of file
...@@ -26,8 +26,7 @@ def get_documents(): ...@@ -26,8 +26,7 @@ def get_documents():
document = get_file_documents(FileLoaderConfig(**loader_config)) document = get_file_documents(FileLoaderConfig(**loader_config))
documents.extend(document) documents.extend(document)
elif loader_type == "web": elif loader_type == "web":
for entry in loader_config: document = get_web_documents(WebLoaderConfig(**loader_config))
document = get_web_documents(WebLoaderConfig(**entry)) documents.extend(document)
documents.extend(document)
return documents return documents
...@@ -3,17 +3,34 @@ import json ...@@ -3,17 +3,34 @@ import json
from pydantic import BaseModel, Field


class CrawlUrl(BaseModel):
    """One website for the web loader to crawl."""

    # URL to start crawling from.
    base_url: str
    # Only URLs matching this prefix are followed.
    prefix: str
    # Maximum BFS traversal depth (0 = only the base URL).
    max_depth: int = Field(default=1, ge=0)


class WebLoaderConfig(BaseModel):
    """Configuration for the Selenium-backed web loader."""

    # Extra arguments passed to the Chrome webdriver (e.g. "--headless").
    # Fixed: was annotated `list[str]` with a None default — a type mismatch;
    # the field is genuinely optional.
    driver_arguments: list[str] | None = Field(default=None)
    # Websites to crawl.
    urls: list[CrawlUrl]


def get_web_documents(config: WebLoaderConfig):
    """Crawl every configured site and return the collected documents.

    A fresh Chrome driver (with the configured arguments) is created per URL.
    NOTE(review): this assumes WholeSiteReader quits the driver when
    load_data finishes — confirm, otherwise browser processes leak.
    """
    from llama_index.readers.web import WholeSiteReader
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    for arg in config.driver_arguments or []:
        options.add_argument(arg)

    docs = []
    for url in config.urls:
        scraper = WholeSiteReader(
            prefix=url.prefix,
            max_depth=url.max_depth,
            driver=webdriver.Chrome(options=options),
        )
        docs.extend(scraper.load_data(url.base_url))
    return docs
This is a [LlamaIndex](https://www.llamaindex.ai/) project using [FastAPI](https://fastapi.tiangolo.com/) bootstrapped with [`create-llama`](https://github.com/run-llama/LlamaIndexTS/tree/main/packages/create-llama).
## Getting Started
First, setup the environment with poetry:
> **_Note:_** This step is not needed if you are using the dev-container.
```
poetry install
poetry shell
```
By default, we use the OpenAI LLM (though you can customize, see app/api/routers/chat.py). As a result you need to specify an `OPENAI_API_KEY` in an .env file in this directory.
Example `backend/.env` file:
```
OPENAI_API_KEY=<openai_api_key>
```
If you are using any tools or data sources, you can update their config files in the `config` folder.
Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):
```
python app/engine/generate.py
```
Third, run the development server:
```
python main.py
```
Then call the API endpoint `/api/chat` to see the result:
```
curl --location 'localhost:8000/api/chat' \
--header 'Content-Type: application/json' \
--data '{ "messages": [{ "role": "user", "content": "Hello" }] }'
```
You can start editing the API by modifying `app/api/routers/chat.py`. The endpoint auto-updates as you save the file.
Open [http://localhost:8000/docs](http://localhost:8000/docs) with your browser to see the Swagger UI of the API.
The API allows CORS for all origins to simplify development. You can change this behavior by setting the `ENVIRONMENT` environment variable to `prod`:
```
ENVIRONMENT=prod python main.py
```
## Learn More
To learn more about LlamaIndex, take a look at the following resources:
- [LlamaIndex Documentation](https://docs.llamaindex.ai) - learn about LlamaIndex.
You can check out [the LlamaIndex GitHub repository](https://github.com/run-llama/llama_index) - your feedback and contributions are welcome!
...@@ -60,6 +60,38 @@ NODE_ENV=production npm run start ...@@ -60,6 +60,38 @@ NODE_ENV=production npm run start
> Note that the `NODE_ENV` environment variable is set to `production`. This disables CORS for all origins. > Note that the `NODE_ENV` environment variable is set to `production`. This disables CORS for all origins.
## Using Docker
1. Build an image for the Express app:
```
docker build -t <your_backend_image_name> .
```
2. Start the app:
- Generate index data:
```
# The .env and config mounts use the ENV variables and configuration from your file system;
# the cache mount stores the vector database on your file system
docker run --rm \
-v $(pwd)/.env:/app/.env \
-v $(pwd)/config:/app/config \
-v $(pwd)/cache:/app/cache \
<your_backend_image_name> \
npm run generate
```
- Start the API:
```
# The .env and config mounts use the ENV variables and configuration from your file system;
# the cache mount stores the vector database on your file system
docker run \
-v $(pwd)/.env:/app/.env \
-v $(pwd)/config:/app/config \
-v $(pwd)/cache:/app/cache \
-p 8000:8000 \
<your_backend_image_name>
```
## Learn More ## Learn More
To learn more about LlamaIndex, take a look at the following resources: To learn more about LlamaIndex, take a look at the following resources:
......
...@@ -64,6 +64,39 @@ The API allows CORS for all origins to simplify development. You can change this ...@@ -64,6 +64,39 @@ The API allows CORS for all origins to simplify development. You can change this
ENVIRONMENT=prod python main.py ENVIRONMENT=prod python main.py
``` ```
## Using Docker
1. Build an image for the FastAPI app:
```
docker build -t <your_backend_image_name> .
```
2. Start the app:
- Generate embedding for index data:
```
# The .env and config mounts use the ENV variables and configuration from your file system;
# the storage mount stores the vector database on your file system
docker run \
--rm \
-v $(pwd)/.env:/app/.env \
-v $(pwd)/config:/app/config \
-v $(pwd)/storage:/app/storage \
<your_backend_image_name> \
python app/engine/generate.py
```
- Start the API:
```
# The .env and config mounts use the ENV variables and configuration from your file system;
# the storage mount stores the vector database on your file system
docker run \
-v $(pwd)/.env:/app/.env \
-v $(pwd)/config:/app/config \
-v $(pwd)/storage:/app/storage \
-p 8000:8000 \
<your_backend_image_name>
```
## Learn More ## Learn More
To learn more about LlamaIndex, take a look at the following resources: To learn more about LlamaIndex, take a look at the following resources:
......
...@@ -26,6 +26,41 @@ You can start editing the page by modifying `app/page.tsx`. The page auto-update ...@@ -26,6 +26,41 @@ You can start editing the page by modifying `app/page.tsx`. The page auto-update
This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font. This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font.
## Using Docker
1. Build an image for the Next.js app:
```
docker build -t <your_app_image_name> .
```
2. Generate embeddings:
Parse the data and generate the vector embeddings if the `./data` folder exists - otherwise, skip this step:
```
# The .env and config mounts use the ENV variables and configuration from your file system;
# the cache mount stores the vector database on your file system
docker run \
--rm \
-v $(pwd)/.env:/app/.env \
-v $(pwd)/config:/app/config \
-v $(pwd)/cache:/app/cache \
-p 3000:3000 \
<your_app_image_name> \
npm run generate
```
3. Start the API
```
# The .env and config mounts use the ENV variables and configuration from your file system;
# the cache mount stores the vector database on your file system
docker run \
--rm \
-v $(pwd)/.env:/app/.env \
-v $(pwd)/config:/app/config \
-v $(pwd)/cache:/app/cache \
-p 3000:3000 \
<your_app_image_name>
```
## Learn More ## Learn More
To learn more about LlamaIndex, take a look at the following resources: To learn more about LlamaIndex, take a look at the following resources:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment