From 78ded9e242547999f74b9eebdec2b23ce85dbeb0 Mon Sep 17 00:00:00 2001
From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com>
Date: Thu, 28 Mar 2024 12:35:27 +0700
Subject: [PATCH] feat: Add Dockerfile template (#27)

---
 .changeset/healthy-insects-check.md           |  5 ++
 helpers/python.ts                             | 32 ++++++++--
 helpers/typescript.ts                         |  5 ++
 .../components/deployments/python/Dockerfile  | 26 ++++++++
 .../deployments/typescript/Dockerfile         | 22 +++++++
 .../components/loaders/python/__init__.py     |  5 +-
 templates/components/loaders/python/web.py    | 29 +++++++--
 .../types/simple/fastapi/README-template.md   | 60 -------------------
 .../streaming/express/README-template.md      | 32 ++++++++++
 .../streaming/fastapi/README-template.md      | 33 ++++++++++
 .../types/streaming/nextjs/README-template.md | 35 +++++++++++
 11 files changed, 209 insertions(+), 75 deletions(-)
 create mode 100644 .changeset/healthy-insects-check.md
 create mode 100644 templates/components/deployments/python/Dockerfile
 create mode 100644 templates/components/deployments/typescript/Dockerfile
 delete mode 100644 templates/types/simple/fastapi/README-template.md

diff --git a/.changeset/healthy-insects-check.md b/.changeset/healthy-insects-check.md
new file mode 100644
index 00000000..00b948e1
--- /dev/null
+++ b/.changeset/healthy-insects-check.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Add Dockerfile template
diff --git a/helpers/python.ts b/helpers/python.ts
index be9f03cc..047c4876 100644
--- a/helpers/python.ts
+++ b/helpers/python.ts
@@ -216,9 +216,10 @@ export const installPythonTemplate = async ({
     },
   });
 
+  const compPath = path.join(templatesDir, "components");
+
   if (dataSources.length > 0) {
     const enginePath = path.join(root, "app", "engine");
-    const compPath = path.join(templatesDir, "components");
 
     const vectorDbDirName = vectorDb ?? "none";
     const VectorDBPath = path.join(
@@ -265,7 +266,19 @@ export const installPythonTemplate = async ({
     // Generate loaders config
     // Web loader config
     if (dataSources.some((ds) => ds.type === "web")) {
-      const webLoaderConfig = dataSources
+      const webLoaderConfig = new Document({});
+
+      // Create config for browser driver arguments
+      const driverArgNodeValue = webLoaderConfig.createNode([
+        "--no-sandbox",
+        "--disable-dev-shm-usage",
+      ]);
+      driverArgNodeValue.commentBefore =
+        " The arguments to pass to the webdriver. E.g.: add --headless to run in headless mode";
+      webLoaderConfig.set("driver_arguments", driverArgNodeValue);
+
+      // Create config for urls
+      const urlConfigs = dataSources
         .filter((ds) => ds.type === "web")
         .map((ds) => {
           const dsConfig = ds.config as WebSourceConfig;
@@ -275,13 +288,15 @@ export const installPythonTemplate = async ({
             depth: dsConfig.depth,
           };
         });
-      // Add documentation to web loader config
-      const node = loaderConfig.createNode(webLoaderConfig);
-      node.commentBefore = ` base_url: The URL to start crawling with
+      const urlConfigNode = webLoaderConfig.createNode(urlConfigs);
+      urlConfigNode.commentBefore = ` base_url: The URL to start crawling with
  prefix: Only crawl URLs matching the specified prefix
  depth: The maximum depth for BFS traversal
  You can add more websites by adding more entries (don't forget the - prefix from YAML)`;
-      loaderConfig.set("web", node);
+      webLoaderConfig.set("urls", urlConfigNode);
+
+      // Add web config to the loaders config
+      loaderConfig.set("web", webLoaderConfig);
     }
     // File loader config
     if (dataSources.some((ds) => ds.type === "file")) {
@@ -308,4 +323,9 @@ export const installPythonTemplate = async ({
   if (postInstallAction === "runApp" || postInstallAction === "dependencies") {
     installPythonDependencies();
   }
+
+  // Copy deployment files for python
+  await copy("**", root, {
+    cwd: path.join(compPath, "deployments", "python"),
+  });
 };
diff --git a/helpers/typescript.ts b/helpers/typescript.ts
index d965106d..7dcf3ed4 100644
--- a/helpers/typescript.ts
+++ b/helpers/typescript.ts
@@ -295,4 +295,9 @@ export const installTSTemplate = async ({
   if (postInstallAction === "runApp" || postInstallAction === "dependencies") {
     await installTSDependencies(packageJson, packageManager, isOnline);
   }
+
+  // Copy deployment files for typescript
+  await copy("**", root, {
+    cwd: path.join(compPath, "deployments", "typescript"),
+  });
 };
diff --git a/templates/components/deployments/python/Dockerfile b/templates/components/deployments/python/Dockerfile
new file mode 100644
index 00000000..624364b6
--- /dev/null
+++ b/templates/components/deployments/python/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.11 as build
+
+WORKDIR /app
+
+ENV PYTHONPATH=/app
+
+# Install Poetry
+RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
+    cd /usr/local/bin && \
+    ln -s /opt/poetry/bin/poetry && \
+    poetry config virtualenvs.create false
+
+# Install Chromium for web loader
+# Can disable this if you don't use the web loader to reduce the image size
+RUN apt update && apt install -y chromium chromium-driver
+
+# Install dependencies
+COPY ./pyproject.toml ./poetry.lock* /app/
+RUN poetry install --no-root --no-cache --only main
+
+# ====================================
+FROM build as release
+
+COPY . .
+
+CMD ["python", "main.py"]
\ No newline at end of file
diff --git a/templates/components/deployments/typescript/Dockerfile b/templates/components/deployments/typescript/Dockerfile
new file mode 100644
index 00000000..b9b28f2d
--- /dev/null
+++ b/templates/components/deployments/typescript/Dockerfile
@@ -0,0 +1,22 @@
+FROM node:20-alpine as build
+
+WORKDIR /app
+
+# Install dependencies
+COPY package.json pnpm-lock.yaml* /app/
+RUN npm install 
+
+# Build the application
+COPY . .
+RUN npm run build
+
+# ====================================
+FROM build as release
+
+# Copy built output from the previous stage
+COPY --from=build /app/.next* ./.next
+COPY --from=build /app/public* ./public
+COPY --from=build /app/package.json ./package.json
+COPY --from=build /app/node_modules ./node_modules
+
+CMD ["npm", "start"]
\ No newline at end of file
diff --git a/templates/components/loaders/python/__init__.py b/templates/components/loaders/python/__init__.py
index 710144fb..662c65a9 100644
--- a/templates/components/loaders/python/__init__.py
+++ b/templates/components/loaders/python/__init__.py
@@ -26,8 +26,7 @@ def get_documents():
             document = get_file_documents(FileLoaderConfig(**loader_config))
             documents.extend(document)
         elif loader_type == "web":
-            for entry in loader_config:
-                document = get_web_documents(WebLoaderConfig(**entry))
-                documents.extend(document)
+            document = get_web_documents(WebLoaderConfig(**loader_config))
+            documents.extend(document)
 
     return documents
diff --git a/templates/components/loaders/python/web.py b/templates/components/loaders/python/web.py
index bca9aaf7..563e51b5 100644
--- a/templates/components/loaders/python/web.py
+++ b/templates/components/loaders/python/web.py
@@ -3,17 +3,34 @@ import json
 from pydantic import BaseModel, Field
 
 
-class WebLoaderConfig(BaseModel):
+class CrawlUrl(BaseModel):
     base_url: str
     prefix: str
     max_depth: int = Field(default=1, ge=0)
 
 
+class WebLoaderConfig(BaseModel):
+    driver_arguments: list[str] = Field(default=None)
+    urls: list[CrawlUrl]
+
+
 def get_web_documents(config: WebLoaderConfig):
     from llama_index.readers.web import WholeSiteReader
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+
+    options = Options()
+    driver_arguments = config.driver_arguments or []
+    for arg in driver_arguments:
+        options.add_argument(arg)
+
+    docs = []
+    for url in config.urls:
+        scraper = WholeSiteReader(
+            prefix=url.prefix,
+            max_depth=url.max_depth,
+            driver=webdriver.Chrome(options=options),
+        )
+        docs.extend(scraper.load_data(url.base_url))
 
-    scraper = WholeSiteReader(
-        prefix=config.prefix,
-        max_depth=config.max_depth,
-    )
-    return scraper.load_data(config.base_url)
+    return docs
diff --git a/templates/types/simple/fastapi/README-template.md b/templates/types/simple/fastapi/README-template.md
deleted file mode 100644
index 69ff766c..00000000
--- a/templates/types/simple/fastapi/README-template.md
+++ /dev/null
@@ -1,60 +0,0 @@
-This is a [LlamaIndex](https://www.llamaindex.ai/) project using [FastAPI](https://fastapi.tiangolo.com/) bootstrapped with [`create-llama`](https://github.com/run-llama/LlamaIndexTS/tree/main/packages/create-llama).
-
-## Getting Started
-
-First, setup the environment with poetry:
-
-> **_Note:_** This step is not needed if you are using the dev-container.
-
-```
-poetry install
-poetry shell
-```
-
-By default, we use the OpenAI LLM (though you can customize, see app/api/routers/chat.py). As a result you need to specify an `OPENAI_API_KEY` in an .env file in this directory.
-
-Example `backend/.env` file:
-
-```
-OPENAI_API_KEY=<openai_api_key>
-```
-
-If you are using any tools or data sources, you can update their config files in the `config` folder.
-
-Second, generate the embeddings of the documents in the `./data` directory (if this folder exists - otherwise, skip this step):
-
-```
-python app/engine/generate.py
-```
-
-Third, run the development server:
-
-```
-python main.py
-```
-
-Then call the API endpoint `/api/chat` to see the result:
-
-```
-curl --location 'localhost:8000/api/chat' \
---header 'Content-Type: application/json' \
---data '{ "messages": [{ "role": "user", "content": "Hello" }] }'
-```
-
-You can start editing the API by modifying `app/api/routers/chat.py`. The endpoint auto-updates as you save the file.
-
-Open [http://localhost:8000/docs](http://localhost:8000/docs) with your browser to see the Swagger UI of the API.
-
-The API allows CORS for all origins to simplify development. You can change this behavior by setting the `ENVIRONMENT` environment variable to `prod`:
-
-```
-ENVIRONMENT=prod python main.py
-```
-
-## Learn More
-
-To learn more about LlamaIndex, take a look at the following resources:
-
-- [LlamaIndex Documentation](https://docs.llamaindex.ai) - learn about LlamaIndex.
-
-You can check out [the LlamaIndex GitHub repository](https://github.com/run-llama/llama_index) - your feedback and contributions are welcome!
diff --git a/templates/types/streaming/express/README-template.md b/templates/types/streaming/express/README-template.md
index 62a5f248..61e164bf 100644
--- a/templates/types/streaming/express/README-template.md
+++ b/templates/types/streaming/express/README-template.md
@@ -60,6 +60,38 @@ NODE_ENV=production npm run start
 
 > Note that the `NODE_ENV` environment variable is set to `production`. This disables CORS for all origins.
 
+## Using Docker
+
+1. Build an image for Express app:
+
+```
+docker build -t <your_backend_image_name> .
+```
+
+2. Start the app:
+
+- Generate index data:
+
+```
+docker run --rm \
+  -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system
+  -v $(pwd)/config:/app/config \
+  -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database
+  <your_backend_image_name> \
+  npm run generate
+```
+
+- Start the API:
+
+```
+docker run \
+  -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system
+  -v $(pwd)/config:/app/config \
+  -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database
+  -p 8000:8000 \
+  <your_backend_image_name>
+```
+
 ## Learn More
 
 To learn more about LlamaIndex, take a look at the following resources:
diff --git a/templates/types/streaming/fastapi/README-template.md b/templates/types/streaming/fastapi/README-template.md
index ca7e0b33..33dbeba6 100644
--- a/templates/types/streaming/fastapi/README-template.md
+++ b/templates/types/streaming/fastapi/README-template.md
@@ -64,6 +64,39 @@ The API allows CORS for all origins to simplify development. You can change this
 ENVIRONMENT=prod python main.py
 ```
 
+## Using Docker
+
+1. Build an image for FastAPI app:
+
+```
+docker build -t <your_backend_image_name> .
+```
+
+2. Start the app:
+
+- Generate embedding for index data:
+
+```
+docker run \
+  --rm \
+  -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system
+  -v $(pwd)/config:/app/config \
+  -v $(pwd)/storage:/app/storage \ # Use your file system to store the vector database
+  <your_backend_image_name> \
+  python app/engine/generate.py
+```
+
+- Start the API:
+
+```
+docker run \
+  -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system
+  -v $(pwd)/config:/app/config \
+  -v $(pwd)/storage:/app/storage \ # Use your file system to store the vector database
+  -p 8000:8000 \
+  <your_backend_image_name>
+```
+
 ## Learn More
 
 To learn more about LlamaIndex, take a look at the following resources:
diff --git a/templates/types/streaming/nextjs/README-template.md b/templates/types/streaming/nextjs/README-template.md
index 81b3b93f..c522d311 100644
--- a/templates/types/streaming/nextjs/README-template.md
+++ b/templates/types/streaming/nextjs/README-template.md
@@ -26,6 +26,41 @@ You can start editing the page by modifying `app/page.tsx`. The page auto-update
 
 This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font.
 
+## Using Docker
+
+1. Build an image for the Next.js app:
+
+```
+docker build -t <your_app_image_name> .
+```
+
+2. Generate embeddings:
+
+Parse the data and generate the vector embeddings if the `./data` folder exists - otherwise, skip this step:
+
+```
+docker run \
+  --rm \
+  -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system
+  -v $(pwd)/config:/app/config \
+  -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database
+  -p 3000:3000 \
+  <your_app_image_name> \
+  npm run generate
+```
+
+3. Start the API
+
+```
+docker run \
+  --rm \
+  -v $(pwd)/.env:/app/.env \ # Use ENV variables and configuration from your file-system
+  -v $(pwd)/config:/app/config \
+  -v $(pwd)/cache:/app/cache \ # Use your file system to store the vector database
+  -p 3000:3000 \
+  <your_app_image_name>
+```
+
 ## Learn More
 
 To learn more about LlamaIndex, take a look at the following resources:
-- 
GitLab