From a84743c57692a45e521b9b201b7038c8b22727fd Mon Sep 17 00:00:00 2001
From: Huu Le <39040748+leehuwuj@users.noreply.github.com>
Date: Thu, 26 Dec 2024 15:09:16 +0700
Subject: [PATCH] add LlamaCloud support for reflex template (#473)

---
 .changeset/famous-ways-give.md                |   5 +
 .changeset/green-melons-thank.md              |   5 +
 create-app.ts                                 |   4 +-
 e2e/shared/multiagent_template.spec.ts        |  12 +-
 e2e/shared/reflex_template.spec.ts            |  10 +-
 e2e/utils.ts                                  |   8 +-
 helpers/python.ts                             |  35 +--
 helpers/types.ts                              |   4 +-
 helpers/typescript.ts                         |  18 +-
 index.ts                                      |   4 +-
 questions/questions.ts                        |  86 ++++--
 questions/simple.ts                           |  56 ++--
 .../types/reflex/app/api/routers/models.py    |  65 ++++
 templates/types/reflex/app/services/file.py   | 281 ++++++++++++++++++
 14 files changed, 490 insertions(+), 103 deletions(-)
 create mode 100644 .changeset/famous-ways-give.md
 create mode 100644 .changeset/green-melons-thank.md
 create mode 100644 templates/types/reflex/app/api/routers/models.py
 create mode 100644 templates/types/reflex/app/services/file.py

diff --git a/.changeset/famous-ways-give.md b/.changeset/famous-ways-give.md
new file mode 100644
index 00000000..08891c67
--- /dev/null
+++ b/.changeset/famous-ways-give.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Change --agents parameter to --use-case
diff --git a/.changeset/green-melons-thank.md b/.changeset/green-melons-thank.md
new file mode 100644
index 00000000..c17a3cd7
--- /dev/null
+++ b/.changeset/green-melons-thank.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Add LlamaCloud support for Reflex templates
diff --git a/create-app.ts b/create-app.ts
index 345ff369..b28dea5a 100644
--- a/create-app.ts
+++ b/create-app.ts
@@ -39,7 +39,7 @@ export async function createApp({
   tools,
   useLlamaParse,
   observability,
-  agents,
+  useCase,
 }: InstallAppArgs): Promise<void> {
   const root = path.resolve(appPath);
 
@@ -84,7 +84,7 @@ export async function createApp({
     tools,
     useLlamaParse,
     observability,
-    agents,
+    useCase,
   };
 
   // Install backend
diff --git a/e2e/shared/multiagent_template.spec.ts b/e2e/shared/multiagent_template.spec.ts
index ccb33539..0839d000 100644
--- a/e2e/shared/multiagent_template.spec.ts
+++ b/e2e/shared/multiagent_template.spec.ts
@@ -18,10 +18,10 @@ const templateUI: TemplateUI = "shadcn";
 const templatePostInstallAction: TemplatePostInstallAction = "runApp";
 const appType: AppType = templateFramework === "fastapi" ? "--frontend" : "";
 const userMessage = "Write a blog post about physical standards for letters";
-const templateAgents = ["financial_report", "blog", "form_filling"];
+const templateUseCases = ["financial_report", "blog", "form_filling"];
 
-for (const agents of templateAgents) {
-  test.describe(`Test multiagent template ${agents} ${templateFramework} ${dataSource} ${templateUI} ${appType} ${templatePostInstallAction}`, async () => {
+for (const useCase of templateUseCases) {
+  test.describe(`Test multiagent template ${useCase} ${templateFramework} ${dataSource} ${templateUI} ${appType} ${templatePostInstallAction}`, async () => {
     test.skip(
       process.platform !== "linux" || process.env.DATASOURCE === "--no-files",
       "The multiagent template currently only works with files. We also only run on Linux to speed up tests.",
@@ -46,7 +46,7 @@ for (const agents of templateAgents) {
         postInstallAction: templatePostInstallAction,
         templateUI,
         appType,
-        agents,
+        useCase,
       });
       name = result.projectName;
       appProcess = result.appProcess;
@@ -71,8 +71,8 @@ for (const agents of templateAgents) {
     }) => {
       test.skip(
         templatePostInstallAction !== "runApp" ||
-          agents === "financial_report" ||
-          agents === "form_filling" ||
+          useCase === "financial_report" ||
+          useCase === "form_filling" ||
           templateFramework === "express",
         "Skip chat tests for financial report and form filling.",
       );
diff --git a/e2e/shared/reflex_template.spec.ts b/e2e/shared/reflex_template.spec.ts
index 766d20a1..d20c6d7e 100644
--- a/e2e/shared/reflex_template.spec.ts
+++ b/e2e/shared/reflex_template.spec.ts
@@ -3,7 +3,7 @@ import { expect, test } from "@playwright/test";
 import { ChildProcess } from "child_process";
 import fs from "fs";
 import path from "path";
-import { TemplateAgents, TemplateFramework } from "../../helpers";
+import { TemplateFramework, TemplateUseCase } from "../../helpers";
 import { createTestDir, runCreateLlama } from "../utils";
 
 const templateFramework: TemplateFramework = process.env.FRAMEWORK
@@ -12,7 +12,7 @@ const templateFramework: TemplateFramework = process.env.FRAMEWORK
 const dataSource: string = process.env.DATASOURCE
   ? process.env.DATASOURCE
   : "--example-file";
-const templateAgents: TemplateAgents[] = ["extractor", "contract_review"];
+const templateUseCases: TemplateUseCase[] = ["extractor", "contract_review"];
 
 // The reflex template currently only works with FastAPI and files (and not on Windows)
 if (
@@ -20,8 +20,8 @@ if (
   templateFramework === "fastapi" &&
   dataSource === "--example-file"
 ) {
-  for (const agents of templateAgents) {
-    test.describe(`Test reflex template ${agents} ${templateFramework} ${dataSource}`, async () => {
+  for (const useCase of templateUseCases) {
+    test.describe(`Test reflex template ${useCase} ${templateFramework} ${dataSource}`, async () => {
       let appPort: number;
       let name: string;
       let appProcess: ChildProcess;
@@ -39,7 +39,7 @@ if (
           vectorDb: "none",
           port: appPort,
           postInstallAction: "runApp",
-          agents,
+          useCase,
         });
         name = result.projectName;
         appProcess = result.appProcess;
diff --git a/e2e/utils.ts b/e2e/utils.ts
index e7a9cc9a..a2d29083 100644
--- a/e2e/utils.ts
+++ b/e2e/utils.ts
@@ -33,7 +33,7 @@ export type RunCreateLlamaOptions = {
   tools?: string;
   useLlamaParse?: boolean;
   observability?: string;
-  agents?: string;
+  useCase?: string;
 };
 
 export async function runCreateLlama({
@@ -51,7 +51,7 @@ export async function runCreateLlama({
   tools,
   useLlamaParse,
   observability,
-  agents,
+  useCase,
 }: RunCreateLlamaOptions): Promise<CreateLlamaResult> {
   if (!process.env.OPENAI_API_KEY || !process.env.LLAMA_CLOUD_API_KEY) {
     throw new Error(
@@ -113,8 +113,8 @@ export async function runCreateLlama({
   if (observability) {
     commandArgs.push("--observability", observability);
   }
-  if ((templateType === "multiagent" || templateType === "reflex") && agents) {
-    commandArgs.push("--agents", agents);
+  if ((templateType === "multiagent" || templateType === "reflex") && useCase) {
+    commandArgs.push("--use-case", useCase);
   }
 
   const command = commandArgs.join(" ");
diff --git a/helpers/python.ts b/helpers/python.ts
index e1b2271f..5201c1b5 100644
--- a/helpers/python.ts
+++ b/helpers/python.ts
@@ -380,28 +380,32 @@ export const installPythonDependencies = (
 };
 
 export const installPythonTemplate = async ({
+  appName,
   root,
   template,
   framework,
   vectorDb,
+  postInstallAction,
+  modelConfig,
   dataSources,
   tools,
-  postInstallAction,
+  useLlamaParse,
+  useCase,
   observability,
-  modelConfig,
-  agents,
 }: Pick<
   InstallTemplateArgs,
+  | "appName"
   | "root"
-  | "framework"
   | "template"
+  | "framework"
   | "vectorDb"
+  | "postInstallAction"
+  | "modelConfig"
   | "dataSources"
   | "tools"
-  | "postInstallAction"
+  | "useLlamaParse"
+  | "useCase"
   | "observability"
-  | "modelConfig"
-  | "agents"
 >) => {
   console.log("\nInitializing Python project with template:", template, "\n");
   let templatePath;
@@ -476,21 +480,12 @@ export const installPythonTemplate = async ({
     await copyRouterCode(root, tools ?? []);
   }
 
-  if (template === "multiagent") {
-    // Copy multi-agent code
-    await copy("**", path.join(root), {
-      parents: true,
-      cwd: path.join(compPath, "multiagent", "python"),
-      rename: assetRelocator,
-    });
-  }
-
   if (template === "multiagent" || template === "reflex") {
-    if (agents) {
+    if (useCase) {
       const sourcePath =
         template === "multiagent"
-          ? path.join(compPath, "agents", "python", agents)
-          : path.join(compPath, "reflex", agents);
+          ? path.join(compPath, "agents", "python", useCase)
+          : path.join(compPath, "reflex", useCase);
 
       await copy("**", path.join(root), {
         parents: true,
@@ -500,7 +495,7 @@ export const installPythonTemplate = async ({
     } else {
       console.log(
         red(
-          `There is no agent selected for ${template} template. Please pick an agent to use via --agents flag.`,
+          `There is no use case selected for ${template} template. Please pick a use case to use via --use-case flag.`,
         ),
       );
       process.exit(1);
diff --git a/helpers/types.ts b/helpers/types.ts
index a4635f0e..544a2710 100644
--- a/helpers/types.ts
+++ b/helpers/types.ts
@@ -49,7 +49,7 @@ export type TemplateDataSource = {
 };
 export type TemplateDataSourceType = "file" | "web" | "db";
 export type TemplateObservability = "none" | "traceloop" | "llamatrace";
-export type TemplateAgents =
+export type TemplateUseCase =
   | "financial_report"
   | "blog"
   | "form_filling"
@@ -106,5 +106,5 @@ export interface InstallTemplateArgs {
   postInstallAction?: TemplatePostInstallAction;
   tools?: Tool[];
   observability?: TemplateObservability;
-  agents?: TemplateAgents;
+  useCase?: TemplateUseCase;
 }
diff --git a/helpers/typescript.ts b/helpers/typescript.ts
index 761a4bb3..b516cd39 100644
--- a/helpers/typescript.ts
+++ b/helpers/typescript.ts
@@ -26,7 +26,7 @@ export const installTSTemplate = async ({
   tools,
   dataSources,
   useLlamaParse,
-  agents,
+  useCase,
 }: InstallTemplateArgs & { backend: boolean }) => {
   console.log(bold(`Using ${packageManager}.`));
 
@@ -131,16 +131,16 @@ export const installTSTemplate = async ({
       cwd: path.join(multiagentPath, "workflow"),
     });
 
-    // Copy agents use case code for multiagent template
-    if (agents) {
-      console.log("\nCopying agent:", agents, "\n");
-      const useCasePath = path.join(compPath, "agents", "typescript", agents);
-      const agentsCodePath = path.join(useCasePath, "workflow");
+    // Copy use case code for multiagent template
+    if (useCase) {
+      console.log("\nCopying use case:", useCase, "\n");
+      const useCasePath = path.join(compPath, "agents", "typescript", useCase);
+      const useCaseCodePath = path.join(useCasePath, "workflow");
 
-      // Copy agent codes
+      // Copy use case codes
       await copy("**", path.join(root, relativeEngineDestPath, "workflow"), {
         parents: true,
-        cwd: agentsCodePath,
+        cwd: useCaseCodePath,
         rename: assetRelocator,
       });
 
@@ -153,7 +153,7 @@ export const installTSTemplate = async ({
     } else {
       console.log(
         red(
-          `There is no agent selected for ${template} template. Please pick an agent to use via --agents flag.`,
+          `There is no use case selected for ${template} template. Please pick a use case to use via --use-case flag.`,
         ),
       );
       process.exit(1);
diff --git a/index.ts b/index.ts
index 4a343899..370bd1e8 100644
--- a/index.ts
+++ b/index.ts
@@ -202,10 +202,10 @@ const program = new Command(packageJson.name)
     false,
   )
   .option(
-    "--agents <agents>",
+    "--use-case <useCase>",
     `
 
-  Select which agents to use for the multi-agent template (e.g: financial_report, blog).
+  Select which use case to use for the multi-agent template (e.g: financial_report, blog).
 `,
   )
   .allowUnknownOption()
diff --git a/questions/questions.ts b/questions/questions.ts
index ebc83396..55983997 100644
--- a/questions/questions.ts
+++ b/questions/questions.ts
@@ -2,7 +2,7 @@ import { blue } from "picocolors";
 import prompts from "prompts";
 import { isCI } from ".";
 import { COMMUNITY_OWNER, COMMUNITY_REPO } from "../helpers/constant";
-import { EXAMPLE_FILE } from "../helpers/datasources";
+import { EXAMPLE_FILE, EXAMPLE_GDPR } from "../helpers/datasources";
 import { getAvailableLlamapackOptions } from "../helpers/llama-pack";
 import { askModelConfig } from "../helpers/providers";
 import { getProjectOptions } from "../helpers/repo";
@@ -33,7 +33,7 @@ export const askProQuestions = async (program: QuestionArgs) => {
             title: "Multi-agent app (using workflows)",
             value: "multiagent",
           },
-          { title: "Structured Extractor", value: "extractor" },
+          { title: "Fullstack python template with Reflex", value: "reflex" },
           {
             title: `Community template from ${styledRepo}`,
             value: "community",
@@ -100,6 +100,24 @@ export const askProQuestions = async (program: QuestionArgs) => {
     // So we just use example file for extractor template, this allows user to choose vector database later
     program.dataSources = [EXAMPLE_FILE];
     program.framework = "fastapi";
+    // Ask for which Reflex use case to use
+    const { useCase } = await prompts(
+      {
+        type: "select",
+        name: "useCase",
+        message: "Which use case would you like to build?",
+        choices: [
+          { title: "Structured Extractor", value: "extractor" },
+          {
+            title: "Contract review (using Workflow)",
+            value: "contract_review",
+          },
+        ],
+        initial: 0,
+      },
+      questionHandlers,
+    );
+    program.useCase = useCase;
   }
 
   if (!program.framework) {
@@ -171,32 +189,50 @@ export const askProQuestions = async (program: QuestionArgs) => {
     program.observability = observability;
   }
 
-  // Ask agents
-  if (program.template === "multiagent" && !program.agents) {
-    const { agents } = await prompts(
+  if (
+    (program.template === "reflex" || program.template === "multiagent") &&
+    !program.useCase
+  ) {
+    const choices =
+      program.template === "reflex"
+        ? [
+            { title: "Structured Extractor", value: "extractor" },
+            {
+              title: "Contract review (using Workflow)",
+              value: "contract_review",
+            },
+          ]
+        : [
+            {
+              title: "Financial report (generate a financial report)",
+              value: "financial_report",
+            },
+            {
+              title: "Form filling (fill missing value in a CSV file)",
+              value: "form_filling",
+            },
+            { title: "Blog writer (Write a blog post)", value: "blog" },
+          ];
+
+    const { useCase } = await prompts(
       {
         type: "select",
-        name: "agents",
-        message: "Which agents would you like to use?",
-        choices: [
-          {
-            title: "Financial report (generate a financial report)",
-            value: "financial_report",
-          },
-          {
-            title: "Form filling (fill missing value in a CSV file)",
-            value: "form_filling",
-          },
-          {
-            title: "Blog writer (Write a blog post)",
-            value: "blog_writer",
-          },
-        ],
+        name: "useCase",
+        message: "Which use case would you like to use?",
+        choices,
         initial: 0,
       },
       questionHandlers,
     );
-    program.agents = agents;
+    program.useCase = useCase;
+  }
+
+  // Configure framework and data sources for Reflex template
+  if (program.template === "reflex") {
+    program.framework = "fastapi";
+
+    program.dataSources =
+      program.useCase === "extractor" ? [EXAMPLE_FILE] : [EXAMPLE_GDPR];
   }
 
   if (!program.modelConfig) {
@@ -222,8 +258,8 @@ export const askProQuestions = async (program: QuestionArgs) => {
     program.vectorDb = vectorDb;
   }
 
-  if (program.vectorDb === "llamacloud") {
-    // When using a LlamaCloud index, don't ask for data sources just copy an example file
+  if (program.vectorDb === "llamacloud" && program.dataSources.length === 0) {
+    // When using a LlamaCloud index and no data sources are provided, just copy an example file
     program.dataSources = [EXAMPLE_FILE];
   }
 
@@ -354,7 +390,7 @@ export const askProQuestions = async (program: QuestionArgs) => {
     // default to use LlamaParse if using LlamaCloud
     program.useLlamaParse = true;
   } else {
-    // Reflex template doesn't support LlamaParse and LlamaCloud right now (cannot use asyncio loop in Reflex)
+    // Reflex template doesn't support LlamaParse right now (cannot use asyncio loop in Reflex)
     if (program.useLlamaParse === undefined && program.template !== "reflex") {
       // if already set useLlamaParse, don't ask again
       if (program.dataSources.some((ds) => ds.type === "file")) {
diff --git a/questions/simple.ts b/questions/simple.ts
index 19826198..eb85065f 100644
--- a/questions/simple.ts
+++ b/questions/simple.ts
@@ -74,34 +74,34 @@ export const askSimpleQuestions = async (
       questionHandlers,
     );
     language = newLanguage;
+  }
+
+  const { useLlamaCloud: newUseLlamaCloud } = await prompts(
+    {
+      type: "toggle",
+      name: "useLlamaCloud",
+      message: "Do you want to use LlamaCloud services?",
+      initial: false,
+      active: "Yes",
+      inactive: "No",
+      hint: "see https://www.llamaindex.ai/enterprise for more info",
+    },
+    questionHandlers,
+  );
+  useLlamaCloud = newUseLlamaCloud;
 
-    const { useLlamaCloud: newUseLlamaCloud } = await prompts(
+  if (useLlamaCloud && !llamaCloudKey) {
+    // Ask for LlamaCloud API key, if not set
+    const { llamaCloudKey: newLlamaCloudKey } = await prompts(
       {
-        type: "toggle",
-        name: "useLlamaCloud",
-        message: "Do you want to use LlamaCloud services?",
-        initial: false,
-        active: "Yes",
-        inactive: "No",
-        hint: "see https://www.llamaindex.ai/enterprise for more info",
+        type: "text",
+        name: "llamaCloudKey",
+        message:
+          "Please provide your LlamaCloud API key (leave blank to skip):",
       },
       questionHandlers,
     );
-    useLlamaCloud = newUseLlamaCloud;
-
-    if (useLlamaCloud && !llamaCloudKey) {
-      // Ask for LlamaCloud API key, if not set
-      const { llamaCloudKey: newLlamaCloudKey } = await prompts(
-        {
-          type: "text",
-          name: "llamaCloudKey",
-          message:
-            "Please provide your LlamaCloud API key (leave blank to skip):",
-        },
-        questionHandlers,
-      );
-      llamaCloudKey = newLlamaCloudKey || process.env.LLAMA_CLOUD_API_KEY;
-    }
+    llamaCloudKey = newLlamaCloudKey || process.env.LLAMA_CLOUD_API_KEY;
   }
 
   const results = await convertAnswers(args, {
@@ -133,7 +133,7 @@ const convertAnswers = async (
     AppType,
     Pick<
       QuestionResults,
-      "template" | "tools" | "frontend" | "dataSources" | "agents"
+      "template" | "tools" | "frontend" | "dataSources" | "useCase"
     > & {
       modelConfig?: ModelConfig;
     }
@@ -160,7 +160,7 @@ const convertAnswers = async (
     },
     financial_report_agent: {
       template: "multiagent",
-      agents: "financial_report",
+      useCase: "financial_report",
       tools: getTools(["document_generator", "interpreter"]),
       dataSources: EXAMPLE_10K_SEC_FILES,
       frontend: true,
@@ -168,7 +168,7 @@ const convertAnswers = async (
     },
     form_filling: {
       template: "multiagent",
-      agents: "form_filling",
+      useCase: "form_filling",
       tools: getTools(["form_filling"]),
       dataSources: EXAMPLE_10K_SEC_FILES,
       frontend: true,
@@ -176,14 +176,14 @@ const convertAnswers = async (
     },
     extractor: {
       template: "reflex",
-      agents: "extractor",
+      useCase: "extractor",
       tools: [],
       frontend: false,
       dataSources: [EXAMPLE_FILE],
     },
     contract_review: {
       template: "reflex",
-      agents: "contract_review",
+      useCase: "contract_review",
       tools: [],
       frontend: false,
       dataSources: [EXAMPLE_GDPR],
diff --git a/templates/types/reflex/app/api/routers/models.py b/templates/types/reflex/app/api/routers/models.py
new file mode 100644
index 00000000..db672e78
--- /dev/null
+++ b/templates/types/reflex/app/api/routers/models.py
@@ -0,0 +1,65 @@
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+from llama_index.core.schema import NodeWithScore
+from pydantic import BaseModel
+
+from app.config import DATA_DIR
+
+logger = logging.getLogger("uvicorn")
+
+
+class SourceNodes(BaseModel):
+    id: str
+    metadata: Dict[str, Any]
+    score: Optional[float]
+    text: str
+    url: Optional[str]
+
+    @classmethod
+    def from_source_node(cls, source_node: NodeWithScore):
+        metadata = source_node.node.metadata
+        url = cls.get_url_from_metadata(metadata)
+
+        return cls(
+            id=source_node.node.node_id,
+            metadata=metadata,
+            score=source_node.score,
+            text=source_node.node.text,  # type: ignore
+            url=url,
+        )
+
+    @classmethod
+    def get_url_from_metadata(cls, metadata: Dict[str, Any]) -> Optional[str]:
+        url_prefix = os.getenv("FILESERVER_URL_PREFIX")
+        if not url_prefix:
+            logger.warning(
+                "Warning: FILESERVER_URL_PREFIX not set in environment variables. Can't use file server"
+            )
+        file_name = metadata.get("file_name")
+
+        if file_name and url_prefix:
+            # file_name exists and file server is configured
+            pipeline_id = metadata.get("pipeline_id")
+            if pipeline_id:
+                # file is from LlamaCloud
+                file_name = f"{pipeline_id}${file_name}"
+                return f"{url_prefix}/output/llamacloud/{file_name}"
+            is_private = metadata.get("private", "false") == "true"
+            if is_private:
+                # file is a private upload
+                return f"{url_prefix}/output/uploaded/{file_name}"
+            # file is from calling the 'generate' script
+            # Get the relative path of file_path to data_dir
+            file_path = metadata.get("file_path")
+            data_dir = os.path.abspath(DATA_DIR)
+            if file_path and data_dir:
+                relative_path = os.path.relpath(file_path, data_dir)
+                return f"{url_prefix}/data/{relative_path}"
+        # fallback to URL in metadata (e.g. for websites)
+        return metadata.get("URL")
+
+    @classmethod
+    def from_source_nodes(cls, source_nodes: List[NodeWithScore]):
+        return [cls.from_source_node(node) for node in source_nodes]
diff --git a/templates/types/reflex/app/services/file.py b/templates/types/reflex/app/services/file.py
new file mode 100644
index 00000000..3fc1a64f
--- /dev/null
+++ b/templates/types/reflex/app/services/file.py
@@ -0,0 +1,281 @@
+import base64
+import logging
+import mimetypes
+import os
+import re
+import uuid
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from llama_index.core import VectorStoreIndex
+from llama_index.core.ingestion import IngestionPipeline
+from llama_index.core.readers.file.base import (
+    _try_loading_included_file_formats as get_file_loaders_map,
+)
+from llama_index.core.schema import Document
+from llama_index.indices.managed.llama_cloud.base import LlamaCloudIndex
+from llama_index.readers.file import FlatReader
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+PRIVATE_STORE_PATH = str(Path("output", "uploaded"))
+TOOL_STORE_PATH = str(Path("output", "tools"))
+LLAMA_CLOUD_STORE_PATH = str(Path("output", "llamacloud"))
+
+
+class DocumentFile(BaseModel):
+    id: str
+    name: str  # Stored file name
+    type: str = None
+    size: int = None
+    url: str = None
+    path: Optional[str] = Field(
+        None,
+        description="The stored file path. Used internally in the server.",
+        exclude=True,
+    )
+    refs: Optional[List[str]] = Field(
+        None, description="The document ids in the index."
+    )
+
+
+class FileService:
+    """
+    To store the files uploaded by the user and add them to the index.
+    """
+
+    @classmethod
+    def process_private_file(
+        cls,
+        file_name: str,
+        base64_content: str,
+        params: Optional[dict] = None,
+    ) -> DocumentFile:
+        """
+        Store the uploaded file and index it if necessary.
+        """
+        try:
+            from app.engine.index import IndexConfig, get_index
+        except ImportError as e:
+            raise ValueError("IndexConfig or get_index is not found") from e
+
+        if params is None:
+            params = {}
+
+        # Add the nodes to the index and persist it
+        index_config = IndexConfig(**params)
+        index = get_index(index_config)
+
+        # Preprocess and store the file
+        file_data, extension = cls._preprocess_base64_file(base64_content)
+
+        document_file = cls.save_file(
+            file_data,
+            file_name=file_name,
+            save_dir=PRIVATE_STORE_PATH,
+        )
+
+        # Don't index csv files (they are handled by tools)
+        if extension == "csv":
+            return document_file
+        else:
+            # Insert the file into the index and update document ids to the file metadata
+            if isinstance(index, LlamaCloudIndex):
+                doc_id = cls._add_file_to_llama_cloud_index(
+                    index, document_file.name, file_data
+                )
+                # Add document ids to the file metadata
+                document_file.refs = [doc_id]
+            else:
+                documents = cls._load_file_to_documents(document_file)
+                cls._add_documents_to_vector_store_index(documents, index)
+                # Add document ids to the file metadata
+                document_file.refs = [doc.doc_id for doc in documents]
+
+        # Return the file metadata
+        return document_file
+
+    @classmethod
+    def save_file(
+        cls,
+        content: bytes | str,
+        file_name: str,
+        save_dir: Optional[str] = None,
+    ) -> DocumentFile:
+        """
+        Save the content to a file in the local file server (accessible via URL)
+
+        Args:
+            content (bytes | str): The content to save, either bytes or string.
+            file_name (str): The original name of the file.
+            save_dir (Optional[str]): The relative path from the current working directory. Defaults to the `output/uploaded` directory.
+        Returns:
+            The metadata of the saved file.
+        """
+        if save_dir is None:
+            save_dir = os.path.join("output", "uploaded")
+
+        file_id = str(uuid.uuid4())
+        name, extension = os.path.splitext(file_name)
+        extension = extension.lstrip(".")
+        sanitized_name = _sanitize_file_name(name)
+        if extension == "":
+            raise ValueError("File is not supported!")
+        new_file_name = f"{sanitized_name}_{file_id}.{extension}"
+
+        file_path = os.path.join(save_dir, new_file_name)
+
+        if isinstance(content, str):
+            content = content.encode()
+
+        try:
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            with open(file_path, "wb") as file:
+                file.write(content)
+        except PermissionError as e:
+            logger.error(
+                f"Permission denied when writing to file {file_path}: {str(e)}"
+            )
+            raise
+        except IOError as e:
+            logger.error(
+                f"IO error occurred when writing to file {file_path}: {str(e)}"
+            )
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error when writing to file {file_path}: {str(e)}")
+            raise
+
+        logger.info(f"Saved file to {file_path}")
+
+        file_url_prefix = os.getenv("FILESERVER_URL_PREFIX")
+        if file_url_prefix is None:
+            logger.warning(
+                "FILESERVER_URL_PREFIX is not set, fallback to http://localhost:8000/api/files"
+            )
+            file_url_prefix = "http://localhost:8000/api/files"
+        file_size = os.path.getsize(file_path)
+
+        file_url = os.path.join(
+            file_url_prefix,
+            save_dir,
+            new_file_name,
+        )
+
+        return DocumentFile(
+            id=file_id,
+            name=new_file_name,
+            type=extension,
+            size=file_size,
+            path=file_path,
+            url=file_url,
+            refs=None,
+        )
+
+    @staticmethod
+    def _preprocess_base64_file(base64_content: str) -> Tuple[bytes, str | None]:
+        header, data = base64_content.split(",", 1)
+        mime_type = header.split(";")[0].split(":", 1)[1]
+        extension = mimetypes.guess_extension(mime_type).lstrip(".")
+        # File data as bytes
+        return base64.b64decode(data), extension
+
+    @staticmethod
+    def _load_file_to_documents(file: DocumentFile) -> List[Document]:
+        """
+        Load the file from the private directory and return the documents
+        """
+        _, extension = os.path.splitext(file.name)
+        extension = extension.lstrip(".")
+
+        # Load file to documents
+        # If LlamaParse is enabled, use it to parse the file
+        # Otherwise, use the default file loaders
+        reader = _get_llamaparse_parser()
+        if reader is None:
+            reader_cls = _default_file_loaders_map().get(f".{extension}")
+            if reader_cls is None:
+                raise ValueError(f"File extension {extension} is not supported")
+            reader = reader_cls()
+        if file.path is None:
+            raise ValueError("Document file path is not set")
+        documents = reader.load_data(Path(file.path))
+        # Add custom metadata
+        for doc in documents:
+            doc.metadata["file_name"] = file.name
+            doc.metadata["private"] = "true"
+        return documents
+
+    @staticmethod
+    def _add_documents_to_vector_store_index(
+        documents: List[Document], index: VectorStoreIndex
+    ) -> None:
+        """
+        Add the documents to the vector store index
+        """
+        pipeline = IngestionPipeline()
+        nodes = pipeline.run(documents=documents)
+
+        # Add the nodes to the index and persist it
+        if index is None:
+            index = VectorStoreIndex(nodes=nodes)
+        else:
+            index.insert_nodes(nodes=nodes)
+        index.storage_context.persist(
+            persist_dir=os.environ.get("STORAGE_DIR", "storage")
+        )
+
+    @staticmethod
+    def _add_file_to_llama_cloud_index(
+        index: LlamaCloudIndex,
+        file_name: str,
+        file_data: bytes,
+    ) -> str:
+        """
+        Add the file to the LlamaCloud index.
+        LlamaCloudIndex is a managed index so we can directly use the files.
+        """
+        try:
+            from app.engine.service import LLamaCloudFileService  # type: ignore
+        except ImportError as e:
+            raise ValueError("LlamaCloudFileService is not found") from e
+
+        # LlamaCloudIndex is a managed index so we can directly use the files
+        upload_file = (file_name, BytesIO(file_data))
+        doc_id = LLamaCloudFileService.add_file_to_pipeline(
+            index.project.id,
+            index.pipeline.id,
+            upload_file,
+            custom_metadata={},
+            wait_for_processing=True,
+        )
+        return doc_id
+
+
+def _sanitize_file_name(file_name: str) -> str:
+    """
+    Sanitize the file name by replacing all non-alphanumeric characters with underscores
+    """
+    sanitized_name = re.sub(r"[^a-zA-Z0-9.]", "_", file_name)
+    return sanitized_name
+
+
+def _get_llamaparse_parser():
+    from app.engine.loaders import load_configs
+    from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser
+
+    config = load_configs()
+    file_loader_config = FileLoaderConfig(**config["file"])
+    if file_loader_config.use_llama_parse:
+        return llama_parse_parser()
+    else:
+        return None
+
+
+def _default_file_loaders_map():
+    default_loaders = get_file_loaders_map()
+    default_loaders[".txt"] = FlatReader
+    default_loaders[".csv"] = FlatReader
+    return default_loaders
-- 
GitLab