From 39aaafcb6cbfe5340813be412b835e2adf8b4239 Mon Sep 17 00:00:00 2001 From: Marcus Schiesser <mail@marcusschiesser.de> Date: Wed, 20 Mar 2024 11:25:55 +0700 Subject: [PATCH] Add support for llamaparse using Typescript (#11) --- helpers/env-variables.ts | 30 ++++++++++----- helpers/index.ts | 31 +++++++-------- helpers/typescript.ts | 38 +++++++++++++++---- questions.ts | 5 +-- .../loaders/typescript/file/loader.mjs | 9 +++++ .../loaders/typescript/llama_parse/loader.mjs | 19 ++++++++++ .../vectordbs/typescript/milvus/generate.mjs | 12 ++---- .../vectordbs/typescript/milvus/shared.mjs | 1 - .../vectordbs/typescript/mongo/generate.mjs | 8 ++-- .../vectordbs/typescript/mongo/shared.mjs | 1 - .../vectordbs/typescript/none/constants.mjs | 1 - .../vectordbs/typescript/none/generate.mjs | 13 ++----- .../vectordbs/typescript/pg/generate.mjs | 10 ++--- .../vectordbs/typescript/pg/shared.mjs | 2 +- .../typescript/pinecone/generate.mjs | 8 ++-- .../vectordbs/typescript/pinecone/shared.mjs | 1 - 16 files changed, 112 insertions(+), 77 deletions(-) create mode 100644 templates/components/loaders/typescript/file/loader.mjs create mode 100644 templates/components/loaders/typescript/llama_parse/loader.mjs diff --git a/helpers/env-variables.ts b/helpers/env-variables.ts index c5e65442..2b084f3a 100644 --- a/helpers/env-variables.ts +++ b/helpers/env-variables.ts @@ -99,7 +99,10 @@ const getVectorDBEnvs = (vectorDb: TemplateVectorDB) => { } }; -const getDataSourceEnvs = (dataSource: TemplateDataSource) => { +const getDataSourceEnvs = ( + dataSource: TemplateDataSource, + llamaCloudKey?: string, +) => { switch (dataSource.type) { case "web": return [ @@ -116,6 +119,19 @@ const getDataSourceEnvs = (dataSource: TemplateDataSource) => { description: "The maximum depth to scrape.", }, ]; + case "file": + case "folder": + return [ + ...((dataSource?.config as FileSourceConfig).useLlamaParse + ? [ + { + name: "LLAMA_CLOUD_API_KEY", + description: `The Llama Cloud API key.`, + value: llamaCloudKey, + }, + ] + : []), + ]; default: return []; } @@ -149,10 +165,13 @@ export const createBackendEnvFile = async ( description: "The OpenAI API key to use.", value: opts.openAiKey, }, + // Add vector database environment variables ...(opts.vectorDb ? getVectorDBEnvs(opts.vectorDb) : []), // Add data source environment variables - ...(opts.dataSource ? getDataSourceEnvs(opts.dataSource) : []), + ...(opts.dataSource + ? getDataSourceEnvs(opts.dataSource, opts.llamaCloudKey) + : []), ]; let envVars: EnvVar[] = []; if (opts.framework === "fastapi") { @@ -204,13 +223,6 @@ We have provided context information below. Given this information, please answer the question: {query_str} "`, }, - (opts?.dataSource?.config as FileSourceConfig).useLlamaParse - ? { - name: "LLAMA_CLOUD_API_KEY", - description: `The Llama Cloud API key.`, - value: opts.llamaCloudKey, - } - : {}, ], ]; } else { diff --git a/helpers/index.ts b/helpers/index.ts index 3c99cc17..60aec506 100644 --- a/helpers/index.ts +++ b/helpers/index.ts @@ -42,30 +42,27 @@ async function generateContextData( ? llamaCloudKey || process.env["LLAMA_CLOUD_API_KEY"] : true; const hasVectorDb = vectorDb && vectorDb !== "none"; - if (framework === "fastapi") { - if ( - openAiKeyConfigured && - llamaCloudKeyConfigured && - !hasVectorDb && - isHavingPoetryLockFile() - ) { - console.log(`Running ${runGenerate} to generate the context data.`); - const result = tryPoetryRun("python app/engine/generate.py"); - if (!result) { - console.log(`Failed to run ${runGenerate}.`); - process.exit(1); + if (openAiKeyConfigured && llamaCloudKeyConfigured && !hasVectorDb) { + // If all the required environment variables are set, run the generate script + if (framework === "fastapi") { + if (isHavingPoetryLockFile()) { + console.log(`Running ${runGenerate} to generate the context data.`); + const result = tryPoetryRun("python app/engine/generate.py"); + if (!result) { + console.log(`Failed to run ${runGenerate}.`); + process.exit(1); + } + console.log(`Generated context data`); + return; } - console.log(`Generated context data`); - return; - } - } else { - if (openAiKeyConfigured && vectorDb === "none") { + } else { console.log(`Running ${runGenerate} to generate the context data.`); await callPackageManager(packageManager, true, ["run", "generate"]); return; } } + // generate the message of what to do to run the generate script manually const settings = []; if (!openAiKeyConfigured) settings.push("your OpenAI key"); if (!llamaCloudKeyConfigured) settings.push("your Llama Cloud key"); diff --git a/helpers/typescript.ts b/helpers/typescript.ts index 902a7d4d..91dd5b82 100644 --- a/helpers/typescript.ts +++ b/helpers/typescript.ts @@ -6,7 +6,7 @@ import { copy } from "../helpers/copy"; import { callPackageManager } from "../helpers/install"; import { templatesDir } from "./dir"; import { PackageManager } from "./get-pkg-manager"; -import { InstallTemplateArgs } from "./types"; +import { FileSourceConfig, InstallTemplateArgs } from "./types"; const rename = (name: string) => { switch (name) { @@ -64,6 +64,7 @@ export const installTSTemplate = async ({ postInstallAction, backend, observability, + dataSource, }: InstallTemplateArgs & { backend: boolean }) => { console.log(bold(`Using ${packageManager}.`)); @@ -118,6 +119,7 @@ export const installTSTemplate = async ({ } } + // copy observability component if (observability && observability !== "none") { const chosenObservabilityPath = path.join( templatesDir, @@ -150,20 +152,40 @@ export const installTSTemplate = async ({ vectorDBFolder = vectorDb; } - const VectorDBPath = path.join( + relativeEngineDestPath = + framework === "nextjs" + ? path.join("app", "api", "chat") + : path.join("src", "controllers"); + + const enginePath = path.join(root, relativeEngineDestPath, "engine"); + + // copy vector db component + const vectorDBPath = path.join( compPath, "vectordbs", "typescript", vectorDBFolder, ); - relativeEngineDestPath = - framework === "nextjs" - ? path.join("app", "api", "chat") - : path.join("src", "controllers"); - await copy("**", path.join(root, relativeEngineDestPath, "engine"), { + await copy("**", enginePath, { parents: true, - cwd: VectorDBPath, + cwd: vectorDBPath, }); + + // copy loader component + const dataSourceType = dataSource?.type; + if (dataSourceType && dataSourceType !== "none") { + let loaderFolder: string; + if (dataSourceType === "file" || dataSourceType === "folder") { + const dataSourceConfig = dataSource?.config as FileSourceConfig; + loaderFolder = dataSourceConfig.useLlamaParse ? "llama_parse" : "file"; + } else { + loaderFolder = dataSourceType; + } + await copy("**", enginePath, { + parents: true, + cwd: path.join(compPath, "loaders", "typescript", loaderFolder), + }); + } } /** diff --git a/questions.ts b/questions.ts index 5e84c82b..3134c315 100644 --- a/questions.ts +++ b/questions.ts @@ -691,9 +691,8 @@ export const askQuestions = async ( } if ( - (program.dataSource?.type === "file" || - program.dataSource?.type === "folder") && - program.framework === "fastapi" + program.dataSource?.type === "file" || + program.dataSource?.type === "folder" ) { if (ciInfo.isCI) { program.llamaCloudKey = getPrefOrDefault("llamaCloudKey"); diff --git a/templates/components/loaders/typescript/file/loader.mjs b/templates/components/loaders/typescript/file/loader.mjs new file mode 100644 index 00000000..3039f34f --- /dev/null +++ b/templates/components/loaders/typescript/file/loader.mjs @@ -0,0 +1,9 @@ +import { SimpleDirectoryReader } from "llamaindex"; + +export const DATA_DIR = "./data"; + +export async function getDocuments() { + return await new SimpleDirectoryReader().loadData({ + directoryPath: DATA_DIR, + }); +} diff --git a/templates/components/loaders/typescript/llama_parse/loader.mjs b/templates/components/loaders/typescript/llama_parse/loader.mjs new file mode 100644 index 00000000..f285673f --- /dev/null +++ b/templates/components/loaders/typescript/llama_parse/loader.mjs @@ -0,0 +1,19 @@ +import { + FILE_EXT_TO_READER, + LlamaParseReader, + SimpleDirectoryReader, +} from "llamaindex"; + +export const DATA_DIR = "./data"; + +export async function getDocuments() { + const reader = new SimpleDirectoryReader(); + // Load PDFs using LlamaParseReader + return await reader.loadData({ + directoryPath: DATA_DIR, + fileExtToReader: { + ...FILE_EXT_TO_READER, + pdf: new LlamaParseReader({ resultType: "markdown" }), + }, + }); +} diff --git a/templates/components/vectordbs/typescript/milvus/generate.mjs b/templates/components/vectordbs/typescript/milvus/generate.mjs index 905a066c..11c3184d 100644 --- a/templates/components/vectordbs/typescript/milvus/generate.mjs +++ b/templates/components/vectordbs/typescript/milvus/generate.mjs @@ -2,15 +2,11 @@ import * as dotenv from "dotenv"; import { MilvusVectorStore, - SimpleDirectoryReader, VectorStoreIndex, storageContextFromDefaults, } from "llamaindex"; -import { - STORAGE_DIR, - checkRequiredEnvVars, - getMilvusClient, -} from "./shared.mjs"; +import { getDocuments } from "./loader.mjs"; +import { checkRequiredEnvVars, getMilvusClient } from "./shared.mjs"; dotenv.config(); @@ -18,9 +14,7 @@ const collectionName = process.env.MILVUS_COLLECTION; async function loadAndIndex() { // load objects from storage and convert them into LlamaIndex Document objects - const documents = await new SimpleDirectoryReader().loadData({ - directoryPath: STORAGE_DIR, - }); + const documents = await getDocuments(); // Connect to Milvus const milvusClient = getMilvusClient(); diff --git a/templates/components/vectordbs/typescript/milvus/shared.mjs b/templates/components/vectordbs/typescript/milvus/shared.mjs index 0a35d715..a02ea57c 100644 --- a/templates/components/vectordbs/typescript/milvus/shared.mjs +++ b/templates/components/vectordbs/typescript/milvus/shared.mjs @@ -1,6 +1,5 @@ import { MilvusClient } from "@zilliz/milvus2-sdk-node"; -export const STORAGE_DIR = "./data"; export const CHUNK_SIZE = 512; export const CHUNK_OVERLAP = 20; diff --git a/templates/components/vectordbs/typescript/mongo/generate.mjs b/templates/components/vectordbs/typescript/mongo/generate.mjs index 7337d122..618859eb 100644 --- a/templates/components/vectordbs/typescript/mongo/generate.mjs +++ b/templates/components/vectordbs/typescript/mongo/generate.mjs @@ -2,12 +2,12 @@ import * as dotenv from "dotenv"; import { MongoDBAtlasVectorSearch, - SimpleDirectoryReader, VectorStoreIndex, storageContextFromDefaults, } from "llamaindex"; import { MongoClient } from "mongodb"; -import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs"; +import { getDocuments } from "./loader.mjs"; +import { checkRequiredEnvVars } from "./shared.mjs"; dotenv.config(); @@ -21,9 +21,7 @@ async function loadAndIndex() { const client = new MongoClient(mongoUri); // load objects from storage and convert them into LlamaIndex Document objects - const documents = await new SimpleDirectoryReader().loadData({ - directoryPath: STORAGE_DIR, - }); + const documents = await getDocuments(); // create Atlas as a vector store const vectorStore = new MongoDBAtlasVectorSearch({ diff --git a/templates/components/vectordbs/typescript/mongo/shared.mjs b/templates/components/vectordbs/typescript/mongo/shared.mjs index 264a82f0..ab467182 100644 --- a/templates/components/vectordbs/typescript/mongo/shared.mjs +++ b/templates/components/vectordbs/typescript/mongo/shared.mjs @@ -1,4 +1,3 @@ -export const STORAGE_DIR = "./data"; export const CHUNK_SIZE = 512; export const CHUNK_OVERLAP = 20; diff --git a/templates/components/vectordbs/typescript/none/constants.mjs b/templates/components/vectordbs/typescript/none/constants.mjs index 8cfb403c..42a8664a 100644 --- a/templates/components/vectordbs/typescript/none/constants.mjs +++ b/templates/components/vectordbs/typescript/none/constants.mjs @@ -1,4 +1,3 @@ -export const STORAGE_DIR = "./data"; export const STORAGE_CACHE_DIR = "./cache"; export const CHUNK_SIZE = 512; export const CHUNK_OVERLAP = 20; diff --git a/templates/components/vectordbs/typescript/none/generate.mjs b/templates/components/vectordbs/typescript/none/generate.mjs index 9334f98e..5b3987c1 100644 --- a/templates/components/vectordbs/typescript/none/generate.mjs +++ b/templates/components/vectordbs/typescript/none/generate.mjs @@ -1,18 +1,13 @@ import { serviceContextFromDefaults, - SimpleDirectoryReader, storageContextFromDefaults, VectorStoreIndex, } from "llamaindex"; import * as dotenv from "dotenv"; -import { - CHUNK_OVERLAP, - CHUNK_SIZE, - STORAGE_CACHE_DIR, - STORAGE_DIR, -} from "./constants.mjs"; +import { CHUNK_OVERLAP, CHUNK_SIZE, STORAGE_CACHE_DIR } from "./constants.mjs"; +import { getDocuments } from "./loader.mjs"; // Load environment variables from local .env file dotenv.config(); @@ -31,9 +26,7 @@ async function generateDatasource(serviceContext) { const storageContext = await storageContextFromDefaults({ persistDir: STORAGE_CACHE_DIR, }); - const documents = await new SimpleDirectoryReader().loadData({ - directoryPath: STORAGE_DIR, - }); + const documents = await getDocuments(); await VectorStoreIndex.fromDocuments(documents, { storageContext, serviceContext, diff --git a/templates/components/vectordbs/typescript/pg/generate.mjs b/templates/components/vectordbs/typescript/pg/generate.mjs index 3d959c69..ca8410bf 100644 --- a/templates/components/vectordbs/typescript/pg/generate.mjs +++ b/templates/components/vectordbs/typescript/pg/generate.mjs @@ -2,14 +2,14 @@ import * as dotenv from "dotenv"; import { PGVectorStore, - SimpleDirectoryReader, VectorStoreIndex, storageContextFromDefaults, } from "llamaindex"; +import { getDocuments } from "./loader.mjs"; import { + PGVECTOR_COLLECTION, PGVECTOR_SCHEMA, PGVECTOR_TABLE, - STORAGE_DIR, checkRequiredEnvVars, } from "./shared.mjs"; @@ -17,9 +17,7 @@ dotenv.config(); async function loadAndIndex() { // load objects from storage and convert them into LlamaIndex Document objects - const documents = await new SimpleDirectoryReader().loadData({ - directoryPath: STORAGE_DIR, - }); + const documents = await getDocuments(); // create postgres vector store const vectorStore = new PGVectorStore({ @@ -27,7 +25,7 @@ async function loadAndIndex() { schemaName: PGVECTOR_SCHEMA, tableName: PGVECTOR_TABLE, }); - vectorStore.setCollection(STORAGE_DIR); + vectorStore.setCollection(PGVECTOR_COLLECTION); vectorStore.clearCollection(); // create index from all the Documents diff --git a/templates/components/vectordbs/typescript/pg/shared.mjs b/templates/components/vectordbs/typescript/pg/shared.mjs index 8ad729c0..ba747934 100644 --- a/templates/components/vectordbs/typescript/pg/shared.mjs +++ b/templates/components/vectordbs/typescript/pg/shared.mjs @@ -1,4 +1,4 @@ -export const STORAGE_DIR = "./data"; +export const PGVECTOR_COLLECTION = "data"; export const CHUNK_SIZE = 512; export const CHUNK_OVERLAP = 20; export const PGVECTOR_SCHEMA = "public"; diff --git a/templates/components/vectordbs/typescript/pinecone/generate.mjs b/templates/components/vectordbs/typescript/pinecone/generate.mjs index b371a639..3e1fcaa0 100644 --- a/templates/components/vectordbs/typescript/pinecone/generate.mjs +++ b/templates/components/vectordbs/typescript/pinecone/generate.mjs @@ -2,19 +2,17 @@ import * as dotenv from "dotenv"; import { PineconeVectorStore, - SimpleDirectoryReader, VectorStoreIndex, storageContextFromDefaults, } from "llamaindex"; -import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs"; +import { getDocuments } from "./loader.mjs"; +import { checkRequiredEnvVars } from "./shared.mjs"; dotenv.config(); async function loadAndIndex() { // load objects from storage and convert them into LlamaIndex Document objects - const documents = await new SimpleDirectoryReader().loadData({ - directoryPath: STORAGE_DIR, - }); + const documents = await getDocuments(); // create vector store const vectorStore = new PineconeVectorStore(); diff --git a/templates/components/vectordbs/typescript/pinecone/shared.mjs b/templates/components/vectordbs/typescript/pinecone/shared.mjs index f9140261..ae2fd6b1 100644 --- a/templates/components/vectordbs/typescript/pinecone/shared.mjs +++ b/templates/components/vectordbs/typescript/pinecone/shared.mjs @@ -1,4 +1,3 @@ -export const STORAGE_DIR = "./data"; export const CHUNK_SIZE = 512; export const CHUNK_OVERLAP = 20; -- GitLab