From 38bc1d1350c4b7b22d8ca7dd67d0b800002b9204 Mon Sep 17 00:00:00 2001
From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com>
Date: Tue, 14 May 2024 18:58:07 +0700
Subject: [PATCH] Use ingestion pipeline for dedicated vector stores (#74)

---
 helpers/env-variables.ts                      |  4 +-
 templates/components/loaders/python/file.py   | 36 +++++++--
 .../vectordbs/python/astra/__init__.py        |  0
 .../vectordbs/python/astra/generate.py        | 37 ---------
 .../vectordbs/python/astra/index.py           | 21 -----
 .../vectordbs/python/astra/vectordb.py        | 20 +++++
 .../vectordbs/python/milvus/__init__.py       |  0
 .../vectordbs/python/milvus/generate.py       | 39 ---------
 .../vectordbs/python/milvus/index.py          | 22 -----
 .../vectordbs/python/milvus/vectordb.py       | 20 +++++
 .../vectordbs/python/mongo/__init__.py        |  0
 .../vectordbs/python/mongo/generate.py        | 43 ----------
 .../vectordbs/python/mongo/index.py           | 20 -----
 .../vectordbs/python/mongo/vectordb.py        | 20 +++++
 .../vectordbs/python/none/constants.py        |  1 -
 .../vectordbs/python/none/generate.py         |  7 +-
 .../components/vectordbs/python/none/index.py | 10 +--
 .../vectordbs/python/pg/__init__.py           |  0
 .../vectordbs/python/pg/constants.py          |  2 -
 .../vectordbs/python/pg/generate.py           | 35 --------
 .../components/vectordbs/python/pg/index.py   | 13 ---
 .../python/pg/{utils.py => vectordb.py}       |  7 +-
 .../vectordbs/python/pinecone/__init__.py     |  0
 .../vectordbs/python/pinecone/generate.py     | 39 ---------
 .../vectordbs/python/pinecone/index.py        | 20 -----
 .../vectordbs/python/pinecone/vectordb.py     | 19 +++++
 .../vectordbs/python/qdrant/__init__.py       |  0
 .../vectordbs/python/qdrant/generate.py       | 37 ---------
 .../vectordbs/python/qdrant/index.py          | 20 -----
 .../vectordbs/python/qdrant/vectordb.py       | 19 +++++
 .../vectordbs/typescript/mongo/generate.ts    |  2 +-
 .../vectordbs/typescript/mongo/shared.ts      |  2 +-
 .../streaming/fastapi/app/engine/generate.py  | 80 +++++++++++++++++++
 .../streaming/fastapi/app/engine/index.py     | 17 ++++
 34 files changed, 241 insertions(+), 371 deletions(-)
 delete mode 100644 templates/components/vectordbs/python/astra/__init__.py
 delete mode 100644 templates/components/vectordbs/python/astra/generate.py
 delete mode 100644 templates/components/vectordbs/python/astra/index.py
 create mode 100644 templates/components/vectordbs/python/astra/vectordb.py
 delete mode 100644 templates/components/vectordbs/python/milvus/__init__.py
 delete mode 100644 templates/components/vectordbs/python/milvus/generate.py
 delete mode 100644 templates/components/vectordbs/python/milvus/index.py
 create mode 100644 templates/components/vectordbs/python/milvus/vectordb.py
 delete mode 100644 templates/components/vectordbs/python/mongo/__init__.py
 delete mode 100644 templates/components/vectordbs/python/mongo/generate.py
 delete mode 100644 templates/components/vectordbs/python/mongo/index.py
 create mode 100644 templates/components/vectordbs/python/mongo/vectordb.py
 delete mode 100644 templates/components/vectordbs/python/none/constants.py
 delete mode 100644 templates/components/vectordbs/python/pg/__init__.py
 delete mode 100644 templates/components/vectordbs/python/pg/constants.py
 delete mode 100644 templates/components/vectordbs/python/pg/generate.py
 delete mode 100644 templates/components/vectordbs/python/pg/index.py
 rename templates/components/vectordbs/python/pg/{utils.py => vectordb.py} (86%)
 delete mode 100644 templates/components/vectordbs/python/pinecone/__init__.py
 delete mode 100644 templates/components/vectordbs/python/pinecone/generate.py
 delete mode 100644 templates/components/vectordbs/python/pinecone/index.py
 create mode 100644 templates/components/vectordbs/python/pinecone/vectordb.py
 delete mode 100644 templates/components/vectordbs/python/qdrant/__init__.py
 delete mode 100644 templates/components/vectordbs/python/qdrant/generate.py
 delete mode 100644 templates/components/vectordbs/python/qdrant/index.py
 create mode 100644 templates/components/vectordbs/python/qdrant/vectordb.py
 create mode 100644 templates/types/streaming/fastapi/app/engine/generate.py
 create mode 100644 templates/types/streaming/fastapi/app/engine/index.py

diff --git a/helpers/env-variables.ts b/helpers/env-variables.ts
index 2d69f227..46d1f94e 100644
--- a/helpers/env-variables.ts
+++ b/helpers/env-variables.ts
@@ -40,9 +40,9 @@ const getVectorDBEnvs = (
     case "mongo":
       return [
         {
-          name: "MONGO_URI",
+          name: "MONGODB_URI",
           description:
-            "For generating a connection URI, see https://docs.timescale.com/use-timescale/latest/services/create-a-service\nThe MongoDB connection URI.",
+            "For generating a connection URI, see https://www.mongodb.com/docs/manual/reference/connection-string/ \nThe MongoDB connection URI.",
         },
         {
           name: "MONGODB_DATABASE",
diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py
index a814b0d0..95b5fd2f 100644
--- a/templates/components/loaders/python/file.py
+++ b/templates/components/loaders/python/file.py
@@ -1,7 +1,10 @@
 import os
+import logging
 from llama_parse import LlamaParse
 from pydantic import BaseModel, validator
 
+logger = logging.getLogger(__name__)
+
 
 class FileLoaderConfig(BaseModel):
     data_dir: str = "data"
@@ -27,11 +30,28 @@ def llama_parse_parser():
 def get_file_documents(config: FileLoaderConfig):
     from llama_index.core.readers import SimpleDirectoryReader
 
-    reader = SimpleDirectoryReader(
-        config.data_dir,
-        recursive=True,
-    )
-    if config.use_llama_parse:
-        parser = llama_parse_parser()
-        reader.file_extractor = {".pdf": parser}
-    return reader.load_data()
+    try:
+        reader = SimpleDirectoryReader(
+            config.data_dir,
+            recursive=True,
+            filename_as_id=True,
+        )
+        if config.use_llama_parse:
+            parser = llama_parse_parser()
+            reader.file_extractor = {".pdf": parser}
+        return reader.load_data()
+    except ValueError as e:
+        import sys, traceback
+
+        # Catch the error if the data dir is empty
+        # and return as empty document list
+        _, _, exc_traceback = sys.exc_info()
+        function_name = traceback.extract_tb(exc_traceback)[-1].name
+        if function_name == "_add_files":
+            logger.warning(
+                f"Failed to load file documents, error message: {e} . Return as empty document list."
+            )
+            return []
+        else:
+            # Raise the error if it is not the case of empty data dir
+            raise e
diff --git a/templates/components/vectordbs/python/astra/__init__.py b/templates/components/vectordbs/python/astra/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/templates/components/vectordbs/python/astra/generate.py b/templates/components/vectordbs/python/astra/generate.py
deleted file mode 100644
index 4d2a54af..00000000
--- a/templates/components/vectordbs/python/astra/generate.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import os
-import logging
-from llama_index.core.storage import StorageContext
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.astra_db import AstraDBVectorStore
-from app.settings import init_settings
-from app.engine.loaders import get_documents
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-    documents = get_documents()
-    store = AstraDBVectorStore(
-        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
-        api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
-        collection_name=os.environ["ASTRA_DB_COLLECTION"],
-        embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
-    )
-    storage_context = StorageContext.from_defaults(vector_store=store)
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        show_progress=True,  # this will show you a progress bar as the embeddings are created
-    )
-    logger.info(f"Successfully created embeddings in the AstraDB")
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/components/vectordbs/python/astra/index.py b/templates/components/vectordbs/python/astra/index.py
deleted file mode 100644
index b1389f76..00000000
--- a/templates/components/vectordbs/python/astra/index.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import logging
-import os
-
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.astra_db import AstraDBVectorStore
-
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Connecting to index from AstraDB...")
-    store = AstraDBVectorStore(
-        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
-        api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
-        collection_name=os.environ["ASTRA_DB_COLLECTION"],
-        embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
-    )
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Finished connecting to index from AstraDB.")
-    return index
diff --git a/templates/components/vectordbs/python/astra/vectordb.py b/templates/components/vectordbs/python/astra/vectordb.py
new file mode 100644
index 00000000..f84b329e
--- /dev/null
+++ b/templates/components/vectordbs/python/astra/vectordb.py
@@ -0,0 +1,20 @@
+import os
+from llama_index.vector_stores.astra_db import AstraDBVectorStore
+
+
+def get_vector_store():
+    endpoint = os.getenv("ASTRA_DB_ENDPOINT")
+    token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
+    collection = os.getenv("ASTRA_DB_COLLECTION")
+    if not endpoint or not token or not collection:
+        raise ValueError(
+            "Please config ASTRA_DB_ENDPOINT, ASTRA_DB_APPLICATION_TOKEN and ASTRA_DB_COLLECTION"
+            " to your environment variables or config them in the .env file"
+        )
+    store = AstraDBVectorStore(
+        token=token,
+        api_endpoint=endpoint,
+        collection_name=collection,
+        embedding_dimension=int(os.getenv("EMBEDDING_DIM")),
+    )
+    return store
diff --git a/templates/components/vectordbs/python/milvus/__init__.py b/templates/components/vectordbs/python/milvus/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/templates/components/vectordbs/python/milvus/generate.py b/templates/components/vectordbs/python/milvus/generate.py
deleted file mode 100644
index b5bfc9f9..00000000
--- a/templates/components/vectordbs/python/milvus/generate.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import os
-import logging
-from llama_index.core.storage import StorageContext
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.milvus import MilvusVectorStore
-from app.settings import init_settings
-from app.engine.loaders import get_documents
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-    # load the documents and create the index
-    documents = get_documents()
-    store = MilvusVectorStore(
-        uri=os.environ["MILVUS_ADDRESS"],
-        user=os.getenv("MILVUS_USERNAME"),
-        password=os.getenv("MILVUS_PASSWORD"),
-        collection_name=os.getenv("MILVUS_COLLECTION"),
-        dim=int(os.getenv("EMBEDDING_DIM")),
-    )
-    storage_context = StorageContext.from_defaults(vector_store=store)
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        show_progress=True,  # this will show you a progress bar as the embeddings are created
-    )
-    logger.info(f"Successfully created embeddings in the Milvus")
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/components/vectordbs/python/milvus/index.py b/templates/components/vectordbs/python/milvus/index.py
deleted file mode 100644
index ffd87e63..00000000
--- a/templates/components/vectordbs/python/milvus/index.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import logging
-import os
-
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.milvus import MilvusVectorStore
-
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Connecting to index from Milvus...")
-    store = MilvusVectorStore(
-        uri=os.getenv("MILVUS_ADDRESS"),
-        user=os.getenv("MILVUS_USERNAME"),
-        password=os.getenv("MILVUS_PASSWORD"),
-        collection_name=os.getenv("MILVUS_COLLECTION"),
-        dim=int(os.getenv("EMBEDDING_DIM")),
-    )
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Finished connecting to index from Milvus.")
-    return index
diff --git a/templates/components/vectordbs/python/milvus/vectordb.py b/templates/components/vectordbs/python/milvus/vectordb.py
new file mode 100644
index 00000000..7da817c9
--- /dev/null
+++ b/templates/components/vectordbs/python/milvus/vectordb.py
@@ -0,0 +1,20 @@
+import os
+from llama_index.vector_stores.milvus import MilvusVectorStore
+
+
+def get_vector_store():
+    address = os.getenv("MILVUS_ADDRESS")
+    collection = os.getenv("MILVUS_COLLECTION")
+    if not address or not collection:
+        raise ValueError(
+            "Please set MILVUS_ADDRESS and MILVUS_COLLECTION to your environment variables"
+            " or config them in the .env file"
+        )
+    store = MilvusVectorStore(
+        uri=address,
+        user=os.getenv("MILVUS_USERNAME"),
+        password=os.getenv("MILVUS_PASSWORD"),
+        collection_name=collection,
+        dim=int(os.getenv("EMBEDDING_DIM")),
+    )
+    return store
diff --git a/templates/components/vectordbs/python/mongo/__init__.py b/templates/components/vectordbs/python/mongo/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/templates/components/vectordbs/python/mongo/generate.py b/templates/components/vectordbs/python/mongo/generate.py
deleted file mode 100644
index abe844c0..00000000
--- a/templates/components/vectordbs/python/mongo/generate.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import os
-import logging
-from llama_index.core.storage import StorageContext
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
-from app.settings import init_settings
-from app.engine.loaders import get_documents
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-    # load the documents and create the index
-    documents = get_documents()
-    store = MongoDBAtlasVectorSearch(
-        db_name=os.environ["MONGODB_DATABASE"],
-        collection_name=os.environ["MONGODB_VECTORS"],
-        index_name=os.environ["MONGODB_VECTOR_INDEX"],
-    )
-    storage_context = StorageContext.from_defaults(vector_store=store)
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        show_progress=True,  # this will show you a progress bar as the embeddings are created
-    )
-    logger.info(
-        f"Successfully created embeddings in the MongoDB collection {os.environ['MONGODB_VECTORS']}"
-    )
-    logger.info(
-        """IMPORTANT: You can't query your index yet because you need to create a vector search index in MongoDB's UI now.
-See https://github.com/run-llama/mongodb-demo/tree/main?tab=readme-ov-file#create-a-vector-search-index"""
-    )
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/components/vectordbs/python/mongo/index.py b/templates/components/vectordbs/python/mongo/index.py
deleted file mode 100644
index 6dba7c1d..00000000
--- a/templates/components/vectordbs/python/mongo/index.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import logging
-import os
-
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
-
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Connecting to index from MongoDB...")
-    store = MongoDBAtlasVectorSearch(
-        db_name=os.environ["MONGODB_DATABASE"],
-        collection_name=os.environ["MONGODB_VECTORS"],
-        index_name=os.environ["MONGODB_VECTOR_INDEX"],
-    )
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Finished connecting to index from MongoDB.")
-    return index
diff --git a/templates/components/vectordbs/python/mongo/vectordb.py b/templates/components/vectordbs/python/mongo/vectordb.py
new file mode 100644
index 00000000..4807abe7
--- /dev/null
+++ b/templates/components/vectordbs/python/mongo/vectordb.py
@@ -0,0 +1,20 @@
+import os
+from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
+
+
+def get_vector_store():
+    db_uri = os.getenv("MONGODB_URI")
+    db_name = os.getenv("MONGODB_DATABASE")
+    collection_name = os.getenv("MONGODB_VECTORS")
+    index_name = os.getenv("MONGODB_VECTOR_INDEX")
+    if not db_uri or not db_name or not collection_name or not index_name:
+        raise ValueError(
+            "Please set MONGODB_URI, MONGODB_DATABASE, MONGODB_VECTORS, and MONGODB_VECTOR_INDEX"
+            " to your environment variables or config them in .env file"
+        )
+    store = MongoDBAtlasVectorSearch(
+        db_name=db_name,
+        collection_name=collection_name,
+        index_name=index_name,
+    )
+    return store
diff --git a/templates/components/vectordbs/python/none/constants.py b/templates/components/vectordbs/python/none/constants.py
deleted file mode 100644
index 254998eb..00000000
--- a/templates/components/vectordbs/python/none/constants.py
+++ /dev/null
@@ -1 +0,0 @@
-STORAGE_DIR = "storage"  # directory to cache the generated index
diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py
index e38d89cb..cda55b10 100644
--- a/templates/components/vectordbs/python/none/generate.py
+++ b/templates/components/vectordbs/python/none/generate.py
@@ -2,11 +2,11 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
+import os
 import logging
 from llama_index.core.indices import (
     VectorStoreIndex,
 )
-from app.engine.constants import STORAGE_DIR
 from app.engine.loaders import get_documents
 from app.settings import init_settings
 
@@ -18,14 +18,15 @@ logger = logging.getLogger()
 def generate_datasource():
     init_settings()
     logger.info("Creating new index")
+    storage_dir = os.environ.get("STORAGE_DIR", "storage")
     # load the documents and create the index
     documents = get_documents()
     index = VectorStoreIndex.from_documents(
         documents,
     )
     # store it for later
-    index.storage_context.persist(STORAGE_DIR)
-    logger.info(f"Finished creating new index. Stored in {STORAGE_DIR}")
+    index.storage_context.persist(storage_dir)
+    logger.info(f"Finished creating new index. Stored in {storage_dir}")
 
 
 if __name__ == "__main__":
diff --git a/templates/components/vectordbs/python/none/index.py b/templates/components/vectordbs/python/none/index.py
index 8b77414a..7e9482c8 100644
--- a/templates/components/vectordbs/python/none/index.py
+++ b/templates/components/vectordbs/python/none/index.py
@@ -1,7 +1,6 @@
 import logging
 import os
 
-from app.engine.constants import STORAGE_DIR
 from llama_index.core.storage import StorageContext
 from llama_index.core.indices import load_index_from_storage
 
@@ -9,12 +8,13 @@ logger = logging.getLogger("uvicorn")
 
 
 def get_index():
+    storage_dir = os.getenv("STORAGE_DIR", "storage")
     # check if storage already exists
-    if not os.path.exists(STORAGE_DIR):
+    if not os.path.exists(storage_dir):
         return None
     # load the existing index
-    logger.info(f"Loading index from {STORAGE_DIR}...")
-    storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
+    logger.info(f"Loading index from {storage_dir}...")
+    storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
     index = load_index_from_storage(storage_context)
-    logger.info(f"Finished loading index from {STORAGE_DIR}")
+    logger.info(f"Finished loading index from {storage_dir}")
     return index
diff --git a/templates/components/vectordbs/python/pg/__init__.py b/templates/components/vectordbs/python/pg/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/templates/components/vectordbs/python/pg/constants.py b/templates/components/vectordbs/python/pg/constants.py
deleted file mode 100644
index a4ebd918..00000000
--- a/templates/components/vectordbs/python/pg/constants.py
+++ /dev/null
@@ -1,2 +0,0 @@
-PGVECTOR_SCHEMA = "public"
-PGVECTOR_TABLE = "llamaindex_embedding"
\ No newline at end of file
diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py
deleted file mode 100644
index 79fa3bd7..00000000
--- a/templates/components/vectordbs/python/pg/generate.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import logging
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.core.storage import StorageContext
-
-from app.engine.loaders import get_documents
-from app.settings import init_settings
-from app.engine.utils import init_pg_vector_store_from_env
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-    # load the documents and create the index
-    documents = get_documents()
-    store = init_pg_vector_store_from_env()
-    storage_context = StorageContext.from_defaults(vector_store=store)
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        show_progress=True,  # this will show you a progress bar as the embeddings are created
-    )
-    logger.info(
-        f"Successfully created embeddings in the PG vector store, schema={store.schema_name} table={store.table_name}"
-    )
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/components/vectordbs/python/pg/index.py b/templates/components/vectordbs/python/pg/index.py
deleted file mode 100644
index 3c4f3180..00000000
--- a/templates/components/vectordbs/python/pg/index.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import logging
-from llama_index.core.indices.vector_store import VectorStoreIndex
-from app.engine.utils import init_pg_vector_store_from_env
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Connecting to index from PGVector...")
-    store = init_pg_vector_store_from_env()
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Finished connecting to index from PGVector.")
-    return index
diff --git a/templates/components/vectordbs/python/pg/utils.py b/templates/components/vectordbs/python/pg/vectordb.py
similarity index 86%
rename from templates/components/vectordbs/python/pg/utils.py
rename to templates/components/vectordbs/python/pg/vectordb.py
index 39127846..f7e0c11a 100644
--- a/templates/components/vectordbs/python/pg/utils.py
+++ b/templates/components/vectordbs/python/pg/vectordb.py
@@ -1,10 +1,12 @@
 import os
 from llama_index.vector_stores.postgres import PGVectorStore
 from urllib.parse import urlparse
-from app.engine.constants import PGVECTOR_SCHEMA, PGVECTOR_TABLE
 
+PGVECTOR_SCHEMA = "public"
+PGVECTOR_TABLE = "llamaindex_embedding"
 
-def init_pg_vector_store_from_env():
+
+def get_vector_store():
     original_conn_string = os.environ.get("PG_CONNECTION_STRING")
     if original_conn_string is None or original_conn_string == "":
         raise ValueError("PG_CONNECTION_STRING environment variable is not set.")
@@ -24,4 +26,5 @@
         async_connection_string=async_conn_string,
         schema_name=PGVECTOR_SCHEMA,
         table_name=PGVECTOR_TABLE,
+        embed_dim=int(os.environ.get("EMBEDDING_DIM", 1024)),
     )
diff --git a/templates/components/vectordbs/python/pinecone/__init__.py b/templates/components/vectordbs/python/pinecone/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/templates/components/vectordbs/python/pinecone/generate.py b/templates/components/vectordbs/python/pinecone/generate.py
deleted file mode 100644
index 5f233ba2..00000000
--- a/templates/components/vectordbs/python/pinecone/generate.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import os
-import logging
-from llama_index.core.storage import StorageContext
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.pinecone import PineconeVectorStore
-from app.settings import init_settings
-from app.engine.loaders import get_documents
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-    # load the documents and create the index
-    documents = get_documents()
-    store = PineconeVectorStore(
-        api_key=os.environ["PINECONE_API_KEY"],
-        index_name=os.environ["PINECONE_INDEX_NAME"],
-        environment=os.environ["PINECONE_ENVIRONMENT"],
-    )
-    storage_context = StorageContext.from_defaults(vector_store=store)
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        show_progress=True,  # this will show you a progress bar as the embeddings are created
-    )
-    logger.info(
-        f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}"
-    )
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/components/vectordbs/python/pinecone/index.py b/templates/components/vectordbs/python/pinecone/index.py
deleted file mode 100644
index 98824ffd..00000000
--- a/templates/components/vectordbs/python/pinecone/index.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import logging
-import os
-
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.pinecone import PineconeVectorStore
-
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Connecting to index from Pinecone...")
-    store = PineconeVectorStore(
-        api_key=os.environ["PINECONE_API_KEY"],
-        index_name=os.environ["PINECONE_INDEX_NAME"],
-        environment=os.environ["PINECONE_ENVIRONMENT"],
-    )
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Finished connecting to index from Pinecone.")
-    return index
diff --git a/templates/components/vectordbs/python/pinecone/vectordb.py b/templates/components/vectordbs/python/pinecone/vectordb.py
new file mode 100644
index 00000000..a1ddbdc7
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/vectordb.py
@@ -0,0 +1,19 @@
+import os
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+
+
+def get_vector_store():
+    api_key = os.getenv("PINECONE_API_KEY")
+    index_name = os.getenv("PINECONE_INDEX_NAME")
+    environment = os.getenv("PINECONE_ENVIRONMENT")
+    if not api_key or not index_name or not environment:
+        raise ValueError(
+            "Please set PINECONE_API_KEY, PINECONE_INDEX_NAME, and PINECONE_ENVIRONMENT"
+            " to your environment variables or config them in the .env file"
+        )
+    store = PineconeVectorStore(
+        api_key=api_key,
+        index_name=index_name,
+        environment=environment,
+    )
+    return store
diff --git a/templates/components/vectordbs/python/qdrant/__init__.py b/templates/components/vectordbs/python/qdrant/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/templates/components/vectordbs/python/qdrant/generate.py b/templates/components/vectordbs/python/qdrant/generate.py
deleted file mode 100644
index db7c055e..00000000
--- a/templates/components/vectordbs/python/qdrant/generate.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import logging
-import os
-from app.engine.loaders import get_documents
-from app.settings import init_settings
-from dotenv import load_dotenv
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.core.storage import StorageContext
-from llama_index.vector_stores.qdrant import QdrantVectorStore
-load_dotenv()
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index with Qdrant")
-    # load the documents and create the index
-    documents = get_documents()
-    store = QdrantVectorStore(
-        collection_name=os.getenv("QDRANT_COLLECTION"),
-        url=os.getenv("QDRANT_URL"),
-        api_key=os.getenv("QDRANT_API_KEY"),
-    )
-    storage_context = StorageContext.from_defaults(vector_store=store)
-    VectorStoreIndex.from_documents(
-        documents,
-        storage_context=storage_context,
-        show_progress=True,  # this will show you a progress bar as the embeddings are created
-    )
-    logger.info(
-        f"Successfully uploaded documents to the {os.getenv('QDRANT_COLLECTION')} collection."
-    )
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/components/vectordbs/python/qdrant/index.py b/templates/components/vectordbs/python/qdrant/index.py
deleted file mode 100644
index 0a388d8a..00000000
--- a/templates/components/vectordbs/python/qdrant/index.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import logging
-import os
-
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.vector_stores.qdrant import QdrantVectorStore
-
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Connecting to Qdrant collection..")
-    store = QdrantVectorStore(
-        collection_name=os.getenv("QDRANT_COLLECTION"),
-        url=os.getenv("QDRANT_URL"),
-        api_key=os.getenv("QDRANT_API_KEY"),
-    )
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Finished connecting to Qdrant collection.")
-    return index
diff --git a/templates/components/vectordbs/python/qdrant/vectordb.py b/templates/components/vectordbs/python/qdrant/vectordb.py
new file mode 100644
index 00000000..0f7914d7
--- /dev/null
+++ b/templates/components/vectordbs/python/qdrant/vectordb.py
@@ -0,0 +1,19 @@
+import os
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+
+
+def get_vector_store():
+    collection_name = os.getenv("QDRANT_COLLECTION")
+    url = os.getenv("QDRANT_URL")
+    api_key = os.getenv("QDRANT_API_KEY")
+    if not collection_name or not url:
+        raise ValueError(
+            "Please set QDRANT_COLLECTION, QDRANT_URL"
+            " to your environment variables or config them in the .env file"
+        )
+    store = QdrantVectorStore(
+        collection_name=collection_name,
+        url=url,
+        api_key=api_key,
+    )
+    return store
diff --git a/templates/components/vectordbs/typescript/mongo/generate.ts b/templates/components/vectordbs/typescript/mongo/generate.ts
index 40c90172..deee3183 100644
--- a/templates/components/vectordbs/typescript/mongo/generate.ts
+++ b/templates/components/vectordbs/typescript/mongo/generate.ts
@@ -9,7 +9,7 @@ import { checkRequiredEnvVars } from "./shared";
 
 dotenv.config();
 
-const mongoUri = process.env.MONGO_URI!;
+const mongoUri = process.env.MONGODB_URI!;
 const databaseName = process.env.MONGODB_DATABASE!;
 const vectorCollectionName = process.env.MONGODB_VECTORS!;
 const indexName = process.env.MONGODB_VECTOR_INDEX;
diff --git a/templates/components/vectordbs/typescript/mongo/shared.ts b/templates/components/vectordbs/typescript/mongo/shared.ts
index d6532a56..c6b5f303 100644
--- a/templates/components/vectordbs/typescript/mongo/shared.ts
+++ b/templates/components/vectordbs/typescript/mongo/shared.ts
@@ -1,5 +1,5 @@
 const REQUIRED_ENV_VARS = [
-  "MONGO_URI",
+  "MONGODB_URI",
   "MONGODB_DATABASE",
   "MONGODB_VECTORS",
   "MONGODB_VECTOR_INDEX",
diff --git a/templates/types/streaming/fastapi/app/engine/generate.py b/templates/types/streaming/fastapi/app/engine/generate.py
new file mode 100644
index 00000000..07588bd6
--- /dev/null
+++ b/templates/types/streaming/fastapi/app/engine/generate.py
@@ -0,0 +1,80 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+import logging
+from llama_index.core.settings import Settings
+from llama_index.core.ingestion import IngestionPipeline
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core.storage.docstore import SimpleDocumentStore
+from llama_index.core.storage import StorageContext
+from app.settings import init_settings
+from app.engine.loaders import get_documents
+from app.engine.vectordb import get_vector_store
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+STORAGE_DIR = os.getenv("STORAGE_DIR", "storage")
+
+
+def get_doc_store():
+
+    # If the storage directory is there, load the document store from it.
+    # If not, set up an in-memory document store since we can't load from a directory that doesn't exist.
+    if os.path.exists(STORAGE_DIR):
+        return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
+    else:
+        return SimpleDocumentStore()
+
+
+def run_pipeline(docstore, vector_store, documents):
+    pipeline = IngestionPipeline(
+        transformations=[
+            SentenceSplitter(
+                chunk_size=Settings.chunk_size,
+                chunk_overlap=Settings.chunk_overlap,
+            ),
+            Settings.embed_model,
+        ],
+        docstore=docstore,
+        docstore_strategy="upserts_and_delete",
+        vector_store=vector_store,
+    )
+
+    # Run the ingestion pipeline and store the results
+    nodes = pipeline.run(show_progress=True, documents=documents)
+
+    return nodes
+
+
+def persist_storage(docstore, vector_store):
+    storage_context = StorageContext.from_defaults(
+        docstore=docstore,
+        vector_store=vector_store,
+    )
+    storage_context.persist(STORAGE_DIR)
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Generate index for the provided data")
+
+    # Get the stores and documents or create new ones
+    documents = get_documents()
+    docstore = get_doc_store()
+    vector_store = get_vector_store()
+
+    # Run the ingestion pipeline
+    _ = run_pipeline(docstore, vector_store, documents)
+
+    # Build the index and persist storage
+    persist_storage(docstore, vector_store)
+
+    logger.info("Finished generating the index")
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/types/streaming/fastapi/app/engine/index.py b/templates/types/streaming/fastapi/app/engine/index.py
new file mode 100644
index 00000000..2dbc589b
--- /dev/null
+++ b/templates/types/streaming/fastapi/app/engine/index.py
@@ -0,0 +1,17 @@
+import logging
+from llama_index.core.indices import VectorStoreIndex
+from app.engine.vectordb import get_vector_store
+
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting vector store...")
+    store = get_vector_store()
+    # Load the index from the vector store
+    # If you are using a vector store that doesn't store text,
+    # you must load the index from both the vector store and the document store
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished load index from vector store.")
+    return index
-- 
GitLab