Skip to content
Snippets Groups Projects
Unverified Commit c094b0c6 authored by Huu Le (Lee)'s avatar Huu Le (Lee) Committed by GitHub
Browse files

Use ingestion pipeline in Python code (#61)


---------
Co-authored-by: default avatarMarcus Schiesser <mail@marcusschiesser.de>
parent e2567ffc
No related branches found
No related tags found
No related merge requests found
Showing
with 51 additions and 226 deletions
---
"create-llama": patch
---
Use ingestion pipeline for Python
...@@ -27,10 +27,7 @@ def llama_parse_parser(): ...@@ -27,10 +27,7 @@ def llama_parse_parser():
def get_file_documents(config: FileLoaderConfig): def get_file_documents(config: FileLoaderConfig):
from llama_index.core.readers import SimpleDirectoryReader from llama_index.core.readers import SimpleDirectoryReader
reader = SimpleDirectoryReader( reader = SimpleDirectoryReader(config.data_dir, recursive=True, filename_as_id=True)
config.data_dir,
recursive=True,
)
if config.use_llama_parse: if config.use_llama_parse:
parser = llama_parse_parser() parser = llama_parse_parser()
reader.file_extractor = {".pdf": parser} reader.file_extractor = {".pdf": parser}
......
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.astra_db import AstraDBVectorStore
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents and store them in an AstraDB collection.

    Connection settings are read from the ASTRA_DB_APPLICATION_TOKEN,
    ASTRA_DB_ENDPOINT, ASTRA_DB_COLLECTION and EMBEDDING_DIM environment
    variables; a missing variable raises KeyError naming it.
    """
    init_settings()
    logger.info("Creating new index")
    # Load the source documents via the project loader.
    documents = get_documents()
    store = AstraDBVectorStore(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
        collection_name=os.environ["ASTRA_DB_COLLECTION"],
        embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
    )
    storage_context = StorageContext.from_defaults(vector_store=store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,  # this will show you a progress bar as the embeddings are created
    )
    # Plain string: the original used an f-string with no placeholders (F541).
    logger.info("Successfully created embeddings in the AstraDB")


if __name__ == "__main__":
    generate_datasource()
import logging
import os import os
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.astra_db import AstraDBVectorStore from llama_index.vector_stores.astra_db import AstraDBVectorStore
logger = logging.getLogger("uvicorn") def get_vector_store():
def get_index():
logger.info("Connecting to index from AstraDB...")
store = AstraDBVectorStore( store = AstraDBVectorStore(
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
api_endpoint=os.environ["ASTRA_DB_ENDPOINT"], api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
collection_name=os.environ["ASTRA_DB_COLLECTION"], collection_name=os.environ["ASTRA_DB_COLLECTION"],
embedding_dimension=int(os.environ["EMBEDDING_DIM"]), embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
) )
index = VectorStoreIndex.from_vector_store(store) return store
logger.info("Finished connecting to index from AstraDB.")
return index
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.milvus import MilvusVectorStore
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents and store them in a Milvus collection.

    MILVUS_ADDRESS and EMBEDDING_DIM are required (KeyError names the missing
    variable); MILVUS_USERNAME, MILVUS_PASSWORD and MILVUS_COLLECTION are
    optional and default to None when unset.
    """
    init_settings()
    logger.info("Creating new index")
    # load the documents and create the index
    documents = get_documents()
    store = MilvusVectorStore(
        uri=os.environ["MILVUS_ADDRESS"],
        user=os.getenv("MILVUS_USERNAME"),
        password=os.getenv("MILVUS_PASSWORD"),
        # NOTE(review): a None collection name presumably falls back to the
        # Milvus client default — confirm against MilvusVectorStore docs.
        collection_name=os.getenv("MILVUS_COLLECTION"),
        # os.environ (not getenv): int(None) would raise an opaque TypeError;
        # this matches the sibling AstraDB script's handling of EMBEDDING_DIM.
        dim=int(os.environ["EMBEDDING_DIM"]),
    )
    storage_context = StorageContext.from_defaults(vector_store=store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,  # this will show you a progress bar as the embeddings are created
    )
    # Plain string: the original used an f-string with no placeholders (F541).
    logger.info("Successfully created embeddings in the Milvus")


if __name__ == "__main__":
    generate_datasource()
import logging
import os import os
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.milvus import MilvusVectorStore from llama_index.vector_stores.milvus import MilvusVectorStore
logger = logging.getLogger("uvicorn") def get_vector_store():
def get_index():
logger.info("Connecting to index from Milvus...")
store = MilvusVectorStore( store = MilvusVectorStore(
uri=os.getenv("MILVUS_ADDRESS"), uri=os.environ["MILVUS_ADDRESS"],
user=os.getenv("MILVUS_USERNAME"), user=os.getenv("MILVUS_USERNAME"),
password=os.getenv("MILVUS_PASSWORD"), password=os.getenv("MILVUS_PASSWORD"),
collection_name=os.getenv("MILVUS_COLLECTION"), collection_name=os.getenv("MILVUS_COLLECTION"),
dim=int(os.getenv("EMBEDDING_DIM")), dim=int(os.getenv("EMBEDDING_DIM")),
) )
index = VectorStoreIndex.from_vector_store(store) return store
logger.info("Finished connecting to index from Milvus.")
return index
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents into a MongoDB Atlas vector collection.

    Reads MONGODB_DATABASE, MONGODB_VECTORS and MONGODB_VECTOR_INDEX from the
    environment; a missing variable raises KeyError naming it. Note that the
    Atlas vector search index itself must still be created in MongoDB's UI.
    """
    init_settings()
    logger.info("Creating new index")
    # Fetch the source documents through the project loader.
    docs = get_documents()
    vector_store = MongoDBAtlasVectorSearch(
        db_name=os.environ["MONGODB_DATABASE"],
        collection_name=os.environ["MONGODB_VECTORS"],
        index_name=os.environ["MONGODB_VECTOR_INDEX"],
    )
    ctx = StorageContext.from_defaults(vector_store=vector_store)
    # show_progress renders a progress bar while embeddings are computed.
    VectorStoreIndex.from_documents(docs, storage_context=ctx, show_progress=True)
    logger.info(
        f"Successfully created embeddings in the MongoDB collection {os.environ['MONGODB_VECTORS']}"
    )
    logger.info(
        """IMPORTANT: You can't query your index yet because you need to create a vector search index in MongoDB's UI now.
See https://github.com/run-llama/mongodb-demo/tree/main?tab=readme-ov-file#create-a-vector-search-index"""
    )


if __name__ == "__main__":
    generate_datasource()
import logging
import os import os
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
logger = logging.getLogger("uvicorn") def get_vector_store():
def get_index():
logger.info("Connecting to index from MongoDB...")
store = MongoDBAtlasVectorSearch( store = MongoDBAtlasVectorSearch(
db_name=os.environ["MONGODB_DATABASE"], db_name=os.environ["MONGODB_DATABASE"],
collection_name=os.environ["MONGODB_VECTORS"], collection_name=os.environ["MONGODB_VECTORS"],
index_name=os.environ["MONGODB_VECTOR_INDEX"], index_name=os.environ["MONGODB_VECTOR_INDEX"],
) )
index = VectorStoreIndex.from_vector_store(store) return store
logger.info("Finished connecting to index from MongoDB.")
return index
STORAGE_DIR = "storage" # directory to cache the generated index
from dotenv import load_dotenv
load_dotenv()
import logging
from llama_index.core.indices import (
VectorStoreIndex,
)
from app.engine.constants import STORAGE_DIR
from app.engine.loaders import get_documents
from app.settings import init_settings
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Build a vector index from the configured documents and persist it.

    The finished index is written to the local STORAGE_DIR so later runs can
    reload it instead of re-embedding.
    """
    init_settings()
    logger.info("Creating new index")
    # Load the source documents and embed them into an in-memory index.
    docs = get_documents()
    new_index = VectorStoreIndex.from_documents(docs)
    # Persist to disk so the index survives the process.
    new_index.storage_context.persist(STORAGE_DIR)
    logger.info(f"Finished creating new index. Stored in {STORAGE_DIR}")


if __name__ == "__main__":
    generate_datasource()
import logging
import os
from app.engine.constants import STORAGE_DIR
from llama_index.core.storage import StorageContext
from llama_index.core.indices import load_index_from_storage
logger = logging.getLogger("uvicorn")
def get_index():
    """Load the persisted index from STORAGE_DIR, or return None if absent."""
    # Nothing generated yet — signal the caller with None.
    if not os.path.exists(STORAGE_DIR):
        return None
    logger.info(f"Loading index from {STORAGE_DIR}...")
    ctx = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
    loaded = load_index_from_storage(ctx)
    logger.info(f"Finished loading index from {STORAGE_DIR}")
    return loaded
import os
from llama_index.core.vector_stores import SimpleVectorStore
from app.constants import STORAGE_DIR
def get_vector_store():
    """Return a SimpleVectorStore, restored from STORAGE_DIR when it exists."""
    store = (
        SimpleVectorStore.from_persist_dir(STORAGE_DIR)
        if os.path.exists(STORAGE_DIR)
        else SimpleVectorStore()
    )
    # NOTE(review): stores_text=True presumably marks the store as keeping
    # node text alongside embeddings — confirm against SimpleVectorStore docs.
    store.stores_text = True
    return store
PGVECTOR_SCHEMA = "public"
PGVECTOR_TABLE = "llamaindex_embedding"
\ No newline at end of file
from dotenv import load_dotenv
load_dotenv()
import logging
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.storage import StorageContext
from app.engine.loaders import get_documents
from app.settings import init_settings
from app.engine.utils import init_pg_vector_store_from_env
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents and write them to a Postgres pgvector store.

    Connection details come from init_pg_vector_store_from_env (driven by the
    PG_CONNECTION_STRING environment variable).
    """
    init_settings()
    logger.info("Creating new index")
    # Fetch the source documents through the project loader.
    docs = get_documents()
    store = init_pg_vector_store_from_env()
    ctx = StorageContext.from_defaults(vector_store=store)
    # show_progress renders a progress bar while embeddings are computed.
    VectorStoreIndex.from_documents(docs, storage_context=ctx, show_progress=True)
    logger.info(
        f"Successfully created embeddings in the PG vector store, schema={store.schema_name} table={store.table_name}"
    )


if __name__ == "__main__":
    generate_datasource()
import os import os
from llama_index.vector_stores.postgres import PGVectorStore from llama_index.vector_stores.postgres import PGVectorStore
from urllib.parse import urlparse from urllib.parse import urlparse
from app.engine.constants import PGVECTOR_SCHEMA, PGVECTOR_TABLE
STORAGE_DIR = "storage"
PGVECTOR_SCHEMA = "public"
PGVECTOR_TABLE = "llamaindex_embedding"
def init_pg_vector_store_from_env():
def get_vector_store():
original_conn_string = os.environ.get("PG_CONNECTION_STRING") original_conn_string = os.environ.get("PG_CONNECTION_STRING")
if original_conn_string is None or original_conn_string == "": if original_conn_string is None or original_conn_string == "":
raise ValueError("PG_CONNECTION_STRING environment variable is not set.") raise ValueError("PG_CONNECTION_STRING environment variable is not set.")
...@@ -24,4 +27,5 @@ def init_pg_vector_store_from_env(): ...@@ -24,4 +27,5 @@ def init_pg_vector_store_from_env():
async_connection_string=async_conn_string, async_connection_string=async_conn_string,
schema_name=PGVECTOR_SCHEMA, schema_name=PGVECTOR_SCHEMA,
table_name=PGVECTOR_TABLE, table_name=PGVECTOR_TABLE,
embed_dim=int(os.environ.get("EMBEDDING_DIM", 768)),
) )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment