Skip to content
Snippets Groups Projects
Unverified Commit c094b0c6 authored by Huu Le (Lee)'s avatar Huu Le (Lee) Committed by GitHub
Browse files

Use ingestion pipeline in Python code (#61)


---------
Co-authored-by: default avatarMarcus Schiesser <mail@marcusschiesser.de>
parent e2567ffc
No related branches found
No related tags found
No related merge requests found
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed all loaded documents and store them in the Pinecone index.

    Reads PINECONE_API_KEY / PINECONE_INDEX_NAME / PINECONE_ENVIRONMENT
    from the environment; raises KeyError if any is missing.
    """
    init_settings()
    logger.info("Creating new index")
    # Load the source documents before touching the vector store.
    docs = get_documents()
    vector_store = PineconeVectorStore(
        api_key=os.environ["PINECONE_API_KEY"],
        index_name=os.environ["PINECONE_INDEX_NAME"],
        environment=os.environ["PINECONE_ENVIRONMENT"],
    )
    ctx = StorageContext.from_defaults(vector_store=vector_store)
    # Embeddings are computed here; the progress bar tracks their creation.
    VectorStoreIndex.from_documents(
        docs,
        storage_context=ctx,
        show_progress=True,
    )
    logger.info(
        f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}"
    )


if __name__ == "__main__":
    generate_datasource()
import logging
import os

from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore

logger = logging.getLogger("uvicorn")


def get_vector_store():
    """Construct a PineconeVectorStore from environment variables.

    Reads PINECONE_API_KEY / PINECONE_INDEX_NAME / PINECONE_ENVIRONMENT;
    raises KeyError if any is unset. Returns the configured store so
    callers (index loading and ingestion) share one construction path.
    """
    store = PineconeVectorStore(
        api_key=os.environ["PINECONE_API_KEY"],
        index_name=os.environ["PINECONE_INDEX_NAME"],
        environment=os.environ["PINECONE_ENVIRONMENT"],
    )
    return store
import logging
import os
from app.engine.loaders import get_documents
from app.settings import init_settings
from dotenv import load_dotenv
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.storage import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the loaded documents and upload them to the Qdrant collection.

    Connection settings come from the QDRANT_COLLECTION / QDRANT_URL /
    QDRANT_API_KEY environment variables (missing ones pass as None).
    """
    init_settings()
    logger.info("Creating new index with Qdrant")
    # Pull in the raw documents first.
    docs = get_documents()
    vector_store = QdrantVectorStore(
        collection_name=os.getenv("QDRANT_COLLECTION"),
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    ctx = StorageContext.from_defaults(vector_store=vector_store)
    # Embedding happens inside from_documents; show_progress surfaces a bar.
    VectorStoreIndex.from_documents(
        docs,
        storage_context=ctx,
        show_progress=True,
    )
    logger.info(
        f"Successfully uploaded documents to the {os.getenv('QDRANT_COLLECTION')} collection."
    )


if __name__ == "__main__":
    generate_datasource()
import logging
import os

from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore

logger = logging.getLogger("uvicorn")


def get_vector_store():
    """Construct a QdrantVectorStore from environment variables.

    Reads QDRANT_COLLECTION / QDRANT_URL / QDRANT_API_KEY via os.getenv
    (absent values pass as None). Returns the configured store so both
    index loading and ingestion share one construction path.
    """
    store = QdrantVectorStore(
        collection_name=os.getenv("QDRANT_COLLECTION"),
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    return store
# Local persistence root shared by the ingestion scripts below: the docstore
# (and, when the default in-memory store is used, the `SimpleVectorStore`)
# are written under this directory.
STORAGE_DIR = "storage"  # directory to save the stores to (document store and if used, the `SimpleVectorStore`)
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.settings import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from app.constants import STORAGE_DIR
from app.settings import init_settings
from app.engine.loaders import get_documents
from app.engine.vectordb import get_vector_store
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def get_doc_store():
    """Return the persisted docstore if one exists, otherwise a fresh one.

    A persisted docstore under STORAGE_DIR lets the ingestion pipeline's
    upsert/delete strategy detect previously ingested documents.
    """
    if os.path.exists(STORAGE_DIR):
        return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
    return SimpleDocumentStore()
def generate_datasource():
    """Run the ingestion pipeline over all documents and persist the result."""
    init_settings()
    logger.info("Creating new index")
    # Gather inputs: the raw documents, the docstore (used by the pipeline's
    # upsert/delete bookkeeping), and the configured vector store.
    docs = get_documents()
    doc_store = get_doc_store()
    vec_store = get_vector_store()

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(
                chunk_size=Settings.chunk_size,
                chunk_overlap=Settings.chunk_overlap,
            ),
            Settings.embed_model,
        ],
        docstore=doc_store,
        docstore_strategy="upserts_and_delete",
    )
    # llama_index having an typing issue when passing vector_store to IngestionPipeline
    # so we need to set it manually after initialization
    pipeline.vector_store = vec_store

    # Execute the pipeline; results land in the vector store.
    pipeline.run(show_progress=True, documents=docs)

    # The default vector store only keeps data in memory, so write it to disk.
    # (Can be removed when an external vector store is configured.)
    if isinstance(vec_store, SimpleVectorStore):
        vec_store.persist(os.path.join(STORAGE_DIR, "vector_store.json"))
    # Persisting the docstore is what makes the ingestion strategy
    # effective across runs.
    doc_store.persist(os.path.join(STORAGE_DIR, "docstore.json"))
    logger.info("Finished creating new index.")


if __name__ == "__main__":
    generate_datasource()
import logging

from llama_index.core.indices.vector_store import VectorStoreIndex

from app.engine.vectordb import get_vector_store

logger = logging.getLogger("uvicorn")


def get_index():
    """Build a VectorStoreIndex on top of the configured vector store.

    Delegates store construction to `get_vector_store()` so this loader is
    agnostic to which vector database backs the index.
    """
    logger.info("Loading the index...")
    store = get_vector_store()
    index = VectorStoreIndex.from_vector_store(store)
    logger.info("Loaded index successfully.")
    return index
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment