From c094b0c6bfee34b92a4daa2718e17307442e2a5f Mon Sep 17 00:00:00 2001 From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com> Date: Fri, 26 Apr 2024 14:42:34 +0700 Subject: [PATCH] Use ingestion pipeline in Python code (#61) --------- Co-authored-by: Marcus Schiesser <mail@marcusschiesser.de> --- .changeset/short-ducks-drum.md | 5 ++ templates/components/loaders/python/file.py | 5 +- .../vectordbs/python/astra/__init__.py | 0 .../vectordbs/python/astra/generate.py | 37 ---------- .../python/astra/{index.py => vectordb.py} | 13 +--- .../vectordbs/python/milvus/__init__.py | 0 .../vectordbs/python/milvus/generate.py | 39 ----------- .../vectordbs/python/milvus/index.py | 22 ------ .../vectordbs/python/milvus/vectordb.py | 13 ++++ .../vectordbs/python/mongo/__init__.py | 0 .../vectordbs/python/mongo/generate.py | 43 ------------ .../vectordbs/python/mongo/index.py | 20 ------ .../vectordbs/python/mongo/vectordb.py | 11 +++ .../vectordbs/python/none/__init__.py | 0 .../vectordbs/python/none/constants.py | 1 - .../vectordbs/python/none/generate.py | 32 --------- .../components/vectordbs/python/none/index.py | 20 ------ .../vectordbs/python/none/vectordb.py | 13 ++++ .../vectordbs/python/pg/__init__.py | 0 .../vectordbs/python/pg/constants.py | 2 - .../vectordbs/python/pg/generate.py | 35 ---------- .../components/vectordbs/python/pg/index.py | 13 ---- .../python/pg/{utils.py => vectordb.py} | 8 ++- .../vectordbs/python/pinecone/__init__.py | 0 .../vectordbs/python/pinecone/generate.py | 39 ----------- .../vectordbs/python/pinecone/index.py | 20 ------ .../vectordbs/python/pinecone/vectordb.py | 11 +++ .../vectordbs/python/qdrant/__init__.py | 0 .../vectordbs/python/qdrant/generate.py | 37 ---------- .../vectordbs/python/qdrant/index.py | 20 ------ .../vectordbs/python/qdrant/vectordb.py | 11 +++ .../types/streaming/fastapi/app/constants.py | 1 + .../streaming/fastapi/app/engine/generate.py | 70 +++++++++++++++++++ .../streaming/fastapi/app/engine/index.py | 13 ++++ 34 files changed, 157 insertions(+), 397 deletions(-) create mode 100644 .changeset/short-ducks-drum.md delete mode 100644 templates/components/vectordbs/python/astra/__init__.py delete mode 100644 templates/components/vectordbs/python/astra/generate.py rename templates/components/vectordbs/python/astra/{index.py => vectordb.py} (52%) delete mode 100644 templates/components/vectordbs/python/milvus/__init__.py delete mode 100644 templates/components/vectordbs/python/milvus/generate.py delete mode 100644 templates/components/vectordbs/python/milvus/index.py create mode 100644 templates/components/vectordbs/python/milvus/vectordb.py delete mode 100644 templates/components/vectordbs/python/mongo/__init__.py delete mode 100644 templates/components/vectordbs/python/mongo/generate.py delete mode 100644 templates/components/vectordbs/python/mongo/index.py create mode 100644 templates/components/vectordbs/python/mongo/vectordb.py delete mode 100644 templates/components/vectordbs/python/none/__init__.py delete mode 100644 templates/components/vectordbs/python/none/constants.py delete mode 100644 templates/components/vectordbs/python/none/generate.py delete mode 100644 templates/components/vectordbs/python/none/index.py create mode 100644 templates/components/vectordbs/python/none/vectordb.py delete mode 100644 templates/components/vectordbs/python/pg/__init__.py delete mode 100644 templates/components/vectordbs/python/pg/constants.py delete mode 100644 templates/components/vectordbs/python/pg/generate.py delete mode 100644 
templates/components/vectordbs/python/pg/index.py rename templates/components/vectordbs/python/pg/{utils.py => vectordb.py} (84%) delete mode 100644 templates/components/vectordbs/python/pinecone/__init__.py delete mode 100644 templates/components/vectordbs/python/pinecone/generate.py delete mode 100644 templates/components/vectordbs/python/pinecone/index.py create mode 100644 templates/components/vectordbs/python/pinecone/vectordb.py delete mode 100644 templates/components/vectordbs/python/qdrant/__init__.py delete mode 100644 templates/components/vectordbs/python/qdrant/generate.py delete mode 100644 templates/components/vectordbs/python/qdrant/index.py create mode 100644 templates/components/vectordbs/python/qdrant/vectordb.py create mode 100644 templates/types/streaming/fastapi/app/constants.py create mode 100644 templates/types/streaming/fastapi/app/engine/generate.py create mode 100644 templates/types/streaming/fastapi/app/engine/index.py diff --git a/.changeset/short-ducks-drum.md b/.changeset/short-ducks-drum.md new file mode 100644 index 00000000..4980e727 --- /dev/null +++ b/.changeset/short-ducks-drum.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Use ingestion pipeline for Python diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py index a814b0d0..6f72c29f 100644 --- a/templates/components/loaders/python/file.py +++ b/templates/components/loaders/python/file.py @@ -27,10 +27,7 @@ def llama_parse_parser(): def get_file_documents(config: FileLoaderConfig): from llama_index.core.readers import SimpleDirectoryReader - reader = SimpleDirectoryReader( - config.data_dir, - recursive=True, - ) + reader = SimpleDirectoryReader(config.data_dir, recursive=True, filename_as_id=True) if config.use_llama_parse: parser = llama_parse_parser() reader.file_extractor = {".pdf": parser} diff --git a/templates/components/vectordbs/python/astra/__init__.py b/templates/components/vectordbs/python/astra/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/astra/generate.py b/templates/components/vectordbs/python/astra/generate.py deleted file mode 100644 index 4d2a54af..00000000 --- a/templates/components/vectordbs/python/astra/generate.py +++ /dev/null @@ -1,37 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.astra_db import AstraDBVectorStore -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - documents = get_documents() - store = AstraDBVectorStore( - token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], - api_endpoint=os.environ["ASTRA_DB_ENDPOINT"], - collection_name=os.environ["ASTRA_DB_COLLECTION"], - embedding_dimension=int(os.environ["EMBEDDING_DIM"]), - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info(f"Successfully created embeddings in the AstraDB") - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/astra/index.py 
b/templates/components/vectordbs/python/astra/vectordb.py similarity index 52% rename from templates/components/vectordbs/python/astra/index.py rename to templates/components/vectordbs/python/astra/vectordb.py index b1389f76..0cd962d7 100644 --- a/templates/components/vectordbs/python/astra/index.py +++ b/templates/components/vectordbs/python/astra/vectordb.py @@ -1,21 +1,12 @@ -import logging import os - -from llama_index.core.indices import VectorStoreIndex from llama_index.vector_stores.astra_db import AstraDBVectorStore -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from AstraDB...") +def get_vector_store(): store = AstraDBVectorStore( token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], api_endpoint=os.environ["ASTRA_DB_ENDPOINT"], collection_name=os.environ["ASTRA_DB_COLLECTION"], embedding_dimension=int(os.environ["EMBEDDING_DIM"]), ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from AstraDB.") - return index + return store diff --git a/templates/components/vectordbs/python/milvus/__init__.py b/templates/components/vectordbs/python/milvus/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/milvus/generate.py b/templates/components/vectordbs/python/milvus/generate.py deleted file mode 100644 index b5bfc9f9..00000000 --- a/templates/components/vectordbs/python/milvus/generate.py +++ /dev/null @@ -1,39 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.milvus import MilvusVectorStore -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = MilvusVectorStore( - uri=os.environ["MILVUS_ADDRESS"], - user=os.getenv("MILVUS_USERNAME"), - password=os.getenv("MILVUS_PASSWORD"), - collection_name=os.getenv("MILVUS_COLLECTION"), - dim=int(os.getenv("EMBEDDING_DIM")), - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info(f"Successfully created embeddings in the Milvus") - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/milvus/index.py b/templates/components/vectordbs/python/milvus/index.py deleted file mode 100644 index ffd87e63..00000000 --- a/templates/components/vectordbs/python/milvus/index.py +++ /dev/null @@ -1,22 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.milvus import MilvusVectorStore - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from Milvus...") - store = MilvusVectorStore( - uri=os.getenv("MILVUS_ADDRESS"), - user=os.getenv("MILVUS_USERNAME"), - password=os.getenv("MILVUS_PASSWORD"), - collection_name=os.getenv("MILVUS_COLLECTION"), - dim=int(os.getenv("EMBEDDING_DIM")), - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from Milvus.") - return index 
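The same refactor repeats for every backend in this patch: the per-database index.py (which built a VectorStoreIndex and logged the connection) is deleted, each vector-DB folder keeps only a small get_vector_store() factory, and a single shared app/engine/index.py (added at the end of the patch) wraps whichever store is configured. A minimal sketch of that contract, using the in-memory SimpleVectorStore as a stand-in backend; the store choice and the assumption that an embedding model is already configured via Settings are illustrative, not part of the patch:

from llama_index.core.indices import VectorStoreIndex
from llama_index.core.vector_stores import SimpleVectorStore


def get_vector_store():
    # Stand-in for MilvusVectorStore, AstraDBVectorStore, etc.; each real
    # backend module returns its own configured store here.
    store = SimpleVectorStore()
    store.stores_text = True  # lets the index be rebuilt from the store alone
    return store


def get_index():
    # Shared logic: wrap whatever store the backend factory provides.
    return VectorStoreIndex.from_vector_store(get_vector_store())

At query time the templates keep calling get_index() exactly as before, so only the storage wiring changes per backend.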
diff --git a/templates/components/vectordbs/python/milvus/vectordb.py b/templates/components/vectordbs/python/milvus/vectordb.py new file mode 100644 index 00000000..5791f15d --- /dev/null +++ b/templates/components/vectordbs/python/milvus/vectordb.py @@ -0,0 +1,13 @@ +import os +from llama_index.vector_stores.milvus import MilvusVectorStore + + +def get_vector_store(): + store = MilvusVectorStore( + uri=os.environ["MILVUS_ADDRESS"], + user=os.getenv("MILVUS_USERNAME"), + password=os.getenv("MILVUS_PASSWORD"), + collection_name=os.getenv("MILVUS_COLLECTION"), + dim=int(os.getenv("EMBEDDING_DIM")), + ) + return store diff --git a/templates/components/vectordbs/python/mongo/__init__.py b/templates/components/vectordbs/python/mongo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/mongo/generate.py b/templates/components/vectordbs/python/mongo/generate.py deleted file mode 100644 index abe844c0..00000000 --- a/templates/components/vectordbs/python/mongo/generate.py +++ /dev/null @@ -1,43 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = MongoDBAtlasVectorSearch( - db_name=os.environ["MONGODB_DATABASE"], - collection_name=os.environ["MONGODB_VECTORS"], - index_name=os.environ["MONGODB_VECTOR_INDEX"], - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully created embeddings in the MongoDB collection {os.environ['MONGODB_VECTORS']}" - ) - logger.info( - """IMPORTANT: You can't query your index yet because you need to create a vector search index in MongoDB's UI now. 
-See https://github.com/run-llama/mongodb-demo/tree/main?tab=readme-ov-file#create-a-vector-search-index""" - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/mongo/index.py b/templates/components/vectordbs/python/mongo/index.py deleted file mode 100644 index 6dba7c1d..00000000 --- a/templates/components/vectordbs/python/mongo/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from MongoDB...") - store = MongoDBAtlasVectorSearch( - db_name=os.environ["MONGODB_DATABASE"], - collection_name=os.environ["MONGODB_VECTORS"], - index_name=os.environ["MONGODB_VECTOR_INDEX"], - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from MongoDB.") - return index diff --git a/templates/components/vectordbs/python/mongo/vectordb.py b/templates/components/vectordbs/python/mongo/vectordb.py new file mode 100644 index 00000000..d1fc5768 --- /dev/null +++ b/templates/components/vectordbs/python/mongo/vectordb.py @@ -0,0 +1,11 @@ +import os +from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch + + +def get_vector_store(): + store = MongoDBAtlasVectorSearch( + db_name=os.environ["MONGODB_DATABASE"], + collection_name=os.environ["MONGODB_VECTORS"], + index_name=os.environ["MONGODB_VECTOR_INDEX"], + ) + return store diff --git a/templates/components/vectordbs/python/none/__init__.py b/templates/components/vectordbs/python/none/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/none/constants.py b/templates/components/vectordbs/python/none/constants.py deleted file mode 100644 index 254998eb..00000000 --- a/templates/components/vectordbs/python/none/constants.py +++ /dev/null @@ -1 +0,0 @@ -STORAGE_DIR = "storage" # directory to cache the generated index diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py deleted file mode 100644 index e38d89cb..00000000 --- a/templates/components/vectordbs/python/none/generate.py +++ /dev/null @@ -1,32 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import logging -from llama_index.core.indices import ( - VectorStoreIndex, -) -from app.engine.constants import STORAGE_DIR -from app.engine.loaders import get_documents -from app.settings import init_settings - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - index = VectorStoreIndex.from_documents( - documents, - ) - # store it for later - index.storage_context.persist(STORAGE_DIR) - logger.info(f"Finished creating new index. 
Stored in {STORAGE_DIR}") - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/none/index.py b/templates/components/vectordbs/python/none/index.py deleted file mode 100644 index 8b77414a..00000000 --- a/templates/components/vectordbs/python/none/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from app.engine.constants import STORAGE_DIR -from llama_index.core.storage import StorageContext -from llama_index.core.indices import load_index_from_storage - -logger = logging.getLogger("uvicorn") - - -def get_index(): - # check if storage already exists - if not os.path.exists(STORAGE_DIR): - return None - # load the existing index - logger.info(f"Loading index from {STORAGE_DIR}...") - storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR) - index = load_index_from_storage(storage_context) - logger.info(f"Finished loading index from {STORAGE_DIR}") - return index diff --git a/templates/components/vectordbs/python/none/vectordb.py b/templates/components/vectordbs/python/none/vectordb.py new file mode 100644 index 00000000..279f7a51 --- /dev/null +++ b/templates/components/vectordbs/python/none/vectordb.py @@ -0,0 +1,13 @@ +import os + +from llama_index.core.vector_stores import SimpleVectorStore +from app.constants import STORAGE_DIR + + +def get_vector_store(): + if not os.path.exists(STORAGE_DIR): + vector_store = SimpleVectorStore() + else: + vector_store = SimpleVectorStore.from_persist_dir(STORAGE_DIR) + vector_store.stores_text = True + return vector_store diff --git a/templates/components/vectordbs/python/pg/__init__.py b/templates/components/vectordbs/python/pg/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/pg/constants.py b/templates/components/vectordbs/python/pg/constants.py deleted file mode 100644 index a4ebd918..00000000 --- a/templates/components/vectordbs/python/pg/constants.py +++ /dev/null @@ -1,2 +0,0 @@ -PGVECTOR_SCHEMA = "public" -PGVECTOR_TABLE = "llamaindex_embedding" \ No newline at end of file diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py deleted file mode 100644 index 79fa3bd7..00000000 --- a/templates/components/vectordbs/python/pg/generate.py +++ /dev/null @@ -1,35 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import logging -from llama_index.core.indices import VectorStoreIndex -from llama_index.core.storage import StorageContext - -from app.engine.loaders import get_documents -from app.settings import init_settings -from app.engine.utils import init_pg_vector_store_from_env - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = init_pg_vector_store_from_env() - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully created embeddings in the PG vector store, schema={store.schema_name} table={store.table_name}" - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/pg/index.py b/templates/components/vectordbs/python/pg/index.py deleted file mode 100644 index 
3c4f3180..00000000 --- a/templates/components/vectordbs/python/pg/index.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging -from llama_index.core.indices.vector_store import VectorStoreIndex -from app.engine.utils import init_pg_vector_store_from_env - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from PGVector...") - store = init_pg_vector_store_from_env() - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from PGVector.") - return index diff --git a/templates/components/vectordbs/python/pg/utils.py b/templates/components/vectordbs/python/pg/vectordb.py similarity index 84% rename from templates/components/vectordbs/python/pg/utils.py rename to templates/components/vectordbs/python/pg/vectordb.py index 39127846..da5eb1a2 100644 --- a/templates/components/vectordbs/python/pg/utils.py +++ b/templates/components/vectordbs/python/pg/vectordb.py @@ -1,10 +1,13 @@ import os from llama_index.vector_stores.postgres import PGVectorStore from urllib.parse import urlparse -from app.engine.constants import PGVECTOR_SCHEMA, PGVECTOR_TABLE +STORAGE_DIR = "storage" +PGVECTOR_SCHEMA = "public" +PGVECTOR_TABLE = "llamaindex_embedding" -def init_pg_vector_store_from_env(): + +def get_vector_store(): original_conn_string = os.environ.get("PG_CONNECTION_STRING") if original_conn_string is None or original_conn_string == "": raise ValueError("PG_CONNECTION_STRING environment variable is not set.") @@ -24,4 +27,5 @@ def init_pg_vector_store_from_env(): async_connection_string=async_conn_string, schema_name=PGVECTOR_SCHEMA, table_name=PGVECTOR_TABLE, + embed_dim=int(os.environ.get("EMBEDDING_DIM", 768)), ) diff --git a/templates/components/vectordbs/python/pinecone/__init__.py b/templates/components/vectordbs/python/pinecone/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/pinecone/generate.py b/templates/components/vectordbs/python/pinecone/generate.py deleted file mode 100644 index 5f233ba2..00000000 --- a/templates/components/vectordbs/python/pinecone/generate.py +++ /dev/null @@ -1,39 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.pinecone import PineconeVectorStore -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = PineconeVectorStore( - api_key=os.environ["PINECONE_API_KEY"], - index_name=os.environ["PINECONE_INDEX_NAME"], - environment=os.environ["PINECONE_ENVIRONMENT"], - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}" - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/pinecone/index.py b/templates/components/vectordbs/python/pinecone/index.py deleted file mode 100644 index 98824ffd..00000000 --- 
a/templates/components/vectordbs/python/pinecone/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.pinecone import PineconeVectorStore - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from Pinecone...") - store = PineconeVectorStore( - api_key=os.environ["PINECONE_API_KEY"], - index_name=os.environ["PINECONE_INDEX_NAME"], - environment=os.environ["PINECONE_ENVIRONMENT"], - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from Pinecone.") - return index diff --git a/templates/components/vectordbs/python/pinecone/vectordb.py b/templates/components/vectordbs/python/pinecone/vectordb.py new file mode 100644 index 00000000..d6ff2cf8 --- /dev/null +++ b/templates/components/vectordbs/python/pinecone/vectordb.py @@ -0,0 +1,11 @@ +import os +from llama_index.vector_stores.pinecone import PineconeVectorStore + + +def get_vector_store(): + store = PineconeVectorStore( + api_key=os.environ["PINECONE_API_KEY"], + index_name=os.environ["PINECONE_INDEX_NAME"], + environment=os.environ["PINECONE_ENVIRONMENT"], + ) + return store diff --git a/templates/components/vectordbs/python/qdrant/__init__.py b/templates/components/vectordbs/python/qdrant/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/templates/components/vectordbs/python/qdrant/generate.py b/templates/components/vectordbs/python/qdrant/generate.py deleted file mode 100644 index db7c055e..00000000 --- a/templates/components/vectordbs/python/qdrant/generate.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -import os -from app.engine.loaders import get_documents -from app.settings import init_settings -from dotenv import load_dotenv -from llama_index.core.indices import VectorStoreIndex -from llama_index.core.storage import StorageContext -from llama_index.vector_stores.qdrant import QdrantVectorStore -load_dotenv() - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index with Qdrant") - # load the documents and create the index - documents = get_documents() - store = QdrantVectorStore( - collection_name=os.getenv("QDRANT_COLLECTION"), - url=os.getenv("QDRANT_URL"), - api_key=os.getenv("QDRANT_API_KEY"), - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully uploaded documents to the {os.getenv('QDRANT_COLLECTION')} collection." 
- ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/qdrant/index.py b/templates/components/vectordbs/python/qdrant/index.py deleted file mode 100644 index 0a388d8a..00000000 --- a/templates/components/vectordbs/python/qdrant/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.qdrant import QdrantVectorStore - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to Qdrant collection..") - store = QdrantVectorStore( - collection_name=os.getenv("QDRANT_COLLECTION"), - url=os.getenv("QDRANT_URL"), - api_key=os.getenv("QDRANT_API_KEY"), - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to Qdrant collection.") - return index diff --git a/templates/components/vectordbs/python/qdrant/vectordb.py b/templates/components/vectordbs/python/qdrant/vectordb.py new file mode 100644 index 00000000..5f36c202 --- /dev/null +++ b/templates/components/vectordbs/python/qdrant/vectordb.py @@ -0,0 +1,11 @@ +import os +from llama_index.vector_stores.qdrant import QdrantVectorStore + + +def get_vector_store(): + store = QdrantVectorStore( - collection_name=os.getenv("QDRANT_COLLECTION"), + collection_name=os.getenv("QDRANT_COLLECTION"), + url=os.getenv("QDRANT_URL"), + api_key=os.getenv("QDRANT_API_KEY"), + ) + return store diff --git a/templates/types/streaming/fastapi/app/constants.py b/templates/types/streaming/fastapi/app/constants.py new file mode 100644 index 00000000..61daefe5 --- /dev/null +++ b/templates/types/streaming/fastapi/app/constants.py @@ -0,0 +1 @@ +STORAGE_DIR = "storage" # directory to save the stores to (document store and if used, the `SimpleVectorStore`) diff --git a/templates/types/streaming/fastapi/app/engine/generate.py b/templates/types/streaming/fastapi/app/engine/generate.py new file mode 100644 index 00000000..3e1686dd --- /dev/null +++ b/templates/types/streaming/fastapi/app/engine/generate.py @@ -0,0 +1,70 @@ +from dotenv import load_dotenv + +load_dotenv() + +import os +import logging +from llama_index.core.settings import Settings +from llama_index.core.ingestion import IngestionPipeline +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.vector_stores import SimpleVectorStore +from llama_index.core.storage.docstore import SimpleDocumentStore +from app.constants import STORAGE_DIR +from app.settings import init_settings +from app.engine.loaders import get_documents +from app.engine.vectordb import get_vector_store + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger() + + +def get_doc_store(): + if not os.path.exists(STORAGE_DIR): + docstore = SimpleDocumentStore() + return docstore + else: + return SimpleDocumentStore.from_persist_dir(STORAGE_DIR) + + +def generate_datasource(): + init_settings() + logger.info("Creating new index") + + # load the documents and create the index + documents = get_documents() + docstore = get_doc_store() + vector_store = get_vector_store() + + # Create ingestion pipeline + ingestion_pipeline = IngestionPipeline( + transformations=[ + SentenceSplitter( + chunk_size=Settings.chunk_size, + chunk_overlap=Settings.chunk_overlap, + ), + Settings.embed_model, + ], + docstore=docstore, + docstore_strategy="upserts_and_delete", + ) + + # llama_index has a typing issue when passing vector_store to IngestionPipeline, + # so we need to set it manually after initialization + ingestion_pipeline.vector_store = vector_store + 
+ # Run the ingestion pipeline and store the results + ingestion_pipeline.run(show_progress=True, documents=documents) + + # Default vector store only keeps data in memory, so we need to persist it + # Can remove if using a different vector store + if isinstance(vector_store, SimpleVectorStore): + vector_store.persist(os.path.join(STORAGE_DIR, "vector_store.json")) + # Persist the docstore to apply ingestion strategy + docstore.persist(os.path.join(STORAGE_DIR, "docstore.json")) + + logger.info("Finished creating new index.") + + +if __name__ == "__main__": + generate_datasource() diff --git a/templates/types/streaming/fastapi/app/engine/index.py b/templates/types/streaming/fastapi/app/engine/index.py new file mode 100644 index 00000000..3cc2beb7 --- /dev/null +++ b/templates/types/streaming/fastapi/app/engine/index.py @@ -0,0 +1,13 @@ +import logging +from llama_index.core.indices.vector_store import VectorStoreIndex +from app.engine.vectordb import get_vector_store + +logger = logging.getLogger("uvicorn") + + +def get_index(): + logger.info("Loading the index...") + store = get_vector_store() + index = VectorStoreIndex.from_vector_store(store) + logger.info("Loaded index successfully.") + return index -- GitLab
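Taken together, the new app/engine/generate.py replaces the per-database generate.py scripts with one docstore-backed IngestionPipeline, so re-running generation upserts changed documents and removes deleted ones instead of re-embedding everything. A self-contained sketch of that flow with in-memory stores; the sample documents, chunk sizes, and the assumption that Settings.embed_model is already configured (for example by init_settings() or an OPENAI_API_KEY) are illustrative, not taken from the patch:

from llama_index.core import Document, Settings
from llama_index.core.ingestion import DocstoreStrategy, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.vector_stores import SimpleVectorStore

docstore = SimpleDocumentStore()
vector_store = SimpleVectorStore()

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
        Settings.embed_model,  # assumes an embedding model is configured
    ],
    docstore=docstore,
    docstore_strategy=DocstoreStrategy.UPSERTS_AND_DELETE,
)
# Same workaround as in generate.py: attach the vector store after construction.
pipeline.vector_store = vector_store

# The first run embeds the document; the second run sees the same doc_id with
# new content, so its old nodes are deleted from the vector store and re-ingested.
pipeline.run(documents=[Document(text="hello world", doc_id="doc-1")], show_progress=True)
pipeline.run(documents=[Document(text="hello again", doc_id="doc-1")], show_progress=True)

Persisting the docstore (and, for SimpleVectorStore, the vector store) under STORAGE_DIR, as generate.py does, is what lets the next invocation detect unchanged documents by hash and skip them.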