From a1feb524e953edbe62c5b8d8c9b500de360efafe Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Fri, 3 May 2024 11:06:02 +0800
Subject: [PATCH] Revert "Use ingestion pipeline in Python code (#61)"

This reverts commit c094b0c6bfee34b92a4daa2718e17307442e2a5f.
---
 .changeset/short-ducks-drum.md | 5 --
 templates/components/loaders/python/file.py | 5 +-
 .../vectordbs/python/astra/__init__.py | 0
 .../vectordbs/python/astra/generate.py | 37 ++++++++++
 .../python/astra/{vectordb.py => index.py} | 13 +++-
 .../vectordbs/python/milvus/__init__.py | 0
 .../vectordbs/python/milvus/generate.py | 39 +++++++++++
 .../vectordbs/python/milvus/index.py | 22 ++++++
 .../vectordbs/python/milvus/vectordb.py | 13 ----
 .../vectordbs/python/mongo/__init__.py | 0
 .../vectordbs/python/mongo/generate.py | 43 ++++++++++++
 .../vectordbs/python/mongo/index.py | 20 ++++++
 .../vectordbs/python/mongo/vectordb.py | 11 ---
 .../vectordbs/python/none/__init__.py | 0
 .../vectordbs/python/none/constants.py | 1 +
 .../vectordbs/python/none/generate.py | 32 +++++++++
 .../components/vectordbs/python/none/index.py | 20 ++++++
 .../vectordbs/python/none/vectordb.py | 13 ----
 .../vectordbs/python/pg/__init__.py | 0
 .../vectordbs/python/pg/constants.py | 2 +
 .../vectordbs/python/pg/generate.py | 35 ++++++++++
 .../components/vectordbs/python/pg/index.py | 13 ++++
 .../python/pg/{vectordb.py => utils.py} | 8 +--
 .../vectordbs/python/pinecone/__init__.py | 0
 .../vectordbs/python/pinecone/generate.py | 39 +++++++++++
 .../vectordbs/python/pinecone/index.py | 20 ++++++
 .../vectordbs/python/pinecone/vectordb.py | 11 ---
 .../vectordbs/python/qdrant/__init__.py | 0
 .../vectordbs/python/qdrant/generate.py | 37 ++++++++++
 .../vectordbs/python/qdrant/index.py | 20 ++++++
 .../vectordbs/python/qdrant/vectordb.py | 11 ---
 .../types/streaming/fastapi/app/constants.py | 1 -
 .../streaming/fastapi/app/engine/generate.py | 70 -------------------
 .../streaming/fastapi/app/engine/index.py | 13 ----
 34 files changed, 397 insertions(+), 157 deletions(-)
 delete mode 100644 .changeset/short-ducks-drum.md
 create mode 100644 templates/components/vectordbs/python/astra/__init__.py
 create mode 100644 templates/components/vectordbs/python/astra/generate.py
 rename templates/components/vectordbs/python/astra/{vectordb.py => index.py} (52%)
 create mode 100644 templates/components/vectordbs/python/milvus/__init__.py
 create mode 100644 templates/components/vectordbs/python/milvus/generate.py
 create mode 100644 templates/components/vectordbs/python/milvus/index.py
 delete mode 100644 templates/components/vectordbs/python/milvus/vectordb.py
 create mode 100644 templates/components/vectordbs/python/mongo/__init__.py
 create mode 100644 templates/components/vectordbs/python/mongo/generate.py
 create mode 100644 templates/components/vectordbs/python/mongo/index.py
 delete mode 100644 templates/components/vectordbs/python/mongo/vectordb.py
 create mode 100644 templates/components/vectordbs/python/none/__init__.py
 create mode 100644 templates/components/vectordbs/python/none/constants.py
 create mode 100644 templates/components/vectordbs/python/none/generate.py
 create mode 100644 templates/components/vectordbs/python/none/index.py
 delete mode 100644 templates/components/vectordbs/python/none/vectordb.py
 create mode 100644 templates/components/vectordbs/python/pg/__init__.py
 create mode 100644 templates/components/vectordbs/python/pg/constants.py
 create mode 100644 templates/components/vectordbs/python/pg/generate.py
 create mode 100644 templates/components/vectordbs/python/pg/index.py
 rename templates/components/vectordbs/python/pg/{vectordb.py => utils.py} (84%)
 create mode 100644 templates/components/vectordbs/python/pinecone/__init__.py
 create mode 100644 templates/components/vectordbs/python/pinecone/generate.py
 create mode 100644 templates/components/vectordbs/python/pinecone/index.py
 delete mode 100644 templates/components/vectordbs/python/pinecone/vectordb.py
 create mode 100644 templates/components/vectordbs/python/qdrant/__init__.py
 create mode 100644 templates/components/vectordbs/python/qdrant/generate.py
 create mode 100644 templates/components/vectordbs/python/qdrant/index.py
 delete mode 100644 templates/components/vectordbs/python/qdrant/vectordb.py
 delete mode 100644 templates/types/streaming/fastapi/app/constants.py
 delete mode 100644 templates/types/streaming/fastapi/app/engine/generate.py
 delete mode 100644 templates/types/streaming/fastapi/app/engine/index.py

diff --git a/.changeset/short-ducks-drum.md b/.changeset/short-ducks-drum.md
deleted file mode 100644
index 4980e727..00000000
--- a/.changeset/short-ducks-drum.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-"create-llama": patch
----
-
-Use ingestion pipeline for Python
diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py
index 6f72c29f..a814b0d0 100644
--- a/templates/components/loaders/python/file.py
+++ b/templates/components/loaders/python/file.py
@@ -27,7 +27,10 @@ def llama_parse_parser():
 
 def get_file_documents(config: FileLoaderConfig):
     from llama_index.core.readers import SimpleDirectoryReader
-    reader = SimpleDirectoryReader(config.data_dir, recursive=True, filename_as_id=True)
+    reader = SimpleDirectoryReader(
+        config.data_dir,
+        recursive=True,
+    )
     if config.use_llama_parse:
         parser = llama_parse_parser()
         reader.file_extractor = {".pdf": parser}
diff --git a/templates/components/vectordbs/python/astra/__init__.py b/templates/components/vectordbs/python/astra/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/astra/generate.py b/templates/components/vectordbs/python/astra/generate.py
new file mode 100644
index 00000000..4d2a54af
--- /dev/null
+++ b/templates/components/vectordbs/python/astra/generate.py
@@ -0,0 +1,37 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+import logging
+from llama_index.core.storage import StorageContext
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.astra_db import AstraDBVectorStore
+from app.settings import init_settings
+from app.engine.loaders import get_documents
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index")
+    documents = get_documents()
+    store = AstraDBVectorStore(
+        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+        api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
+        collection_name=os.environ["ASTRA_DB_COLLECTION"],
+        embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
+    )
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        show_progress=True, # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(f"Successfully created embeddings in the AstraDB")
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/astra/vectordb.py b/templates/components/vectordbs/python/astra/index.py
similarity index 52%
rename from templates/components/vectordbs/python/astra/vectordb.py
rename to templates/components/vectordbs/python/astra/index.py
index 0cd962d7..b1389f76 100644
--- a/templates/components/vectordbs/python/astra/vectordb.py
+++ b/templates/components/vectordbs/python/astra/index.py
@@ -1,12 +1,21 @@
+import logging
 import os
+
+from llama_index.core.indices import VectorStoreIndex
 from llama_index.vector_stores.astra_db import AstraDBVectorStore
 
 
-def get_vector_store():
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting to index from AstraDB...")
     store = AstraDBVectorStore(
         token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
         api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
         collection_name=os.environ["ASTRA_DB_COLLECTION"],
         embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
     )
-    return store
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished connecting to index from AstraDB.")
+    return index
diff --git a/templates/components/vectordbs/python/milvus/__init__.py b/templates/components/vectordbs/python/milvus/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/milvus/generate.py b/templates/components/vectordbs/python/milvus/generate.py
new file mode 100644
index 00000000..b5bfc9f9
--- /dev/null
+++ b/templates/components/vectordbs/python/milvus/generate.py
@@ -0,0 +1,39 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+import logging
+from llama_index.core.storage import StorageContext
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.milvus import MilvusVectorStore
+from app.settings import init_settings
+from app.engine.loaders import get_documents
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index")
+    # load the documents and create the index
+    documents = get_documents()
+    store = MilvusVectorStore(
+        uri=os.environ["MILVUS_ADDRESS"],
+        user=os.getenv("MILVUS_USERNAME"),
+        password=os.getenv("MILVUS_PASSWORD"),
+        collection_name=os.getenv("MILVUS_COLLECTION"),
+        dim=int(os.getenv("EMBEDDING_DIM")),
+    )
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        show_progress=True, # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(f"Successfully created embeddings in the Milvus")
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/milvus/index.py b/templates/components/vectordbs/python/milvus/index.py
new file mode 100644
index 00000000..ffd87e63
--- /dev/null
+++ b/templates/components/vectordbs/python/milvus/index.py
@@ -0,0 +1,22 @@
+import logging
+import os
+
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.milvus import MilvusVectorStore
+
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting to index from Milvus...")
+    store = MilvusVectorStore(
+        uri=os.getenv("MILVUS_ADDRESS"),
+        user=os.getenv("MILVUS_USERNAME"),
+        password=os.getenv("MILVUS_PASSWORD"),
+        collection_name=os.getenv("MILVUS_COLLECTION"),
+        dim=int(os.getenv("EMBEDDING_DIM")),
+    )
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished connecting to index from Milvus.")
+    return index
diff --git a/templates/components/vectordbs/python/milvus/vectordb.py b/templates/components/vectordbs/python/milvus/vectordb.py
deleted file mode 100644
index 5791f15d..00000000
--- a/templates/components/vectordbs/python/milvus/vectordb.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import os
-from llama_index.vector_stores.milvus import MilvusVectorStore
-
-
-def get_vector_store():
-    store = MilvusVectorStore(
-        uri=os.environ["MILVUS_ADDRESS"],
-        user=os.getenv("MILVUS_USERNAME"),
-        password=os.getenv("MILVUS_PASSWORD"),
-        collection_name=os.getenv("MILVUS_COLLECTION"),
-        dim=int(os.getenv("EMBEDDING_DIM")),
-    )
-    return store
diff --git a/templates/components/vectordbs/python/mongo/__init__.py b/templates/components/vectordbs/python/mongo/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/mongo/generate.py b/templates/components/vectordbs/python/mongo/generate.py
new file mode 100644
index 00000000..abe844c0
--- /dev/null
+++ b/templates/components/vectordbs/python/mongo/generate.py
@@ -0,0 +1,43 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+import logging
+from llama_index.core.storage import StorageContext
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
+from app.settings import init_settings
+from app.engine.loaders import get_documents
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index")
+    # load the documents and create the index
+    documents = get_documents()
+    store = MongoDBAtlasVectorSearch(
+        db_name=os.environ["MONGODB_DATABASE"],
+        collection_name=os.environ["MONGODB_VECTORS"],
+        index_name=os.environ["MONGODB_VECTOR_INDEX"],
+    )
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        show_progress=True, # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(
+        f"Successfully created embeddings in the MongoDB collection {os.environ['MONGODB_VECTORS']}"
+    )
+    logger.info(
+        """IMPORTANT: You can't query your index yet because you need to create a vector search index in MongoDB's UI now.
+See https://github.com/run-llama/mongodb-demo/tree/main?tab=readme-ov-file#create-a-vector-search-index"""
+    )
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/mongo/index.py b/templates/components/vectordbs/python/mongo/index.py
new file mode 100644
index 00000000..6dba7c1d
--- /dev/null
+++ b/templates/components/vectordbs/python/mongo/index.py
@@ -0,0 +1,20 @@
+import logging
+import os
+
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
+
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting to index from MongoDB...")
+    store = MongoDBAtlasVectorSearch(
+        db_name=os.environ["MONGODB_DATABASE"],
+        collection_name=os.environ["MONGODB_VECTORS"],
+        index_name=os.environ["MONGODB_VECTOR_INDEX"],
+    )
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished connecting to index from MongoDB.")
+    return index
diff --git a/templates/components/vectordbs/python/mongo/vectordb.py b/templates/components/vectordbs/python/mongo/vectordb.py
deleted file mode 100644
index d1fc5768..00000000
--- a/templates/components/vectordbs/python/mongo/vectordb.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import os
-from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
-
-
-def get_vector_store():
-    store = MongoDBAtlasVectorSearch(
-        db_name=os.environ["MONGODB_DATABASE"],
-        collection_name=os.environ["MONGODB_VECTORS"],
-        index_name=os.environ["MONGODB_VECTOR_INDEX"],
-    )
-    return store
diff --git a/templates/components/vectordbs/python/none/__init__.py b/templates/components/vectordbs/python/none/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/none/constants.py b/templates/components/vectordbs/python/none/constants.py
new file mode 100644
index 00000000..254998eb
--- /dev/null
+++ b/templates/components/vectordbs/python/none/constants.py
@@ -0,0 +1 @@
+STORAGE_DIR = "storage" # directory to cache the generated index
diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py
new file mode 100644
index 00000000..e38d89cb
--- /dev/null
+++ b/templates/components/vectordbs/python/none/generate.py
@@ -0,0 +1,32 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import logging
+from llama_index.core.indices import (
+    VectorStoreIndex,
+)
+from app.engine.constants import STORAGE_DIR
+from app.engine.loaders import get_documents
+from app.settings import init_settings
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index")
+    # load the documents and create the index
+    documents = get_documents()
+    index = VectorStoreIndex.from_documents(
+        documents,
+    )
+    # store it for later
+    index.storage_context.persist(STORAGE_DIR)
+    logger.info(f"Finished creating new index. Stored in {STORAGE_DIR}")
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/none/index.py b/templates/components/vectordbs/python/none/index.py
new file mode 100644
index 00000000..8b77414a
--- /dev/null
+++ b/templates/components/vectordbs/python/none/index.py
@@ -0,0 +1,20 @@
+import logging
+import os
+
+from app.engine.constants import STORAGE_DIR
+from llama_index.core.storage import StorageContext
+from llama_index.core.indices import load_index_from_storage
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    # check if storage already exists
+    if not os.path.exists(STORAGE_DIR):
+        return None
+    # load the existing index
+    logger.info(f"Loading index from {STORAGE_DIR}...")
+    storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
+    index = load_index_from_storage(storage_context)
+    logger.info(f"Finished loading index from {STORAGE_DIR}")
+    return index
diff --git a/templates/components/vectordbs/python/none/vectordb.py b/templates/components/vectordbs/python/none/vectordb.py
deleted file mode 100644
index 279f7a51..00000000
--- a/templates/components/vectordbs/python/none/vectordb.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import os
-
-from llama_index.core.vector_stores import SimpleVectorStore
-from app.constants import STORAGE_DIR
-
-
-def get_vector_store():
-    if not os.path.exists(STORAGE_DIR):
-        vector_store = SimpleVectorStore()
-    else:
-        vector_store = SimpleVectorStore.from_persist_dir(STORAGE_DIR)
-    vector_store.stores_text = True
-    return vector_store
diff --git a/templates/components/vectordbs/python/pg/__init__.py b/templates/components/vectordbs/python/pg/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/pg/constants.py b/templates/components/vectordbs/python/pg/constants.py
new file mode 100644
index 00000000..a4ebd918
--- /dev/null
+++ b/templates/components/vectordbs/python/pg/constants.py
@@ -0,0 +1,2 @@
+PGVECTOR_SCHEMA = "public"
+PGVECTOR_TABLE = "llamaindex_embedding"
\ No newline at end of file
diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py
new file mode 100644
index 00000000..79fa3bd7
--- /dev/null
+++ b/templates/components/vectordbs/python/pg/generate.py
@@ -0,0 +1,35 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import logging
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.core.storage import StorageContext
+
+from app.engine.loaders import get_documents
+from app.settings import init_settings
+from app.engine.utils import init_pg_vector_store_from_env
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index")
+    # load the documents and create the index
+    documents = get_documents()
+    store = init_pg_vector_store_from_env()
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        show_progress=True, # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(
+        f"Successfully created embeddings in the PG vector store, schema={store.schema_name} table={store.table_name}"
+    )
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/pg/index.py b/templates/components/vectordbs/python/pg/index.py
new file mode 100644
index 00000000..3c4f3180
--- /dev/null
+++ b/templates/components/vectordbs/python/pg/index.py
@@ -0,0 +1,13 @@
+import logging
+from llama_index.core.indices.vector_store import VectorStoreIndex
+from app.engine.utils import init_pg_vector_store_from_env
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting to index from PGVector...")
+    store = init_pg_vector_store_from_env()
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished connecting to index from PGVector.")
+    return index
diff --git a/templates/components/vectordbs/python/pg/vectordb.py b/templates/components/vectordbs/python/pg/utils.py
similarity index 84%
rename from templates/components/vectordbs/python/pg/vectordb.py
rename to templates/components/vectordbs/python/pg/utils.py
index da5eb1a2..39127846 100644
--- a/templates/components/vectordbs/python/pg/vectordb.py
+++ b/templates/components/vectordbs/python/pg/utils.py
@@ -1,13 +1,10 @@
 import os
 from llama_index.vector_stores.postgres import PGVectorStore
 from urllib.parse import urlparse
+from app.engine.constants import PGVECTOR_SCHEMA, PGVECTOR_TABLE
 
-STORAGE_DIR = "storage"
-PGVECTOR_SCHEMA = "public"
-PGVECTOR_TABLE = "llamaindex_embedding"
 
-
-def get_vector_store():
+def init_pg_vector_store_from_env():
     original_conn_string = os.environ.get("PG_CONNECTION_STRING")
     if original_conn_string is None or original_conn_string == "":
         raise ValueError("PG_CONNECTION_STRING environment variable is not set.")
@@ -27,5 +24,4 @@ def get_vector_store():
         async_connection_string=async_conn_string,
         schema_name=PGVECTOR_SCHEMA,
         table_name=PGVECTOR_TABLE,
-        embed_dim=int(os.environ.get("EMBEDDING_DIM", 768)),
     )
diff --git a/templates/components/vectordbs/python/pinecone/__init__.py b/templates/components/vectordbs/python/pinecone/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/pinecone/generate.py b/templates/components/vectordbs/python/pinecone/generate.py
new file mode 100644
index 00000000..5f233ba2
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/generate.py
@@ -0,0 +1,39 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+import logging
+from llama_index.core.storage import StorageContext
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+from app.settings import init_settings
+from app.engine.loaders import get_documents
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index")
+    # load the documents and create the index
+    documents = get_documents()
+    store = PineconeVectorStore(
+        api_key=os.environ["PINECONE_API_KEY"],
+        index_name=os.environ["PINECONE_INDEX_NAME"],
+        environment=os.environ["PINECONE_ENVIRONMENT"],
+    )
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        show_progress=True, # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(
+        f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}"
+    )
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/pinecone/index.py b/templates/components/vectordbs/python/pinecone/index.py
new file mode 100644
index 00000000..98824ffd
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/index.py
@@ -0,0 +1,20 @@
+import logging
+import os
+
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting to index from Pinecone...")
+    store = PineconeVectorStore(
+        api_key=os.environ["PINECONE_API_KEY"],
+        index_name=os.environ["PINECONE_INDEX_NAME"],
+        environment=os.environ["PINECONE_ENVIRONMENT"],
+    )
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished connecting to index from Pinecone.")
+    return index
diff --git a/templates/components/vectordbs/python/pinecone/vectordb.py b/templates/components/vectordbs/python/pinecone/vectordb.py
deleted file mode 100644
index d6ff2cf8..00000000
--- a/templates/components/vectordbs/python/pinecone/vectordb.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import os
-from llama_index.vector_stores.pinecone import PineconeVectorStore
-
-
-def get_vector_store():
-    store = PineconeVectorStore(
-        api_key=os.environ["PINECONE_API_KEY"],
-        index_name=os.environ["PINECONE_INDEX_NAME"],
-        environment=os.environ["PINECONE_ENVIRONMENT"],
-    )
-    return store
diff --git a/templates/components/vectordbs/python/qdrant/__init__.py b/templates/components/vectordbs/python/qdrant/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/qdrant/generate.py b/templates/components/vectordbs/python/qdrant/generate.py
new file mode 100644
index 00000000..db7c055e
--- /dev/null
+++ b/templates/components/vectordbs/python/qdrant/generate.py
@@ -0,0 +1,37 @@
+import logging
+import os
+from app.engine.loaders import get_documents
+from app.settings import init_settings
+from dotenv import load_dotenv
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.core.storage import StorageContext
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+load_dotenv()
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource():
+    init_settings()
+    logger.info("Creating new index with Qdrant")
+    # load the documents and create the index
+    documents = get_documents()
+    store = QdrantVectorStore(
+        collection_name=os.getenv("QDRANT_COLLECTION"),
+        url=os.getenv("QDRANT_URL"),
+        api_key=os.getenv("QDRANT_API_KEY"),
+    )
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        show_progress=True, # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(
+        f"Successfully uploaded documents to the {os.getenv('QDRANT_COLLECTION')} collection."
+    )
+
+
+if __name__ == "__main__":
+    generate_datasource()
diff --git a/templates/components/vectordbs/python/qdrant/index.py b/templates/components/vectordbs/python/qdrant/index.py
new file mode 100644
index 00000000..0a388d8a
--- /dev/null
+++ b/templates/components/vectordbs/python/qdrant/index.py
@@ -0,0 +1,20 @@
+import logging
+import os
+
+from llama_index.core.indices import VectorStoreIndex
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+
+
+logger = logging.getLogger("uvicorn")
+
+
+def get_index():
+    logger.info("Connecting to Qdrant collection..")
+    store = QdrantVectorStore(
+        collection_name=os.getenv("QDRANT_COLLECTION"),
+        url=os.getenv("QDRANT_URL"),
+        api_key=os.getenv("QDRANT_API_KEY"),
+    )
+    index = VectorStoreIndex.from_vector_store(store)
+    logger.info("Finished connecting to Qdrant collection.")
+    return index
diff --git a/templates/components/vectordbs/python/qdrant/vectordb.py b/templates/components/vectordbs/python/qdrant/vectordb.py
deleted file mode 100644
index 5f36c202..00000000
--- a/templates/components/vectordbs/python/qdrant/vectordb.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import os
-from llama_index.vector_stores.qdrant import QdrantVectorStore
-
-
-def get_vector_store():
-    store = QdrantVectorStore(
-        collection_name=os.getenv("QDRANT_COLLECTION"),
-        url=os.getenv("QDRANT_URL"),
-        api_key=os.getenv("QDRANT_API_KEY"),
-    )
-    return store
diff --git a/templates/types/streaming/fastapi/app/constants.py b/templates/types/streaming/fastapi/app/constants.py
deleted file mode 100644
index 61daefe5..00000000
--- a/templates/types/streaming/fastapi/app/constants.py
+++ /dev/null
@@ -1 +0,0 @@
-STORAGE_DIR = "storage" # directory to save the stores to (document store and if used, the `SimpleVectorStore`)
diff --git a/templates/types/streaming/fastapi/app/engine/generate.py b/templates/types/streaming/fastapi/app/engine/generate.py
deleted file mode 100644
index 3e1686dd..00000000
--- a/templates/types/streaming/fastapi/app/engine/generate.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import os
-import logging
-from llama_index.core.settings import Settings
-from llama_index.core.ingestion import IngestionPipeline
-from llama_index.core.node_parser import SentenceSplitter
-from llama_index.core.vector_stores import SimpleVectorStore
-from llama_index.core.storage.docstore import SimpleDocumentStore
-from app.constants import STORAGE_DIR
-from app.settings import init_settings
-from app.engine.loaders import get_documents
-from app.engine.vectordb import get_vector_store
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-
-
-def get_doc_store():
-    if not os.path.exists(STORAGE_DIR):
-        docstore = SimpleDocumentStore()
-        return docstore
-    else:
-        return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
-
-
-def generate_datasource():
-    init_settings()
-    logger.info("Creating new index")
-
-    # load the documents and create the index
-    documents = get_documents()
-    docstore = get_doc_store()
-    vector_store = get_vector_store()
-
-    # Create ingestion pipeline
-    ingestion_pipeline = IngestionPipeline(
-        transformations=[
-            SentenceSplitter(
-                chunk_size=Settings.chunk_size,
-                chunk_overlap=Settings.chunk_overlap,
-            ),
-            Settings.embed_model,
-        ],
-        docstore=docstore,
-        docstore_strategy="upserts_and_delete",
-    )
-
-    # llama_index having an typing issue when passing vector_store to IngestionPipeline
-    # so we need to set it manually after initialization
-    ingestion_pipeline.vector_store = vector_store
-
-    # Run the ingestion pipeline and store the results
-    ingestion_pipeline.run(show_progress=True, documents=documents)
-
-    # Default vector store only keeps data in memory, so we need to persist it
-    # Can remove if using a different vector store
-    if isinstance(vector_store, SimpleVectorStore):
-        vector_store.persist(os.path.join(STORAGE_DIR, "vector_store.json"))
-    # Persist the docstore to apply ingestion strategy
-    docstore.persist(os.path.join(STORAGE_DIR, "docstore.json"))
-
-    logger.info("Finished creating new index.")
-
-
-if __name__ == "__main__":
-    generate_datasource()
diff --git a/templates/types/streaming/fastapi/app/engine/index.py b/templates/types/streaming/fastapi/app/engine/index.py
deleted file mode 100644
index 3cc2beb7..00000000
--- a/templates/types/streaming/fastapi/app/engine/index.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import logging
-from llama_index.core.indices.vector_store import VectorStoreIndex
-from app.engine.vectordb import get_vector_store
-
-logger = logging.getLogger("uvicorn")
-
-
-def get_index():
-    logger.info("Loading the index...")
-    store = get_vector_store()
-    index = VectorStoreIndex.from_vector_store(store)
-    logger.info("Loaded index successfully.")
-    return index
--
GitLab