diff --git a/.changeset/short-ducks-drum.md b/.changeset/short-ducks-drum.md new file mode 100644 index 0000000000000000000000000000000000000000..4980e727faee34c9dbb18fd963fe02ec6ed4281d --- /dev/null +++ b/.changeset/short-ducks-drum.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Use ingestion pipeline for Python diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py index a814b0d083fb6331f904604304723e8617b3c423..6f72c29f369dde93ed83308b9ef15856c06a8d20 100644 --- a/templates/components/loaders/python/file.py +++ b/templates/components/loaders/python/file.py @@ -27,10 +27,7 @@ def llama_parse_parser(): def get_file_documents(config: FileLoaderConfig): from llama_index.core.readers import SimpleDirectoryReader - reader = SimpleDirectoryReader( - config.data_dir, - recursive=True, - ) + reader = SimpleDirectoryReader(config.data_dir, recursive=True, filename_as_id=True) if config.use_llama_parse: parser = llama_parse_parser() reader.file_extractor = {".pdf": parser} diff --git a/templates/components/vectordbs/python/astra/__init__.py b/templates/components/vectordbs/python/astra/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/templates/components/vectordbs/python/astra/generate.py b/templates/components/vectordbs/python/astra/generate.py deleted file mode 100644 index 4d2a54af9685ae16f54d61ad5316c162ad6239bd..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/astra/generate.py +++ /dev/null @@ -1,37 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.astra_db import AstraDBVectorStore -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) 
-logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - documents = get_documents() - store = AstraDBVectorStore( - token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], - api_endpoint=os.environ["ASTRA_DB_ENDPOINT"], - collection_name=os.environ["ASTRA_DB_COLLECTION"], - embedding_dimension=int(os.environ["EMBEDDING_DIM"]), - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info(f"Successfully created embeddings in the AstraDB") - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/astra/index.py b/templates/components/vectordbs/python/astra/vectordb.py similarity index 52% rename from templates/components/vectordbs/python/astra/index.py rename to templates/components/vectordbs/python/astra/vectordb.py index b1389f7659278c1e859da5d941716e8540b06977..0cd962d706ee2390837a112beacca750033b2ba9 100644 --- a/templates/components/vectordbs/python/astra/index.py +++ b/templates/components/vectordbs/python/astra/vectordb.py @@ -1,21 +1,12 @@ -import logging import os - -from llama_index.core.indices import VectorStoreIndex from llama_index.vector_stores.astra_db import AstraDBVectorStore -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from AstraDB...") +def get_vector_store(): store = AstraDBVectorStore( token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], api_endpoint=os.environ["ASTRA_DB_ENDPOINT"], collection_name=os.environ["ASTRA_DB_COLLECTION"], embedding_dimension=int(os.environ["EMBEDDING_DIM"]), ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from AstraDB.") - return index + return store diff --git a/templates/components/vectordbs/python/milvus/__init__.py 
b/templates/components/vectordbs/python/milvus/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/templates/components/vectordbs/python/milvus/generate.py b/templates/components/vectordbs/python/milvus/generate.py deleted file mode 100644 index b5bfc9f910819a0468bb6a29ce11b7ee6aa10d02..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/milvus/generate.py +++ /dev/null @@ -1,39 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.milvus import MilvusVectorStore -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = MilvusVectorStore( - uri=os.environ["MILVUS_ADDRESS"], - user=os.getenv("MILVUS_USERNAME"), - password=os.getenv("MILVUS_PASSWORD"), - collection_name=os.getenv("MILVUS_COLLECTION"), - dim=int(os.getenv("EMBEDDING_DIM")), - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info(f"Successfully created embeddings in the Milvus") - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/milvus/index.py b/templates/components/vectordbs/python/milvus/index.py deleted file mode 100644 index ffd87e630cb90b91d5b83e410627a36d13af1ff5..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/milvus/index.py +++ /dev/null @@ -1,22 
+0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.milvus import MilvusVectorStore - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from Milvus...") - store = MilvusVectorStore( - uri=os.getenv("MILVUS_ADDRESS"), - user=os.getenv("MILVUS_USERNAME"), - password=os.getenv("MILVUS_PASSWORD"), - collection_name=os.getenv("MILVUS_COLLECTION"), - dim=int(os.getenv("EMBEDDING_DIM")), - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from Milvus.") - return index diff --git a/templates/components/vectordbs/python/milvus/vectordb.py b/templates/components/vectordbs/python/milvus/vectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..5791f15d82d693475780d0f972c7371367c8fb28 --- /dev/null +++ b/templates/components/vectordbs/python/milvus/vectordb.py @@ -0,0 +1,13 @@ +import os +from llama_index.vector_stores.milvus import MilvusVectorStore + + +def get_vector_store(): + store = MilvusVectorStore( + uri=os.environ["MILVUS_ADDRESS"], + user=os.getenv("MILVUS_USERNAME"), + password=os.getenv("MILVUS_PASSWORD"), + collection_name=os.getenv("MILVUS_COLLECTION"), + dim=int(os.getenv("EMBEDDING_DIM")), + ) + return store diff --git a/templates/components/vectordbs/python/mongo/__init__.py b/templates/components/vectordbs/python/mongo/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/templates/components/vectordbs/python/mongo/generate.py b/templates/components/vectordbs/python/mongo/generate.py deleted file mode 100644 index abe844c03b7e210e1991fca2056d1bd44d4797a0..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/mongo/generate.py +++ /dev/null @@ -1,43 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from 
llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = MongoDBAtlasVectorSearch( - db_name=os.environ["MONGODB_DATABASE"], - collection_name=os.environ["MONGODB_VECTORS"], - index_name=os.environ["MONGODB_VECTOR_INDEX"], - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully created embeddings in the MongoDB collection {os.environ['MONGODB_VECTORS']}" - ) - logger.info( - """IMPORTANT: You can't query your index yet because you need to create a vector search index in MongoDB's UI now. 
-See https://github.com/run-llama/mongodb-demo/tree/main?tab=readme-ov-file#create-a-vector-search-index""" - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/mongo/index.py b/templates/components/vectordbs/python/mongo/index.py deleted file mode 100644 index 6dba7c1d05ddd8e77853f081632b3232a89bc7e2..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/mongo/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from MongoDB...") - store = MongoDBAtlasVectorSearch( - db_name=os.environ["MONGODB_DATABASE"], - collection_name=os.environ["MONGODB_VECTORS"], - index_name=os.environ["MONGODB_VECTOR_INDEX"], - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from MongoDB.") - return index diff --git a/templates/components/vectordbs/python/mongo/vectordb.py b/templates/components/vectordbs/python/mongo/vectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..d1fc57681867589017d48e253060e6dee710a4be --- /dev/null +++ b/templates/components/vectordbs/python/mongo/vectordb.py @@ -0,0 +1,11 @@ +import os +from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch + + +def get_vector_store(): + store = MongoDBAtlasVectorSearch( + db_name=os.environ["MONGODB_DATABASE"], + collection_name=os.environ["MONGODB_VECTORS"], + index_name=os.environ["MONGODB_VECTOR_INDEX"], + ) + return store diff --git a/templates/components/vectordbs/python/none/__init__.py b/templates/components/vectordbs/python/none/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/templates/components/vectordbs/python/none/constants.py b/templates/components/vectordbs/python/none/constants.py deleted file mode 100644 index 254998ebbda96cd491b7914ed795eb6b5cfe0d39..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/none/constants.py +++ /dev/null @@ -1 +0,0 @@ -STORAGE_DIR = "storage" # directory to cache the generated index diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py deleted file mode 100644 index e38d89cb81035ff1a3c3cf627add6e569c589b9f..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/none/generate.py +++ /dev/null @@ -1,32 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import logging -from llama_index.core.indices import ( - VectorStoreIndex, -) -from app.engine.constants import STORAGE_DIR -from app.engine.loaders import get_documents -from app.settings import init_settings - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - index = VectorStoreIndex.from_documents( - documents, - ) - # store it for later - index.storage_context.persist(STORAGE_DIR) - logger.info(f"Finished creating new index. 
Stored in {STORAGE_DIR}") - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/none/index.py b/templates/components/vectordbs/python/none/index.py deleted file mode 100644 index 8b77414a41f911de916d1d57c451d76df6707868..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/none/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from app.engine.constants import STORAGE_DIR -from llama_index.core.storage import StorageContext -from llama_index.core.indices import load_index_from_storage - -logger = logging.getLogger("uvicorn") - - -def get_index(): - # check if storage already exists - if not os.path.exists(STORAGE_DIR): - return None - # load the existing index - logger.info(f"Loading index from {STORAGE_DIR}...") - storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR) - index = load_index_from_storage(storage_context) - logger.info(f"Finished loading index from {STORAGE_DIR}") - return index diff --git a/templates/components/vectordbs/python/none/vectordb.py b/templates/components/vectordbs/python/none/vectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..279f7a51114ff2aca82ad649dd0868d87e1e039d --- /dev/null +++ b/templates/components/vectordbs/python/none/vectordb.py @@ -0,0 +1,13 @@ +import os + +from llama_index.core.vector_stores import SimpleVectorStore +from app.constants import STORAGE_DIR + + +def get_vector_store(): + if not os.path.exists(STORAGE_DIR): + vector_store = SimpleVectorStore() + else: + vector_store = SimpleVectorStore.from_persist_dir(STORAGE_DIR) + vector_store.stores_text = True + return vector_store diff --git a/templates/components/vectordbs/python/pg/__init__.py b/templates/components/vectordbs/python/pg/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/templates/components/vectordbs/python/pg/constants.py b/templates/components/vectordbs/python/pg/constants.py deleted file mode 100644 index a4ebd91831da4f3e6ff585106eee69fcf6993b0e..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/pg/constants.py +++ /dev/null @@ -1,2 +0,0 @@ -PGVECTOR_SCHEMA = "public" -PGVECTOR_TABLE = "llamaindex_embedding" \ No newline at end of file diff --git a/templates/components/vectordbs/python/pg/generate.py b/templates/components/vectordbs/python/pg/generate.py deleted file mode 100644 index 79fa3bd7345fb1bda984947a4449403e1becd23d..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/pg/generate.py +++ /dev/null @@ -1,35 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import logging -from llama_index.core.indices import VectorStoreIndex -from llama_index.core.storage import StorageContext - -from app.engine.loaders import get_documents -from app.settings import init_settings -from app.engine.utils import init_pg_vector_store_from_env - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = init_pg_vector_store_from_env() - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully created embeddings in the PG vector store, schema={store.schema_name} table={store.table_name}" - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/pg/index.py b/templates/components/vectordbs/python/pg/index.py deleted file mode 100644 index 3c4f31800b4f06fd286e8c23ab3fbdca393c4fca..0000000000000000000000000000000000000000 
--- a/templates/components/vectordbs/python/pg/index.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging -from llama_index.core.indices.vector_store import VectorStoreIndex -from app.engine.utils import init_pg_vector_store_from_env - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from PGVector...") - store = init_pg_vector_store_from_env() - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from PGVector.") - return index diff --git a/templates/components/vectordbs/python/pg/utils.py b/templates/components/vectordbs/python/pg/vectordb.py similarity index 84% rename from templates/components/vectordbs/python/pg/utils.py rename to templates/components/vectordbs/python/pg/vectordb.py index 39127846dfddb706a22fc1b20e3cef1bf98751a7..da5eb1a207e85078e4f314d193dac19ce9de4731 100644 --- a/templates/components/vectordbs/python/pg/utils.py +++ b/templates/components/vectordbs/python/pg/vectordb.py @@ -1,10 +1,13 @@ import os from llama_index.vector_stores.postgres import PGVectorStore from urllib.parse import urlparse -from app.engine.constants import PGVECTOR_SCHEMA, PGVECTOR_TABLE +STORAGE_DIR = "storage" +PGVECTOR_SCHEMA = "public" +PGVECTOR_TABLE = "llamaindex_embedding" -def init_pg_vector_store_from_env(): + +def get_vector_store(): original_conn_string = os.environ.get("PG_CONNECTION_STRING") if original_conn_string is None or original_conn_string == "": raise ValueError("PG_CONNECTION_STRING environment variable is not set.") @@ -24,4 +27,5 @@ def init_pg_vector_store_from_env(): async_connection_string=async_conn_string, schema_name=PGVECTOR_SCHEMA, table_name=PGVECTOR_TABLE, + embed_dim=int(os.environ.get("EMBEDDING_DIM", 768)), ) diff --git a/templates/components/vectordbs/python/pinecone/__init__.py b/templates/components/vectordbs/python/pinecone/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 
diff --git a/templates/components/vectordbs/python/pinecone/generate.py b/templates/components/vectordbs/python/pinecone/generate.py deleted file mode 100644 index 5f233ba235f40917b9b1dbc3b7c581802ec2f793..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/pinecone/generate.py +++ /dev/null @@ -1,39 +0,0 @@ -from dotenv import load_dotenv - -load_dotenv() - -import os -import logging -from llama_index.core.storage import StorageContext -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.pinecone import PineconeVectorStore -from app.settings import init_settings -from app.engine.loaders import get_documents - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index") - # load the documents and create the index - documents = get_documents() - store = PineconeVectorStore( - api_key=os.environ["PINECONE_API_KEY"], - index_name=os.environ["PINECONE_INDEX_NAME"], - environment=os.environ["PINECONE_ENVIRONMENT"], - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}" - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/pinecone/index.py b/templates/components/vectordbs/python/pinecone/index.py deleted file mode 100644 index 98824ffdc5f197ad9d3d0a3b546ffbee64f4f7ed..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/pinecone/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.pinecone import 
PineconeVectorStore - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to index from Pinecone...") - store = PineconeVectorStore( - api_key=os.environ["PINECONE_API_KEY"], - index_name=os.environ["PINECONE_INDEX_NAME"], - environment=os.environ["PINECONE_ENVIRONMENT"], - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to index from Pinecone.") - return index diff --git a/templates/components/vectordbs/python/pinecone/vectordb.py b/templates/components/vectordbs/python/pinecone/vectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ff2cf8785e5f2c62647e050fe754db7af442a5 --- /dev/null +++ b/templates/components/vectordbs/python/pinecone/vectordb.py @@ -0,0 +1,11 @@ +import os +from llama_index.vector_stores.pinecone import PineconeVectorStore + + +def get_vector_store(): + store = PineconeVectorStore( + api_key=os.environ["PINECONE_API_KEY"], + index_name=os.environ["PINECONE_INDEX_NAME"], + environment=os.environ["PINECONE_ENVIRONMENT"], + ) + return store diff --git a/templates/components/vectordbs/python/qdrant/__init__.py b/templates/components/vectordbs/python/qdrant/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/templates/components/vectordbs/python/qdrant/generate.py b/templates/components/vectordbs/python/qdrant/generate.py deleted file mode 100644 index db7c055e492311a5f6e6f01bbdf0406ce470cfde..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/qdrant/generate.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -import os -from app.engine.loaders import get_documents -from app.settings import init_settings -from dotenv import load_dotenv -from llama_index.core.indices import VectorStoreIndex -from llama_index.core.storage import StorageContext -from llama_index.vector_stores.qdrant import QdrantVectorStore -load_dotenv() - 
-logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() - - -def generate_datasource(): - init_settings() - logger.info("Creating new index with Qdrant") - # load the documents and create the index - documents = get_documents() - store = QdrantVectorStore( - collection_name=os.getenv("QDRANT_COLLECTION"), - url=os.getenv("QDRANT_URL"), - api_key=os.getenv("QDRANT_API_KEY"), - ) - storage_context = StorageContext.from_defaults(vector_store=store) - VectorStoreIndex.from_documents( - documents, - storage_context=storage_context, - show_progress=True, # this will show you a progress bar as the embeddings are created - ) - logger.info( - f"Successfully uploaded documents to the {os.getenv('QDRANT_COLLECTION')} collection." - ) - - -if __name__ == "__main__": - generate_datasource() diff --git a/templates/components/vectordbs/python/qdrant/index.py b/templates/components/vectordbs/python/qdrant/index.py deleted file mode 100644 index 0a388d8a303f937aedec42aaecb73668b83f0c84..0000000000000000000000000000000000000000 --- a/templates/components/vectordbs/python/qdrant/index.py +++ /dev/null @@ -1,20 +0,0 @@ -import logging -import os - -from llama_index.core.indices import VectorStoreIndex -from llama_index.vector_stores.qdrant import QdrantVectorStore - - -logger = logging.getLogger("uvicorn") - - -def get_index(): - logger.info("Connecting to Qdrant collection..") - store = QdrantVectorStore( - collection_name=os.getenv("QDRANT_COLLECTION"), - url=os.getenv("QDRANT_URL"), - api_key=os.getenv("QDRANT_API_KEY"), - ) - index = VectorStoreIndex.from_vector_store(store) - logger.info("Finished connecting to Qdrant collection.") - return index diff --git a/templates/components/vectordbs/python/qdrant/vectordb.py b/templates/components/vectordbs/python/qdrant/vectordb.py new file mode 100644 index 0000000000000000000000000000000000000000..5f36c202df1adf45f342004641036009e7c9b0e1 --- /dev/null +++ b/templates/components/vectordbs/python/qdrant/vectordb.py @@ -0,0 
+1,11 @@ +import os +from llama_index.vector_stores.qdrant import QdrantVectorStore + + +def get_vector_store(): + store = QdrantVectorStore( + collection_name=os.getenv("QDRANT_COLLECTION"), + url=os.getenv("QDRANT_URL"), + api_key=os.getenv("QDRANT_API_KEY"), + ) + return store diff --git a/templates/types/streaming/fastapi/app/constants.py b/templates/types/streaming/fastapi/app/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..61daefe5c4c2036acfc6757355ad1ca8eedca490 --- /dev/null +++ b/templates/types/streaming/fastapi/app/constants.py @@ -0,0 +1 @@ +STORAGE_DIR = "storage" # directory to save the stores to (document store and if used, the `SimpleVectorStore`) diff --git a/templates/types/streaming/fastapi/app/engine/generate.py b/templates/types/streaming/fastapi/app/engine/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..3e1686dd9035b65149ac670b9e84248461bf65ed --- /dev/null +++ b/templates/types/streaming/fastapi/app/engine/generate.py @@ -0,0 +1,70 @@ +from dotenv import load_dotenv + +load_dotenv() + +import os +import logging +from llama_index.core.settings import Settings +from llama_index.core.ingestion import IngestionPipeline +from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.vector_stores import SimpleVectorStore +from llama_index.core.storage.docstore import SimpleDocumentStore +from app.constants import STORAGE_DIR +from app.settings import init_settings +from app.engine.loaders import get_documents +from app.engine.vectordb import get_vector_store + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger() + + +def get_doc_store(): + if not os.path.exists(STORAGE_DIR): + docstore = SimpleDocumentStore() + return docstore + else: + return SimpleDocumentStore.from_persist_dir(STORAGE_DIR) + + +def generate_datasource(): + init_settings() + logger.info("Creating new index") + + # load the documents and create the index + documents = 
get_documents() + docstore = get_doc_store() + vector_store = get_vector_store() + + # Create ingestion pipeline + ingestion_pipeline = IngestionPipeline( + transformations=[ + SentenceSplitter( + chunk_size=Settings.chunk_size, + chunk_overlap=Settings.chunk_overlap, + ), + Settings.embed_model, + ], + docstore=docstore, + docstore_strategy="upserts_and_delete", + ) + + # llama_index has a typing issue when passing vector_store to IngestionPipeline + # so we need to set it manually after initialization + ingestion_pipeline.vector_store = vector_store + + # Run the ingestion pipeline and store the results + ingestion_pipeline.run(show_progress=True, documents=documents) + + # Default vector store only keeps data in memory, so we need to persist it + # Can remove if using a different vector store + if isinstance(vector_store, SimpleVectorStore): + vector_store.persist(os.path.join(STORAGE_DIR, "vector_store.json")) + # Persist the docstore to apply ingestion strategy + docstore.persist(os.path.join(STORAGE_DIR, "docstore.json")) + + logger.info("Finished creating new index.") + + +if __name__ == "__main__": + generate_datasource() diff --git a/templates/types/streaming/fastapi/app/engine/index.py b/templates/types/streaming/fastapi/app/engine/index.py new file mode 100644 index 0000000000000000000000000000000000000000..3cc2beb7000ced0eb7d5aac01cde8fbe583466c5 --- /dev/null +++ b/templates/types/streaming/fastapi/app/engine/index.py @@ -0,0 +1,13 @@ +import logging +from llama_index.core.indices.vector_store import VectorStoreIndex +from app.engine.vectordb import get_vector_store + +logger = logging.getLogger("uvicorn") + + +def get_index(): + logger.info("Loading the index...") + store = get_vector_store() + index = VectorStoreIndex.from_vector_store(store) + logger.info("Loaded index successfully.") + return index