Skip to content
Snippets Groups Projects
Unverified Commit c094b0c6 authored by Huu Le (Lee)'s avatar Huu Le (Lee) Committed by GitHub
Browse files

Use ingestion pipeline in Python code (#61)


---------
Co-authored-by: default avatarMarcus Schiesser <mail@marcusschiesser.de>
parent e2567ffc
No related branches found
No related tags found
No related merge requests found
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed all loaded documents and store them in the Pinecone index.

    Reads PINECONE_API_KEY / PINECONE_INDEX_NAME / PINECONE_ENVIRONMENT
    from the environment; raises KeyError if any is missing.
    """
    init_settings()
    logger.info("Creating new index")
    # Load the source documents before touching the vector store.
    docs = get_documents()
    vector_store = PineconeVectorStore(
        api_key=os.environ["PINECONE_API_KEY"],
        index_name=os.environ["PINECONE_INDEX_NAME"],
        environment=os.environ["PINECONE_ENVIRONMENT"],
    )
    ctx = StorageContext.from_defaults(vector_store=vector_store)
    # Embeddings are computed here; the progress bar tracks their creation.
    VectorStoreIndex.from_documents(
        docs,
        storage_context=ctx,
        show_progress=True,
    )
    logger.info(
        f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}"
    )


if __name__ == "__main__":
    generate_datasource()
import logging
import os

from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore

logger = logging.getLogger("uvicorn")


def get_vector_store():
    """Construct a PineconeVectorStore from environment variables.

    Reads PINECONE_API_KEY / PINECONE_INDEX_NAME / PINECONE_ENVIRONMENT;
    raises KeyError if any is unset. Returns the configured store so
    callers (index loading and ingestion) share one construction path.
    """
    store = PineconeVectorStore(
        api_key=os.environ["PINECONE_API_KEY"],
        index_name=os.environ["PINECONE_INDEX_NAME"],
        environment=os.environ["PINECONE_ENVIRONMENT"],
    )
    return store
import logging
import os
from app.engine.loaders import get_documents
from app.settings import init_settings
from dotenv import load_dotenv
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.storage import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the loaded documents and upload them to the Qdrant collection.

    Connection settings come from the QDRANT_COLLECTION / QDRANT_URL /
    QDRANT_API_KEY environment variables (missing ones pass as None).
    """
    init_settings()
    logger.info("Creating new index with Qdrant")
    # Pull in the raw documents first.
    docs = get_documents()
    vector_store = QdrantVectorStore(
        collection_name=os.getenv("QDRANT_COLLECTION"),
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    ctx = StorageContext.from_defaults(vector_store=vector_store)
    # Embedding happens inside from_documents; show_progress surfaces a bar.
    VectorStoreIndex.from_documents(
        docs,
        storage_context=ctx,
        show_progress=True,
    )
    logger.info(
        f"Successfully uploaded documents to the {os.getenv('QDRANT_COLLECTION')} collection."
    )


if __name__ == "__main__":
    generate_datasource()
import logging
import os

from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore

logger = logging.getLogger("uvicorn")


def get_vector_store():
    """Construct a QdrantVectorStore from environment variables.

    Reads QDRANT_COLLECTION / QDRANT_URL / QDRANT_API_KEY via os.getenv
    (absent values pass as None). Returns the configured store so both
    index loading and ingestion share one construction path.
    """
    store = QdrantVectorStore(
        collection_name=os.getenv("QDRANT_COLLECTION"),
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    return store
# Local persistence root shared by the ingestion scripts below: the docstore
# (and, when the default in-memory store is used, the `SimpleVectorStore`)
# are written under this directory.
STORAGE_DIR = "storage"  # directory to save the stores to (document store and if used, the `SimpleVectorStore`)
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.settings import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from app.constants import STORAGE_DIR
from app.settings import init_settings
from app.engine.loaders import get_documents
from app.engine.vectordb import get_vector_store
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def get_doc_store():
    """Return the persisted docstore if one exists, otherwise a fresh one.

    A persisted docstore under STORAGE_DIR lets the ingestion pipeline's
    upsert/delete strategy detect previously ingested documents.
    """
    if os.path.exists(STORAGE_DIR):
        return SimpleDocumentStore.from_persist_dir(STORAGE_DIR)
    return SimpleDocumentStore()
def generate_datasource():
    """Run the ingestion pipeline over all documents and persist the result."""
    init_settings()
    logger.info("Creating new index")
    # Gather inputs: the raw documents, the docstore (used by the pipeline's
    # upsert/delete bookkeeping), and the configured vector store.
    docs = get_documents()
    doc_store = get_doc_store()
    vec_store = get_vector_store()

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(
                chunk_size=Settings.chunk_size,
                chunk_overlap=Settings.chunk_overlap,
            ),
            Settings.embed_model,
        ],
        docstore=doc_store,
        docstore_strategy="upserts_and_delete",
    )
    # llama_index having an typing issue when passing vector_store to IngestionPipeline
    # so we need to set it manually after initialization
    pipeline.vector_store = vec_store

    # Execute the pipeline; results land in the vector store.
    pipeline.run(show_progress=True, documents=docs)

    # The default vector store only keeps data in memory, so write it to disk.
    # (Can be removed when an external vector store is configured.)
    if isinstance(vec_store, SimpleVectorStore):
        vec_store.persist(os.path.join(STORAGE_DIR, "vector_store.json"))
    # Persisting the docstore is what makes the ingestion strategy
    # effective across runs.
    doc_store.persist(os.path.join(STORAGE_DIR, "docstore.json"))
    logger.info("Finished creating new index.")


if __name__ == "__main__":
    generate_datasource()
import logging

from llama_index.core.indices.vector_store import VectorStoreIndex

from app.engine.vectordb import get_vector_store

logger = logging.getLogger("uvicorn")


def get_index():
    """Build a VectorStoreIndex on top of the configured vector store.

    Delegates store construction to `get_vector_store()` so this loader is
    agnostic to which vector database backs the index.
    """
    logger.info("Loading the index...")
    store = get_vector_store()
    index = VectorStoreIndex.from_vector_store(store)
    logger.info("Loaded index successfully.")
    return index
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment