Skip to content
Snippets Groups Projects
Unverified Commit c094b0c6 authored by Huu Le (Lee)'s avatar Huu Le (Lee) Committed by GitHub
Browse files

Use ingestion pipeline in Python code (#61)


---------
Co-authored-by: default avatarMarcus Schiesser <mail@marcusschiesser.de>
parent e2567ffc
No related branches found
No related tags found
No related merge requests found
Showing
with 51 additions and 226 deletions
---
"create-llama": patch
---
Use ingestion pipeline for Python
...@@ -27,10 +27,7 @@ def llama_parse_parser(): ...@@ -27,10 +27,7 @@ def llama_parse_parser():
def get_file_documents(config: FileLoaderConfig): def get_file_documents(config: FileLoaderConfig):
from llama_index.core.readers import SimpleDirectoryReader from llama_index.core.readers import SimpleDirectoryReader
reader = SimpleDirectoryReader( reader = SimpleDirectoryReader(config.data_dir, recursive=True, filename_as_id=True)
config.data_dir,
recursive=True,
)
if config.use_llama_parse: if config.use_llama_parse:
parser = llama_parse_parser() parser = llama_parse_parser()
reader.file_extractor = {".pdf": parser} reader.file_extractor = {".pdf": parser}
......
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.astra_db import AstraDBVectorStore
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents and store them in an AstraDB collection.

    Connection settings are read from the ASTRA_DB_APPLICATION_TOKEN,
    ASTRA_DB_ENDPOINT, ASTRA_DB_COLLECTION and EMBEDDING_DIM environment
    variables; a missing variable raises KeyError naming it.
    """
    init_settings()
    logger.info("Creating new index")
    # Load the source documents via the project loader.
    documents = get_documents()
    store = AstraDBVectorStore(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
        collection_name=os.environ["ASTRA_DB_COLLECTION"],
        embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
    )
    storage_context = StorageContext.from_defaults(vector_store=store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,  # this will show you a progress bar as the embeddings are created
    )
    # Plain string: the original used an f-string with no placeholders (F541).
    logger.info("Successfully created embeddings in the AstraDB")


if __name__ == "__main__":
    generate_datasource()
import logging
import os import os
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.astra_db import AstraDBVectorStore from llama_index.vector_stores.astra_db import AstraDBVectorStore
logger = logging.getLogger("uvicorn") def get_vector_store():
def get_index():
logger.info("Connecting to index from AstraDB...")
store = AstraDBVectorStore( store = AstraDBVectorStore(
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
api_endpoint=os.environ["ASTRA_DB_ENDPOINT"], api_endpoint=os.environ["ASTRA_DB_ENDPOINT"],
collection_name=os.environ["ASTRA_DB_COLLECTION"], collection_name=os.environ["ASTRA_DB_COLLECTION"],
embedding_dimension=int(os.environ["EMBEDDING_DIM"]), embedding_dimension=int(os.environ["EMBEDDING_DIM"]),
) )
index = VectorStoreIndex.from_vector_store(store) return store
logger.info("Finished connecting to index from AstraDB.")
return index
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.milvus import MilvusVectorStore
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents and store them in a Milvus collection.

    MILVUS_ADDRESS and EMBEDDING_DIM are required (KeyError names the missing
    variable); MILVUS_USERNAME, MILVUS_PASSWORD and MILVUS_COLLECTION are
    optional and default to None when unset.
    """
    init_settings()
    logger.info("Creating new index")
    # load the documents and create the index
    documents = get_documents()
    store = MilvusVectorStore(
        uri=os.environ["MILVUS_ADDRESS"],
        user=os.getenv("MILVUS_USERNAME"),
        password=os.getenv("MILVUS_PASSWORD"),
        # NOTE(review): a None collection name presumably falls back to the
        # Milvus client default — confirm against MilvusVectorStore docs.
        collection_name=os.getenv("MILVUS_COLLECTION"),
        # os.environ (not getenv): int(None) would raise an opaque TypeError;
        # this matches the sibling AstraDB script's handling of EMBEDDING_DIM.
        dim=int(os.environ["EMBEDDING_DIM"]),
    )
    storage_context = StorageContext.from_defaults(vector_store=store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,  # this will show you a progress bar as the embeddings are created
    )
    # Plain string: the original used an f-string with no placeholders (F541).
    logger.info("Successfully created embeddings in the Milvus")


if __name__ == "__main__":
    generate_datasource()
import logging
import os import os
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.milvus import MilvusVectorStore from llama_index.vector_stores.milvus import MilvusVectorStore
logger = logging.getLogger("uvicorn") def get_vector_store():
def get_index():
logger.info("Connecting to index from Milvus...")
store = MilvusVectorStore( store = MilvusVectorStore(
uri=os.getenv("MILVUS_ADDRESS"), uri=os.environ["MILVUS_ADDRESS"],
user=os.getenv("MILVUS_USERNAME"), user=os.getenv("MILVUS_USERNAME"),
password=os.getenv("MILVUS_PASSWORD"), password=os.getenv("MILVUS_PASSWORD"),
collection_name=os.getenv("MILVUS_COLLECTION"), collection_name=os.getenv("MILVUS_COLLECTION"),
dim=int(os.getenv("EMBEDDING_DIM")), dim=int(os.getenv("EMBEDDING_DIM")),
) )
index = VectorStoreIndex.from_vector_store(store) return store
logger.info("Finished connecting to index from Milvus.")
return index
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from llama_index.core.storage import StorageContext
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from app.settings import init_settings
from app.engine.loaders import get_documents
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents into a MongoDB Atlas vector collection.

    Reads MONGODB_DATABASE, MONGODB_VECTORS and MONGODB_VECTOR_INDEX from the
    environment; a missing variable raises KeyError naming it. Note that the
    Atlas vector search index itself must still be created in MongoDB's UI.
    """
    init_settings()
    logger.info("Creating new index")
    # Fetch the source documents through the project loader.
    docs = get_documents()
    vector_store = MongoDBAtlasVectorSearch(
        db_name=os.environ["MONGODB_DATABASE"],
        collection_name=os.environ["MONGODB_VECTORS"],
        index_name=os.environ["MONGODB_VECTOR_INDEX"],
    )
    ctx = StorageContext.from_defaults(vector_store=vector_store)
    # show_progress renders a progress bar while embeddings are computed.
    VectorStoreIndex.from_documents(docs, storage_context=ctx, show_progress=True)
    logger.info(
        f"Successfully created embeddings in the MongoDB collection {os.environ['MONGODB_VECTORS']}"
    )
    logger.info(
        """IMPORTANT: You can't query your index yet because you need to create a vector search index in MongoDB's UI now.
See https://github.com/run-llama/mongodb-demo/tree/main?tab=readme-ov-file#create-a-vector-search-index"""
    )


if __name__ == "__main__":
    generate_datasource()
import logging
import os import os
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
logger = logging.getLogger("uvicorn") def get_vector_store():
def get_index():
logger.info("Connecting to index from MongoDB...")
store = MongoDBAtlasVectorSearch( store = MongoDBAtlasVectorSearch(
db_name=os.environ["MONGODB_DATABASE"], db_name=os.environ["MONGODB_DATABASE"],
collection_name=os.environ["MONGODB_VECTORS"], collection_name=os.environ["MONGODB_VECTORS"],
index_name=os.environ["MONGODB_VECTOR_INDEX"], index_name=os.environ["MONGODB_VECTOR_INDEX"],
) )
index = VectorStoreIndex.from_vector_store(store) return store
logger.info("Finished connecting to index from MongoDB.")
return index
STORAGE_DIR = "storage" # directory to cache the generated index
from dotenv import load_dotenv
load_dotenv()
import logging
from llama_index.core.indices import (
VectorStoreIndex,
)
from app.engine.constants import STORAGE_DIR
from app.engine.loaders import get_documents
from app.settings import init_settings
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Build a vector index from the configured documents and persist it.

    The finished index is written to the local STORAGE_DIR so later runs can
    reload it instead of re-embedding.
    """
    init_settings()
    logger.info("Creating new index")
    # Load the source documents and embed them into an in-memory index.
    docs = get_documents()
    new_index = VectorStoreIndex.from_documents(docs)
    # Persist to disk so the index survives the process.
    new_index.storage_context.persist(STORAGE_DIR)
    logger.info(f"Finished creating new index. Stored in {STORAGE_DIR}")


if __name__ == "__main__":
    generate_datasource()
import logging
import os
from app.engine.constants import STORAGE_DIR
from llama_index.core.storage import StorageContext
from llama_index.core.indices import load_index_from_storage
logger = logging.getLogger("uvicorn")
def get_index():
    """Load the persisted index from STORAGE_DIR, or return None if absent."""
    # Nothing generated yet — signal the caller with None.
    if not os.path.exists(STORAGE_DIR):
        return None
    logger.info(f"Loading index from {STORAGE_DIR}...")
    ctx = StorageContext.from_defaults(persist_dir=STORAGE_DIR)
    loaded = load_index_from_storage(ctx)
    logger.info(f"Finished loading index from {STORAGE_DIR}")
    return loaded
import os
from llama_index.core.vector_stores import SimpleVectorStore
from app.constants import STORAGE_DIR
def get_vector_store():
    """Return a SimpleVectorStore, restored from STORAGE_DIR when it exists."""
    store = (
        SimpleVectorStore.from_persist_dir(STORAGE_DIR)
        if os.path.exists(STORAGE_DIR)
        else SimpleVectorStore()
    )
    # NOTE(review): stores_text=True presumably marks the store as keeping
    # node text alongside embeddings — confirm against SimpleVectorStore docs.
    store.stores_text = True
    return store
PGVECTOR_SCHEMA = "public"
PGVECTOR_TABLE = "llamaindex_embedding"
\ No newline at end of file
from dotenv import load_dotenv
load_dotenv()
import logging
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.storage import StorageContext
from app.engine.loaders import get_documents
from app.settings import init_settings
from app.engine.utils import init_pg_vector_store_from_env
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
def generate_datasource():
    """Embed the configured documents and write them to a Postgres pgvector store.

    Connection details come from init_pg_vector_store_from_env (driven by the
    PG_CONNECTION_STRING environment variable).
    """
    init_settings()
    logger.info("Creating new index")
    # Fetch the source documents through the project loader.
    docs = get_documents()
    store = init_pg_vector_store_from_env()
    ctx = StorageContext.from_defaults(vector_store=store)
    # show_progress renders a progress bar while embeddings are computed.
    VectorStoreIndex.from_documents(docs, storage_context=ctx, show_progress=True)
    logger.info(
        f"Successfully created embeddings in the PG vector store, schema={store.schema_name} table={store.table_name}"
    )


if __name__ == "__main__":
    generate_datasource()
import os import os
from llama_index.vector_stores.postgres import PGVectorStore from llama_index.vector_stores.postgres import PGVectorStore
from urllib.parse import urlparse from urllib.parse import urlparse
from app.engine.constants import PGVECTOR_SCHEMA, PGVECTOR_TABLE
STORAGE_DIR = "storage"
PGVECTOR_SCHEMA = "public"
PGVECTOR_TABLE = "llamaindex_embedding"
def init_pg_vector_store_from_env():
def get_vector_store():
original_conn_string = os.environ.get("PG_CONNECTION_STRING") original_conn_string = os.environ.get("PG_CONNECTION_STRING")
if original_conn_string is None or original_conn_string == "": if original_conn_string is None or original_conn_string == "":
raise ValueError("PG_CONNECTION_STRING environment variable is not set.") raise ValueError("PG_CONNECTION_STRING environment variable is not set.")
...@@ -24,4 +27,5 @@ def init_pg_vector_store_from_env(): ...@@ -24,4 +27,5 @@ def init_pg_vector_store_from_env():
async_connection_string=async_conn_string, async_connection_string=async_conn_string,
schema_name=PGVECTOR_SCHEMA, schema_name=PGVECTOR_SCHEMA,
table_name=PGVECTOR_TABLE, table_name=PGVECTOR_TABLE,
embed_dim=int(os.environ.get("EMBEDDING_DIM", 768)),
) )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment