From 2f3ddda5fcd452f816755c8324295150f69e1eea Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:55:50 +0700 Subject: [PATCH] feat: add pinecone support to create llama (#555) --- helpers/types.ts | 2 +- questions.ts | 1 + .../vectordbs/python/pinecone/__init__.py | 0 .../vectordbs/python/pinecone/constants.py | 3 ++ .../vectordbs/python/pinecone/context.py | 14 ++++++ .../vectordbs/python/pinecone/generate.py | 45 +++++++++++++++++++ .../vectordbs/python/pinecone/index.py | 23 ++++++++++ .../typescript/pinecone/generate.mjs | 35 +++++++++++++++ .../vectordbs/typescript/pinecone/index.ts | 29 ++++++++++++ .../vectordbs/typescript/pinecone/shared.mjs | 22 +++++++++ 10 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 templates/components/vectordbs/python/pinecone/__init__.py create mode 100644 templates/components/vectordbs/python/pinecone/constants.py create mode 100644 templates/components/vectordbs/python/pinecone/context.py create mode 100644 templates/components/vectordbs/python/pinecone/generate.py create mode 100644 templates/components/vectordbs/python/pinecone/index.py create mode 100644 templates/components/vectordbs/typescript/pinecone/generate.mjs create mode 100644 templates/components/vectordbs/typescript/pinecone/index.ts create mode 100644 templates/components/vectordbs/typescript/pinecone/shared.mjs diff --git a/helpers/types.ts b/helpers/types.ts index 5e4a9f6e..19253cc1 100644 --- a/helpers/types.ts +++ b/helpers/types.ts @@ -4,7 +4,7 @@ export type TemplateType = "simple" | "streaming" | "community" | "llamapack"; export type TemplateFramework = "nextjs" | "express" | "fastapi"; export type TemplateEngine = "simple" | "context"; export type TemplateUI = "html" | "shadcn"; -export type TemplateVectorDB = "none" | "mongo" | "pg"; +export type TemplateVectorDB = "none" | "mongo" | "pg" | "pinecone"; export type TemplatePostInstallAction = "none" | "dependencies" | "runApp"; 
export type TemplateDataSource = { type: TemplateDataSourceType; diff --git a/questions.ts b/questions.ts index 62fdb7ec..b0cca28c 100644 --- a/questions.ts +++ b/questions.ts @@ -89,6 +89,7 @@ const getVectorDbChoices = (framework: TemplateFramework) => { }, { title: "MongoDB", value: "mongo" }, { title: "PostgreSQL", value: "pg" }, + { title: "Pinecone", value: "pinecone" }, ]; const vectordbLang = framework === "fastapi" ? "python" : "typescript"; diff --git a/templates/components/vectordbs/python/pinecone/__init__.py b/templates/components/vectordbs/python/pinecone/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/templates/components/vectordbs/python/pinecone/constants.py b/templates/components/vectordbs/python/pinecone/constants.py new file mode 100644 index 00000000..0dd46619 --- /dev/null +++ b/templates/components/vectordbs/python/pinecone/constants.py @@ -0,0 +1,3 @@ +DATA_DIR = "data" # directory containing the documents to index +CHUNK_SIZE = 512 +CHUNK_OVERLAP = 20 diff --git a/templates/components/vectordbs/python/pinecone/context.py b/templates/components/vectordbs/python/pinecone/context.py new file mode 100644 index 00000000..ceb8a50a --- /dev/null +++ b/templates/components/vectordbs/python/pinecone/context.py @@ -0,0 +1,14 @@ +from llama_index import ServiceContext + +from app.context import create_base_context +from app.engine.constants import CHUNK_SIZE, CHUNK_OVERLAP + + +def create_service_context(): + base = create_base_context() + return ServiceContext.from_defaults( + llm=base.llm, + embed_model=base.embed_model, + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + ) diff --git a/templates/components/vectordbs/python/pinecone/generate.py b/templates/components/vectordbs/python/pinecone/generate.py new file mode 100644 index 00000000..8c0e1c0b --- /dev/null +++ b/templates/components/vectordbs/python/pinecone/generate.py @@ -0,0 +1,45 @@ +from dotenv import load_dotenv + +load_dotenv() +import os +import logging 
+from llama_index.vector_stores import PineconeVectorStore + +from app.engine.constants import DATA_DIR +from app.engine.context import create_service_context +from app.engine.loader import get_documents + + +from llama_index import ( + SimpleDirectoryReader, + VectorStoreIndex, + StorageContext, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger() + + +def generate_datasource(service_context): + logger.info("Creating new index") + # load the documents and create the index + documents = get_documents() + store = PineconeVectorStore( + api_key=os.environ["PINECONE_API_KEY"], + index_name=os.environ["PINECONE_INDEX_NAME"], + environment=os.environ["PINECONE_ENVIRONMENT"], + ) + storage_context = StorageContext.from_defaults(vector_store=store) + VectorStoreIndex.from_documents( + documents, + service_context=service_context, + storage_context=storage_context, + show_progress=True, # this will show you a progress bar as the embeddings are created + ) + logger.info( + f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}" + ) + + +if __name__ == "__main__": + generate_datasource(create_service_context()) diff --git a/templates/components/vectordbs/python/pinecone/index.py b/templates/components/vectordbs/python/pinecone/index.py new file mode 100644 index 00000000..6e9b8810 --- /dev/null +++ b/templates/components/vectordbs/python/pinecone/index.py @@ -0,0 +1,23 @@ +import logging +import os + +from llama_index import ( + VectorStoreIndex, +) +from llama_index.vector_stores import PineconeVectorStore + +from app.engine.context import create_service_context + + +def get_index(): + service_context = create_service_context() + logger = logging.getLogger("uvicorn") + logger.info("Connecting to index from Pinecone...") + store = PineconeVectorStore( + api_key=os.environ["PINECONE_API_KEY"], + index_name=os.environ["PINECONE_INDEX_NAME"], + environment=os.environ["PINECONE_ENVIRONMENT"], + ) + index = 
VectorStoreIndex.from_vector_store(store, service_context) + logger.info("Finished connecting to index from Pinecone.") + return index diff --git a/templates/components/vectordbs/typescript/pinecone/generate.mjs b/templates/components/vectordbs/typescript/pinecone/generate.mjs new file mode 100644 index 00000000..b371a639 --- /dev/null +++ b/templates/components/vectordbs/typescript/pinecone/generate.mjs @@ -0,0 +1,35 @@ +/* eslint-disable turbo/no-undeclared-env-vars */ +import * as dotenv from "dotenv"; +import { + PineconeVectorStore, + SimpleDirectoryReader, + VectorStoreIndex, + storageContextFromDefaults, +} from "llamaindex"; +import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs"; + +dotenv.config(); + +async function loadAndIndex() { + // load objects from storage and convert them into LlamaIndex Document objects + const documents = await new SimpleDirectoryReader().loadData({ + directoryPath: STORAGE_DIR, + }); + + // create vector store + const vectorStore = new PineconeVectorStore(); + + // create index from all the Documents and store them in Pinecone + console.log("Start creating embeddings..."); + const storageContext = await storageContextFromDefaults({ vectorStore }); + await VectorStoreIndex.fromDocuments(documents, { storageContext }); + console.log( + "Successfully created embeddings and save to your Pinecone index.", + ); +} + +(async () => { + checkRequiredEnvVars(); + await loadAndIndex(); + console.log("Finished generating storage."); +})(); diff --git a/templates/components/vectordbs/typescript/pinecone/index.ts b/templates/components/vectordbs/typescript/pinecone/index.ts new file mode 100644 index 00000000..be18486c --- /dev/null +++ b/templates/components/vectordbs/typescript/pinecone/index.ts @@ -0,0 +1,29 @@ +/* eslint-disable turbo/no-undeclared-env-vars */ +import { + ContextChatEngine, + LLM, + PineconeVectorStore, + VectorStoreIndex, + serviceContextFromDefaults, +} from "llamaindex"; +import { CHUNK_OVERLAP, CHUNK_SIZE,
checkRequiredEnvVars } from "./shared.mjs"; + +async function getDataSource(llm: LLM) { + checkRequiredEnvVars(); + const serviceContext = serviceContextFromDefaults({ + llm, + chunkSize: CHUNK_SIZE, + chunkOverlap: CHUNK_OVERLAP, + }); + const store = new PineconeVectorStore(); + return await VectorStoreIndex.fromVectorStore(store, serviceContext); +} + +export async function createChatEngine(llm: LLM) { + const index = await getDataSource(llm); + const retriever = index.asRetriever({ similarityTopK: 5 }); + return new ContextChatEngine({ + chatModel: llm, + retriever, + }); +} diff --git a/templates/components/vectordbs/typescript/pinecone/shared.mjs b/templates/components/vectordbs/typescript/pinecone/shared.mjs new file mode 100644 index 00000000..f9140261 --- /dev/null +++ b/templates/components/vectordbs/typescript/pinecone/shared.mjs @@ -0,0 +1,22 @@ +export const STORAGE_DIR = "./data"; +export const CHUNK_SIZE = 512; +export const CHUNK_OVERLAP = 20; + +const REQUIRED_ENV_VARS = ["PINECONE_ENVIRONMENT", "PINECONE_API_KEY"]; + +export function checkRequiredEnvVars() { + const missingEnvVars = REQUIRED_ENV_VARS.filter((envVar) => { + return !process.env[envVar]; + }); + + if (missingEnvVars.length > 0) { + console.log( + `The following environment variables are required but missing: ${missingEnvVars.join( + ", ", + )}`, + ); + throw new Error( + `Missing environment variables: ${missingEnvVars.join(", ")}`, + ); + } +} -- GitLab