From 2f3ddda5fcd452f816755c8324295150f69e1eea Mon Sep 17 00:00:00 2001
From: Thuc Pham <51660321+thucpn@users.noreply.github.com>
Date: Thu, 22 Feb 2024 09:55:50 +0700
Subject: [PATCH] feat: add pinecone support to create llama (#555)

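Add Pinecone as a vector database option in create-llama, alongside
the existing MongoDB and PostgreSQL choices. The new option is wired
into the CLI prompt in questions.ts and backed by new index and
ingestion components for both the Python (FastAPI) and TypeScript
(Next.js/Express) templates.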
---
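Notes (below the --- marker, so not part of the commit message): the
generated components read their Pinecone settings from the environment.
A minimal .env sketch; the values are placeholders, not real
credentials:

    PINECONE_API_KEY=<your-api-key>
    PINECONE_ENVIRONMENT=<your-environment>
    PINECONE_INDEX_NAME=<name-of-an-existing-index>

The Python components read all three variables and fail with a KeyError
if one is missing; the TypeScript checkRequiredEnvVars() helper only
checks PINECONE_ENVIRONMENT and PINECONE_API_KEY. Assuming the Python
component is copied into app/engine/, as its imports suggest, ingestion
can be run from the project root with `python -m app.engine.generate`.
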
 helpers/types.ts                              |  2 +-
 questions.ts                                  |  1 +
 .../vectordbs/python/pinecone/__init__.py     |  0
 .../vectordbs/python/pinecone/constants.py    |  3 ++
 .../vectordbs/python/pinecone/context.py      | 14 ++++++
 .../vectordbs/python/pinecone/generate.py     | 45 +++++++++++++++++++
 .../vectordbs/python/pinecone/index.py        | 23 ++++++++++
 .../typescript/pinecone/generate.mjs          | 35 +++++++++++++++
 .../vectordbs/typescript/pinecone/index.ts    | 29 ++++++++++++
 .../vectordbs/typescript/pinecone/shared.mjs  | 22 +++++++++
 10 files changed, 173 insertions(+), 1 deletion(-)
 create mode 100644 templates/components/vectordbs/python/pinecone/__init__.py
 create mode 100644 templates/components/vectordbs/python/pinecone/constants.py
 create mode 100644 templates/components/vectordbs/python/pinecone/context.py
 create mode 100644 templates/components/vectordbs/python/pinecone/generate.py
 create mode 100644 templates/components/vectordbs/python/pinecone/index.py
 create mode 100644 templates/components/vectordbs/typescript/pinecone/generate.mjs
 create mode 100644 templates/components/vectordbs/typescript/pinecone/index.ts
 create mode 100644 templates/components/vectordbs/typescript/pinecone/shared.mjs

diff --git a/helpers/types.ts b/helpers/types.ts
index 5e4a9f6e..19253cc1 100644
--- a/helpers/types.ts
+++ b/helpers/types.ts
@@ -4,7 +4,7 @@ export type TemplateType = "simple" | "streaming" | "community" | "llamapack";
 export type TemplateFramework = "nextjs" | "express" | "fastapi";
 export type TemplateEngine = "simple" | "context";
 export type TemplateUI = "html" | "shadcn";
-export type TemplateVectorDB = "none" | "mongo" | "pg";
+export type TemplateVectorDB = "none" | "mongo" | "pg" | "pinecone";
 export type TemplatePostInstallAction = "none" | "dependencies" | "runApp";
 export type TemplateDataSource = {
   type: TemplateDataSourceType;
diff --git a/questions.ts b/questions.ts
index 62fdb7ec..b0cca28c 100644
--- a/questions.ts
+++ b/questions.ts
@@ -89,6 +89,7 @@ const getVectorDbChoices = (framework: TemplateFramework) => {
     },
     { title: "MongoDB", value: "mongo" },
     { title: "PostgreSQL", value: "pg" },
+    { title: "Pinecone", value: "pinecone" },
   ];
 
   const vectordbLang = framework === "fastapi" ? "python" : "typescript";
diff --git a/templates/components/vectordbs/python/pinecone/__init__.py b/templates/components/vectordbs/python/pinecone/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/templates/components/vectordbs/python/pinecone/constants.py b/templates/components/vectordbs/python/pinecone/constants.py
new file mode 100644
index 00000000..0dd46619
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/constants.py
@@ -0,0 +1,3 @@
+DATA_DIR = "data"  # directory containing the documents to index
+CHUNK_SIZE = 512  # text chunk size used when splitting documents
+CHUNK_OVERLAP = 20  # overlap between consecutive chunks
diff --git a/templates/components/vectordbs/python/pinecone/context.py b/templates/components/vectordbs/python/pinecone/context.py
new file mode 100644
index 00000000..ceb8a50a
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/context.py
@@ -0,0 +1,14 @@
+from llama_index import ServiceContext
+
+from app.context import create_base_context
+from app.engine.constants import CHUNK_SIZE, CHUNK_OVERLAP
+
+
+def create_service_context():
+    base = create_base_context()
+    return ServiceContext.from_defaults(
+        llm=base.llm,
+        embed_model=base.embed_model,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
+    )
diff --git a/templates/components/vectordbs/python/pinecone/generate.py b/templates/components/vectordbs/python/pinecone/generate.py
new file mode 100644
index 00000000..8c0e1c0b
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/generate.py
@@ -0,0 +1,45 @@
+from dotenv import load_dotenv
+
+load_dotenv()
+import os
+import logging
+from llama_index.vector_stores import PineconeVectorStore
+
+from app.engine.constants import DATA_DIR
+from app.engine.context import create_service_context
+from app.engine.loader import get_documents
+
+
+from llama_index import (
+    SimpleDirectoryReader,
+    VectorStoreIndex,
+    StorageContext,
+)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger()
+
+
+def generate_datasource(service_context):
+    logger.info("Creating new index")
+    # load the documents and create the index
+    documents = get_documents()
+    store = PineconeVectorStore(
+        api_key=os.environ["PINECONE_API_KEY"],
+        index_name=os.environ["PINECONE_INDEX_NAME"],
+        environment=os.environ["PINECONE_ENVIRONMENT"],
+    )
+    storage_context = StorageContext.from_defaults(vector_store=store)
+    VectorStoreIndex.from_documents(
+        documents,
+        service_context=service_context,
+        storage_context=storage_context,
+        show_progress=True,  # this will show you a progress bar as the embeddings are created
+    )
+    logger.info(
+        f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}"
+    )
+
+
+if __name__ == "__main__":
+    generate_datasource(create_service_context())
diff --git a/templates/components/vectordbs/python/pinecone/index.py b/templates/components/vectordbs/python/pinecone/index.py
new file mode 100644
index 00000000..6e9b8810
--- /dev/null
+++ b/templates/components/vectordbs/python/pinecone/index.py
@@ -0,0 +1,23 @@
+import logging
+import os
+
+from llama_index import (
+    VectorStoreIndex,
+)
+from llama_index.vector_stores import PineconeVectorStore
+
+from app.engine.context import create_service_context
+
+
+def get_index():
+    service_context = create_service_context()
+    logger = logging.getLogger("uvicorn")
+    logger.info("Connecting to index from Pinecone...")
+    store = PineconeVectorStore(
+        api_key=os.environ["PINECONE_API_KEY"],
+        index_name=os.environ["PINECONE_INDEX_NAME"],
+        environment=os.environ["PINECONE_ENVIRONMENT"],
+    )
+    index = VectorStoreIndex.from_vector_store(store, service_context)
+    logger.info("Finished connecting to index from Pinecone.")
+    return index
diff --git a/templates/components/vectordbs/typescript/pinecone/generate.mjs b/templates/components/vectordbs/typescript/pinecone/generate.mjs
new file mode 100644
index 00000000..b371a639
--- /dev/null
+++ b/templates/components/vectordbs/typescript/pinecone/generate.mjs
@@ -0,0 +1,35 @@
+/* eslint-disable turbo/no-undeclared-env-vars */
+import * as dotenv from "dotenv";
+import {
+  PineconeVectorStore,
+  SimpleDirectoryReader,
+  VectorStoreIndex,
+  storageContextFromDefaults,
+} from "llamaindex";
+import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs";
+
+dotenv.config();
+
+async function loadAndIndex() {
+  // load documents from STORAGE_DIR and convert them into LlamaIndex Document objects
+  const documents = await new SimpleDirectoryReader().loadData({
+    directoryPath: STORAGE_DIR,
+  });
+
+  // create vector store
+  const vectorStore = new PineconeVectorStore();
+
+  // create an index from all the Documents and store it in Pinecone
+  console.log("Start creating embeddings...");
+  const storageContext = await storageContextFromDefaults({ vectorStore });
+  await VectorStoreIndex.fromDocuments(documents, { storageContext });
+  console.log(
+    "Successfully created embeddings and save to your Pinecone index.",
+  );
+}
+
+(async () => {
+  checkRequiredEnvVars();
+  await loadAndIndex();
+  console.log("Finished generating storage.");
+})();
diff --git a/templates/components/vectordbs/typescript/pinecone/index.ts b/templates/components/vectordbs/typescript/pinecone/index.ts
new file mode 100644
index 00000000..be18486c
--- /dev/null
+++ b/templates/components/vectordbs/typescript/pinecone/index.ts
@@ -0,0 +1,29 @@
+/* eslint-disable turbo/no-undeclared-env-vars */
+import {
+  ContextChatEngine,
+  LLM,
+  PineconeVectorStore,
+  VectorStoreIndex,
+  serviceContextFromDefaults,
+} from "llamaindex";
+import { CHUNK_OVERLAP, CHUNK_SIZE, checkRequiredEnvVars } from "./shared.mjs";
+
+async function getDataSource(llm: LLM) {
+  checkRequiredEnvVars();
+  const serviceContext = serviceContextFromDefaults({
+    llm,
+    chunkSize: CHUNK_SIZE,
+    chunkOverlap: CHUNK_OVERLAP,
+  });
+  const store = new PineconeVectorStore();
+  return await VectorStoreIndex.fromVectorStore(store, serviceContext);
+}
+
+export async function createChatEngine(llm: LLM) {
+  const index = await getDataSource(llm);
+  const retriever = index.asRetriever({ similarityTopK: 5 });
+  return new ContextChatEngine({
+    chatModel: llm,
+    retriever,
+  });
+}
diff --git a/templates/components/vectordbs/typescript/pinecone/shared.mjs b/templates/components/vectordbs/typescript/pinecone/shared.mjs
new file mode 100644
index 00000000..f9140261
--- /dev/null
+++ b/templates/components/vectordbs/typescript/pinecone/shared.mjs
@@ -0,0 +1,22 @@
+export const STORAGE_DIR = "./data"; // directory containing the documents to index
+export const CHUNK_SIZE = 512; // text chunk size used when splitting documents
+export const CHUNK_OVERLAP = 20; // overlap between consecutive chunks
+
+const REQUIRED_ENV_VARS = ["PINECONE_ENVIRONMENT", "PINECONE_API_KEY"];
+
+export function checkRequiredEnvVars() {
+  const missingEnvVars = REQUIRED_ENV_VARS.filter((envVar) => {
+    return !process.env[envVar];
+  });
+
+  if (missingEnvVars.length > 0) {
+    console.log(
+      `The following environment variables are required but missing: ${missingEnvVars.join(
+        ", ",
+      )}`,
+    );
+    throw new Error(
+      `Missing environment variables: ${missingEnvVars.join(", ")}`,
+    );
+  }
+}
-- 