From 036c00db7394614000aaa0d252eb01c7931b0cc9 Mon Sep 17 00:00:00 2001
From: Thuc Pham <51660321+thucpn@users.noreply.github.com>
Date: Fri, 5 Jan 2024 13:13:52 +0700
Subject: [PATCH] Feat: add postgres vectordb (#308)

* feat: integrate create-llama with postgresql
* fix: get data for verification before inserting
* feat: show available vector DBs based on framework
---
 examples/pg-vector-store/README.md            |  27 +++-
 .../src/storage/vectorStore/PGVectorStore.ts  | 119 ++++++++++--------
 packages/create-llama/questions.ts            |  69 ++++++----
 .../vectordbs/typescript/pg/generate.mjs      |  45 +++++++
 .../vectordbs/typescript/pg/index.ts          |  39 ++++++
 .../vectordbs/typescript/pg/shared.mjs        |  24 ++++
 packages/create-llama/templates/index.ts      |   5 +
 packages/create-llama/templates/types.ts      |   2 +-
 packages/eslint-config-custom/index.js        |   2 +
 9 files changed, 252 insertions(+), 80 deletions(-)
 create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs
 create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts
 create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs

diff --git a/examples/pg-vector-store/README.md b/examples/pg-vector-store/README.md
index fc0316896..d6b4b9fc6 100644
--- a/examples/pg-vector-store/README.md
+++ b/examples/pg-vector-store/README.md
@@ -1,18 +1,35 @@
 # Postgres Vector Store
 
-There are two scripts available here: load-docs.ts and query.ts
+There are two scripts available here: `load-docs.ts` and `query.ts`
 
 ## Prerequisites
 
+### Start a DB Instance
+
 You'll need a postgres database instance against which to run these scripts. A simple docker command would look like this:
 
 > `docker run -d --rm --name vector-db -p 5432:5432 -e "POSTGRES_HOST_AUTH_METHOD=trust" ankane/pgvector`
 
-Set the PGHOST and PGUSER (and PGPASSWORD) environment variables to match your database setup.
+**NOTE:** Using `--rm` in the example docker command above means that the vector store will be deleted every time the container is stopped. For production purposes, use a volume to ensure persistence across restarts.
 
-You'll also need a value for OPENAI_API_KEY in your environment.
+If you prefer using a managed service, you can use [Timescale](https://docs.timescale.com/use-timescale/latest/services/create-a-service/?ref=timescale.com) to create a PostgreSQL database instance in the cloud as an alternative.
 
-**NOTE:** Using `--rm` in the example docker command above means that the vector store will be deleted every time the container is stopped. For production purposes, use a volume to ensure persistence across restarts.
+### Set up Environment
+
+Having created a DB instance, you can then set up environment variables for your database connection:
+
+```bash
+export PGHOST=<your database host>
+export PGUSER=<your database user>
+export PGPASSWORD=<your database password>
+export PGDATABASE=<your database name>
+export PGPORT=<your database port>
+export OPENAI_API_KEY=<your openai api key>
+```
+
+Set the environment variables above to match your database setup.
+Note that you'll also need an OpenAI key (`OPENAI_API_KEY`) in your environment.
+You're now ready to start the scripts.
 
 ## Setup and Loading Docs
 
@@ -22,7 +39,7 @@ To import documents and save the embedding vectors to your database:
 
 > `npx ts-node pg-vector-store/load-docs.ts data`
 
-where data is the directory containing your input files. Using the _data_ directory in the example above will read all of the files in that directory using the llamaindexTS default readers for each file type.
+where `data` is the directory containing your input files. Using the `data` directory in the example above will read all of the files in that directory using the LlamaIndexTS default readers for each file type.
 
 ## RAG Querying
 
diff --git a/packages/core/src/storage/vectorStore/PGVectorStore.ts b/packages/core/src/storage/vectorStore/PGVectorStore.ts
index 7bc0ccd33..46c0127b7 100644
--- a/packages/core/src/storage/vectorStore/PGVectorStore.ts
+++ b/packages/core/src/storage/vectorStore/PGVectorStore.ts
@@ -11,33 +11,43 @@ export const PGVECTOR_TABLE = "llamaindex_embedding";
 
 /**
  * Provides support for writing and querying vector data in Postgres.
+ * Note: This store can't be used with data created by the Python version of the vector store (https://docs.llamaindex.ai/en/stable/examples/vector_stores/postgres.html).
  */
 export class PGVectorStore implements VectorStore {
   storesText: boolean = true;
 
   private collection: string = "";
+  private schemaName: string = PGVECTOR_SCHEMA;
+  private tableName: string = PGVECTOR_TABLE;
+  private connectionString: string | undefined = undefined;
 
-  /*
-    FROM pg LIBRARY:
-    type Config = {
-      user?: string, // default process.env.PGUSER || process.env.USER
-      password?: string or function, //default process.env.PGPASSWORD
-      host?: string, // default process.env.PGHOST
-      database?: string, // default process.env.PGDATABASE || user
-      port?: number, // default process.env.PGPORT
-      connectionString?: string, // e.g. postgres://user:password@host:5432/database
-      ssl?: any, // passed directly to node.TLSSocket, supports all tls.connect options
-      types?: any, // custom type parsers
-      statement_timeout?: number, // number of milliseconds before a statement in query will time out, default is no timeout
-      query_timeout?: number, // number of milliseconds before a query call will timeout, default is no timeout
-      application_name?: string, // The name of the application that created this Client instance
-      connectionTimeoutMillis?: number, // number of milliseconds to wait for connection, default is no timeout
-      idle_in_transaction_session_timeout?: number // number of milliseconds before terminating any session with an open idle transaction, default is no timeout
-    }  
-  */
-  db?: pg.Client;
-
-  constructor() {}
+  private db?: pg.Client;
+
+  /**
+   * Creates a store bound to one schema/table of a Postgres database.
+   * When no `connectionString` is given, the connection is configured from
+   * these environment variables instead:
+   * PGHOST=<your database host>
+   * PGUSER=<your database user>
+   * PGPASSWORD=<your database password>
+   * PGDATABASE=<your database name>
+   * PGPORT=<your database port>
+   *
+   * @param config - Optional settings; every field may be omitted.
+   * @param config.schemaName - Schema to use. Defaults to PGVECTOR_SCHEMA.
+   * @param config.tableName - Table to use. Defaults to PGVECTOR_TABLE.
+   * @param config.connectionString - Postgres connection string.
+   */
+  constructor(config?: {
+    schemaName?: string;
+    tableName?: string;
+    connectionString?: string;
+  }) {
+    const { schemaName, tableName, connectionString } = config ?? {};
+    this.schemaName = schemaName ?? PGVECTOR_SCHEMA;
+    this.tableName = tableName ?? PGVECTOR_TABLE;
+    this.connectionString = connectionString;
+  }
 
   /**
    * Setter for the collection property.
@@ -66,7 +76,9 @@ export class PGVectorStore implements VectorStore {
       try {
         // Create DB connection
         // Read connection params from env - see comment block above
-        const db = new pg.Client();
+        const db = new pg.Client({
+          connectionString: this.connectionString,
+        });
         await db.connect();
 
         // Check vector extension
@@ -88,9 +100,9 @@ export class PGVectorStore implements VectorStore {
   }
 
   private async checkSchema(db: pg.Client) {
-    await db.query(`CREATE SCHEMA IF NOT EXISTS ${PGVECTOR_SCHEMA}`);
+    await db.query(`CREATE SCHEMA IF NOT EXISTS ${this.schemaName}`);
 
-    const tbl = `CREATE TABLE IF NOT EXISTS ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE}(
+    const tbl = `CREATE TABLE IF NOT EXISTS ${this.schemaName}.${this.tableName}(
       id uuid DEFAULT gen_random_uuid() PRIMARY KEY,
       external_id VARCHAR,
       collection VARCHAR,
@@ -100,16 +112,14 @@ export class PGVectorStore implements VectorStore {
     )`;
     await db.query(tbl);
 
-    const idxs = `CREATE INDEX IF NOT EXISTS idx_${PGVECTOR_TABLE}_external_id ON ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} (external_id);
-      CREATE INDEX IF NOT EXISTS idx_${PGVECTOR_TABLE}_collection ON ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} (collection);`;
+    const idxs = `CREATE INDEX IF NOT EXISTS idx_${this.tableName}_external_id ON ${this.schemaName}.${this.tableName} (external_id);
+      CREATE INDEX IF NOT EXISTS idx_${this.tableName}_collection ON ${this.schemaName}.${this.tableName} (collection);`;
     await db.query(idxs);
 
     // TODO add IVFFlat or HNSW indexing?
     return db;
   }
 
-  // isEmbeddingQuery?: boolean | undefined;
-
   /**
    * Connects to the database specified in environment vars.
    * This method also checks and creates the vector extension,
@@ -126,7 +136,7 @@ export class PGVectorStore implements VectorStore {
    * @returns The result of the delete query.
    */
   async clearCollection() {
-    const sql: string = `DELETE FROM ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} 
+    const sql: string = `DELETE FROM ${this.schemaName}.${this.tableName} 
       WHERE collection = $1`;
 
     const db = (await this.getDb()) as pg.Client;
@@ -135,6 +145,29 @@ export class PGVectorStore implements VectorStore {
     return ret;
   }
 
+  /**
+   * Builds the INSERT parameters for every node before any row is written, so
+   * an error raised while reading a node surfaces ahead of the first query.
+   * @param embeddingResults The nodes to convert.
+   * @returns One positional parameter array ($1..$6) per node.
+   */
+  private getDataToInsert(embeddingResults: BaseNode<Metadata>[]) {
+    return embeddingResults.map((row) => {
+      // An empty id is sent as NULL.
+      const id = row.id_.length ? row.id_ : null;
+      // Copy the metadata so the timestamp is not written onto the caller's node.
+      const meta = { ...row.metadata, create_date: new Date() };
+      return [
+        id,
+        "", // external_id
+        this.collection,
+        row.getContent(MetadataMode.EMBED),
+        meta,
+        "[" + row.getEmbedding().join(",") + "]",
+      ];
+    });
+  }
+
   /**
    * Adds vector record(s) to the table.
    * NOTE: Uses the collection property controlled by setCollection/getCollection.
@@ -147,34 +180,20 @@ export class PGVectorStore implements VectorStore {
       return Promise.resolve([]);
     }
 
-    const sql: string = `INSERT INTO ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} 
+    const sql: string = `INSERT INTO ${this.schemaName}.${this.tableName} 
       (id, external_id, collection, document, metadata, embeddings) 
       VALUES ($1, $2, $3, $4, $5, $6)`;
 
     const db = (await this.getDb()) as pg.Client;
+    const data = this.getDataToInsert(embeddingResults);
 
     let ret: string[] = [];
-    for (let index = 0; index < embeddingResults.length; index++) {
-      const row = embeddingResults[index];
-
-      let id: any = row.id_.length ? row.id_ : null;
-      let meta = row.metadata || {};
-      meta.create_date = new Date();
-
-      const params = [
-        id,
-        "",
-        this.collection,
-        row.getContent(MetadataMode.EMBED),
-        meta,
-        "[" + row.getEmbedding().join(",") + "]",
-      ];
-
+    for (let index = 0; index < data.length; index++) {
+      const params = data[index];
       try {
         const result = await db.query(sql, params);
-
         if (result.rows.length) {
-          id = result.rows[0].id as string;
+          const id = result.rows[0].id as string;
           ret.push(id);
         }
       } catch (err) {
@@ -197,7 +216,7 @@ export class PGVectorStore implements VectorStore {
     const collectionCriteria = this.collection.length
       ? "AND collection = $2"
       : "";
-    const sql: string = `DELETE FROM ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} 
+    const sql: string = `DELETE FROM ${this.schemaName}.${this.tableName} 
       WHERE id = $1 ${collectionCriteria}`;
 
     const db = (await this.getDb()) as pg.Client;
@@ -230,7 +249,7 @@ export class PGVectorStore implements VectorStore {
     const sql = `SELECT 
         v.*, 
         embeddings <-> $1 s 
-      FROM ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} v
+      FROM ${this.schemaName}.${this.tableName} v
       ${where}
       ORDER BY s 
       LIMIT ${max}
diff --git a/packages/create-llama/questions.ts b/packages/create-llama/questions.ts
index 72c1e0758..b144bfd70 100644
--- a/packages/create-llama/questions.ts
+++ b/packages/create-llama/questions.ts
@@ -1,9 +1,12 @@
 import ciInfo from "ci-info";
+import fs from "fs";
+import path from "path";
 import { blue, green } from "picocolors";
 import prompts from "prompts";
 import { InstallAppArgs } from "./create-app";
 import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant";
 import { getRepoRootFolders } from "./helpers/repo";
+import { TemplateFramework } from "./templates";
 
 export type QuestionArgs = Omit<InstallAppArgs, "appPath" | "packageManager">;
 
@@ -26,6 +29,31 @@ const handlers = {
   },
 };
 
+// Lists the vector DB options that have a template folder for the framework's language.
+const getVectorDbChoices = (framework: TemplateFramework) => {
+  const vectordbLang = framework === "fastapi" ? "python" : "typescript";
+  const vectordbPath = path.join(
+    path.join(__dirname, "components"),
+    "vectordbs",
+    vectordbLang,
+  );
+
+  // A vector DB counts as available when it has its own sub-directory.
+  const availableDirs = fs.readdirSync(vectordbPath).filter((entry) => {
+    return fs.statSync(path.join(vectordbPath, entry)).isDirectory();
+  });
+
+  const allChoices = [
+    {
+      title: "No, just store the data in the file system",
+      value: "none",
+    },
+    { title: "MongoDB", value: "mongo" },
+    { title: "PostgreSQL", value: "pg" },
+  ];
+  return allChoices.filter(({ value }) => availableDirs.includes(value));
+};
+
 export const onPromptState = (state: any) => {
   if (state.aborted) {
     // If we don't re-enable the terminal cursor before exiting
@@ -233,30 +261,23 @@ export const askQuestions = async (
       program.engine = engine;
       preferences.engine = engine;
     }
-  }
-
-  if (program.engine !== "simple" && !program.vectorDb) {
-    if (ciInfo.isCI) {
-      program.vectorDb = getPrefOrDefault("vectorDb");
-    } else {
-      const { vectorDb } = await prompts(
-        {
-          type: "select",
-          name: "vectorDb",
-          message: "Would you like to use a vector database?",
-          choices: [
-            {
-              title: "No, just store the data in the file system",
-              value: "none",
-            },
-            { title: "MongoDB", value: "mongo" },
-          ],
-          initial: 0,
-        },
-        handlers,
-      );
-      program.vectorDb = vectorDb;
-      preferences.vectorDb = vectorDb;
+    if (program.engine !== "simple" && !program.vectorDb) {
+      if (ciInfo.isCI) {
+        program.vectorDb = getPrefOrDefault("vectorDb");
+      } else {
+        const { vectorDb } = await prompts(
+          {
+            type: "select",
+            name: "vectorDb",
+            message: "Would you like to use a vector database?",
+            choices: getVectorDbChoices(program.framework),
+            initial: 0,
+          },
+          handlers,
+        );
+        program.vectorDb = vectorDb;
+        preferences.vectorDb = vectorDb;
+      }
     }
   }
 
diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs b/packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs
new file mode 100644
index 000000000..3d959c698
--- /dev/null
+++ b/packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs
@@ -0,0 +1,45 @@
+/* eslint-disable turbo/no-undeclared-env-vars */
+import * as dotenv from "dotenv";
+import {
+  PGVectorStore,
+  SimpleDirectoryReader,
+  VectorStoreIndex,
+  storageContextFromDefaults,
+} from "llamaindex";
+import {
+  PGVECTOR_SCHEMA,
+  PGVECTOR_TABLE,
+  STORAGE_DIR,
+  checkRequiredEnvVars,
+} from "./shared.mjs";
+
+dotenv.config();
+
+// Re-indexes STORAGE_DIR: loads its files, purges the collection, then stores fresh embeddings.
+async function loadAndIndex() {
+  // load objects from storage and convert them into LlamaIndex Document objects
+  const documents = await new SimpleDirectoryReader().loadData({
+    directoryPath: STORAGE_DIR,
+  });
+
+  // create postgres vector store and empty its collection before re-filling it
+  const vectorStore = new PGVectorStore({
+    connectionString: process.env.PG_CONNECTION_STRING,
+    schemaName: PGVECTOR_SCHEMA,
+    tableName: PGVECTOR_TABLE,
+  });
+  vectorStore.setCollection(STORAGE_DIR);
+  await vectorStore.clearCollection(); // awaited so the DELETE cannot race the inserts below
+  // create index from all the Documents
+  console.log("Start creating embeddings...");
+  const storageContext = await storageContextFromDefaults({ vectorStore });
+  await VectorStoreIndex.fromDocuments(documents, { storageContext });
+  console.log(`Successfully created embeddings.`);
+}
+
+(async () => {
+  checkRequiredEnvVars(); // throws before any work starts if a required env var is missing
+  await loadAndIndex();
+  console.log("Finished generating storage.");
+  process.exit(0); // NOTE(review): presumably needed because the open DB connection keeps the process alive - confirm
+})();
diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts b/packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts
new file mode 100644
index 000000000..96a98085a
--- /dev/null
+++ b/packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts
@@ -0,0 +1,39 @@
+/* eslint-disable turbo/no-undeclared-env-vars */
+import {
+  ContextChatEngine,
+  LLM,
+  PGVectorStore,
+  VectorStoreIndex,
+  serviceContextFromDefaults,
+} from "llamaindex";
+import {
+  CHUNK_OVERLAP,
+  CHUNK_SIZE,
+  PGVECTOR_SCHEMA,
+  PGVECTOR_TABLE,
+  checkRequiredEnvVars,
+} from "./shared.mjs";
+
+// Validates the env, then opens a VectorStoreIndex over the configured Postgres table.
+async function getDataSource(llm: LLM) {
+  checkRequiredEnvVars();
+  const vectorStore = new PGVectorStore({
+    tableName: PGVECTOR_TABLE,
+    schemaName: PGVECTOR_SCHEMA,
+    connectionString: process.env.PG_CONNECTION_STRING,
+  });
+  const chunking = { chunkSize: CHUNK_SIZE, chunkOverlap: CHUNK_OVERLAP };
+  return await VectorStoreIndex.fromVectorStore(
+    vectorStore,
+    serviceContextFromDefaults({ llm, ...chunking }),
+  );
+}
+
+// Chat engine whose retriever reads from getDataSource's index (similarityTopK of 5).
+export async function createChatEngine(llm: LLM) {
+  const dataSource = await getDataSource(llm);
+  const chatModel = llm;
+  const retriever = dataSource.asRetriever({ similarityTopK: 5 });
+  const engine = new ContextChatEngine({ chatModel, retriever });
+  return engine;
+}
diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs b/packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs
new file mode 100644
index 000000000..8ad729c0a
--- /dev/null
+++ b/packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs
@@ -0,0 +1,24 @@
+export const STORAGE_DIR = "./data";
+export const CHUNK_SIZE = 512;
+export const CHUNK_OVERLAP = 20;
+export const PGVECTOR_SCHEMA = "public";
+export const PGVECTOR_TABLE = "llamaindex_embedding";
+
+const REQUIRED_ENV_VARS = ["PG_CONNECTION_STRING", "OPENAI_API_KEY"];
+
+/**
+ * Ensures every variable named in REQUIRED_ENV_VARS has a non-empty value in
+ * process.env.
+ * @throws {Error} Naming the missing variables; the same list is logged first.
+ */
+export function checkRequiredEnvVars() {
+  const missing = REQUIRED_ENV_VARS.filter((name) => !process.env[name]);
+  if (missing.length === 0) return;
+
+  // Joined once so the log line and the thrown error always agree.
+  const names = missing.join(", ");
+  console.log(
+    `The following environment variables are required but missing: ${names}`,
+  );
+  throw new Error(`Missing environment variables: ${names}`);
+}
diff --git a/packages/create-llama/templates/index.ts b/packages/create-llama/templates/index.ts
index 973a65b85..97a3a4b5f 100644
--- a/packages/create-llama/templates/index.ts
+++ b/packages/create-llama/templates/index.ts
@@ -49,6 +49,11 @@ const createEnvLocalFile = async (
       content += `MONGODB_VECTOR_INDEX=\n`;
       break;
     }
+    case "pg": {
+      content += `# For generating a connection URI, see https://docs.timescale.com/use-timescale/latest/services/create-a-service\n`;
+      content += `PG_CONNECTION_STRING=\n`;
+      break;
+    }
   }
 
   if (content) {
diff --git a/packages/create-llama/templates/types.ts b/packages/create-llama/templates/types.ts
index a0567e10e..e245c8644 100644
--- a/packages/create-llama/templates/types.ts
+++ b/packages/create-llama/templates/types.ts
@@ -4,7 +4,7 @@ export type TemplateType = "simple" | "streaming" | "community";
 export type TemplateFramework = "nextjs" | "express" | "fastapi";
 export type TemplateEngine = "simple" | "context";
 export type TemplateUI = "html" | "shadcn";
-export type TemplateVectorDB = "none" | "mongo";
+export type TemplateVectorDB = "none" | "mongo" | "pg";
 
 export interface InstallTemplateArgs {
   appName: string;
diff --git a/packages/eslint-config-custom/index.js b/packages/eslint-config-custom/index.js
index 37100f452..4383def9e 100644
--- a/packages/eslint-config-custom/index.js
+++ b/packages/eslint-config-custom/index.js
@@ -44,6 +44,8 @@ module.exports = {
           "NOTION_TOKEN",
           "MONGODB_URI",
 
+          "PG_CONNECTION_STRING",
+
           "https_proxy",
           "npm_config_user_agent",
           "NEXT_PUBLIC_CHAT_API",
-- 
GitLab