From 036c00db7394614000aaa0d252eb01c7931b0cc9 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:13:52 +0700 Subject: [PATCH] Feat: add postgres vectordb (#308) * feat: integrate create-llama with postgresql * fix: get data for verification before inserting * feat: show available vector DBs based on framework --- examples/pg-vector-store/README.md | 27 +++- .../src/storage/vectorStore/PGVectorStore.ts | 119 ++++++++++-------- packages/create-llama/questions.ts | 69 ++++++---- .../vectordbs/typescript/pg/generate.mjs | 45 +++++++ .../vectordbs/typescript/pg/index.ts | 39 ++++++ .../vectordbs/typescript/pg/shared.mjs | 24 ++++ packages/create-llama/templates/index.ts | 5 + packages/create-llama/templates/types.ts | 2 +- packages/eslint-config-custom/index.js | 2 + 9 files changed, 252 insertions(+), 80 deletions(-) create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs diff --git a/examples/pg-vector-store/README.md b/examples/pg-vector-store/README.md index fc0316896..d6b4b9fc6 100644 --- a/examples/pg-vector-store/README.md +++ b/examples/pg-vector-store/README.md @@ -1,18 +1,35 @@ # Postgres Vector Store -There are two scripts available here: load-docs.ts and query.ts +There are two scripts available here: `load-docs.ts` and `query.ts` ## Prerequisites +### Start a DB Instance + You'll need a postgres database instance against which to run these scripts. A simple docker command would look like this: > `docker run -d --rm --name vector-db -p 5432:5432 -e "POSTGRES_HOST_AUTH_METHOD=trust" ankane/pgvector` -Set the PGHOST and PGUSER (and PGPASSWORD) environment variables to match your database setup. 
+**NOTE:** Using `--rm` in the example docker command above means that the vector store will be deleted every time the container is stopped. For production purposes, use a volume to ensure persistence across restarts. -You'll also need a value for OPENAI_API_KEY in your environment. +If you prefer using a managed service, you can use [Timescale](https://docs.timescale.com/use-timescale/latest/services/create-a-service/?ref=timescale.com) to create a PostgreSQL database instance in the cloud as an alternative. -**NOTE:** Using `--rm` in the example docker command above means that the vector store will be deleted every time the container is stopped. For production purposes, use a volume to ensure persistence across restarts. +### Set up Environment + +Having created a DB instance, you can then set up environment variables for your database connection: + +```bash +export PGHOST=<your database host> +export PGUSER=<your database user> +export PGPASSWORD=<your database password> +export PGDATABASE=<your database name> +export PGPORT=<your database port> +export OPENAI_API_KEY=<your openai api key> +``` + +Set the environment variables above to match your database setup. +Note that you'll also need an OpenAI key (`OPENAI_API_KEY`) in your environment. +You're now ready to start the scripts. ## Setup and Loading Docs @@ -22,7 +39,7 @@ To import documents and save the embedding vectors to your database: > `npx ts-node pg-vector-store/load-docs.ts data` -where data is the directory containing your input files. Using the _data_ directory in the example above will read all of the files in that directory using the llamaindexTS default readers for each file type. +where data is the directory containing your input files. Using the `data` directory in the example above will read all of the files in that directory using the LlamaIndexTS default readers for each file type. 
## RAG Querying diff --git a/packages/core/src/storage/vectorStore/PGVectorStore.ts b/packages/core/src/storage/vectorStore/PGVectorStore.ts index 7bc0ccd33..46c0127b7 100644 --- a/packages/core/src/storage/vectorStore/PGVectorStore.ts +++ b/packages/core/src/storage/vectorStore/PGVectorStore.ts @@ -11,33 +11,43 @@ export const PGVECTOR_TABLE = "llamaindex_embedding"; /** * Provides support for writing and querying vector data in Postgres. + * Note: Can't be used with data created using the Python version of the vector store (https://docs.llamaindex.ai/en/stable/examples/vector_stores/postgres.html) */ export class PGVectorStore implements VectorStore { storesText: boolean = true; private collection: string = ""; + private schemaName: string = PGVECTOR_SCHEMA; + private tableName: string = PGVECTOR_TABLE; + private connectionString: string | undefined = undefined; - /* - FROM pg LIBRARY: - type Config = { - user?: string, // default process.env.PGUSER || process.env.USER - password?: string or function, //default process.env.PGPASSWORD - host?: string, // default process.env.PGHOST - database?: string, // default process.env.PGDATABASE || user - port?: number, // default process.env.PGPORT - connectionString?: string, // e.g. 
postgres://user:password@host:5432/database - ssl?: any, // passed directly to node.TLSSocket, supports all tls.connect options - types?: any, // custom type parsers - statement_timeout?: number, // number of milliseconds before a statement in query will time out, default is no timeout - query_timeout?: number, // number of milliseconds before a query call will timeout, default is no timeout - application_name?: string, // The name of the application that created this Client instance - connectionTimeoutMillis?: number, // number of milliseconds to wait for connection, default is no timeout - idle_in_transaction_session_timeout?: number // number of milliseconds before terminating any session with an open idle transaction, default is no timeout - } - */ - db?: pg.Client; - - constructor() {} + private db?: pg.Client; + + /** + * Constructs a new instance of the PGVectorStore + * + * If the `connectionString` is not provided the following env variables are + * used to connect to the DB: + * PGHOST=<your database host> + * PGUSER=<your database user> + * PGPASSWORD=<your database password> + * PGDATABASE=<your database name> + * PGPORT=<your database port> + * + * @param {object} config - The configuration settings for the instance. + * @param {string} config.schemaName - The name of the schema (optional). Defaults to PGVECTOR_SCHEMA. + * @param {string} config.tableName - The name of the table (optional). Defaults to PGVECTOR_TABLE. + * @param {string} config.connectionString - The connection string (optional). + */ + constructor(config?: { + schemaName?: string; + tableName?: string; + connectionString?: string; + }) { + this.schemaName = config?.schemaName ?? PGVECTOR_SCHEMA; + this.tableName = config?.tableName ?? PGVECTOR_TABLE; + this.connectionString = config?.connectionString; + } /** * Setter for the collection property. 
@@ -66,7 +76,9 @@ export class PGVectorStore implements VectorStore { try { // Create DB connection // Read connection params from env - see comment block above - const db = new pg.Client(); + const db = new pg.Client({ + connectionString: this.connectionString, + }); await db.connect(); // Check vector extension @@ -88,9 +100,9 @@ export class PGVectorStore implements VectorStore { } private async checkSchema(db: pg.Client) { - await db.query(`CREATE SCHEMA IF NOT EXISTS ${PGVECTOR_SCHEMA}`); + await db.query(`CREATE SCHEMA IF NOT EXISTS ${this.schemaName}`); - const tbl = `CREATE TABLE IF NOT EXISTS ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE}( + const tbl = `CREATE TABLE IF NOT EXISTS ${this.schemaName}.${this.tableName}( id uuid DEFAULT gen_random_uuid() PRIMARY KEY, external_id VARCHAR, collection VARCHAR, @@ -100,16 +112,14 @@ export class PGVectorStore implements VectorStore { )`; await db.query(tbl); - const idxs = `CREATE INDEX IF NOT EXISTS idx_${PGVECTOR_TABLE}_external_id ON ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} (external_id); - CREATE INDEX IF NOT EXISTS idx_${PGVECTOR_TABLE}_collection ON ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} (collection);`; + const idxs = `CREATE INDEX IF NOT EXISTS idx_${this.tableName}_external_id ON ${this.schemaName}.${this.tableName} (external_id); + CREATE INDEX IF NOT EXISTS idx_${this.tableName}_collection ON ${this.schemaName}.${this.tableName} (collection);`; await db.query(idxs); // TODO add IVFFlat or HNSW indexing? return db; } - // isEmbeddingQuery?: boolean | undefined; - /** * Connects to the database specified in environment vars. * This method also checks and creates the vector extension, @@ -126,7 +136,7 @@ export class PGVectorStore implements VectorStore { * @returns The result of the delete query. 
*/ async clearCollection() { - const sql: string = `DELETE FROM ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} + const sql: string = `DELETE FROM ${this.schemaName}.${this.tableName} WHERE collection = $1`; const db = (await this.getDb()) as pg.Client; @@ -135,6 +145,29 @@ export class PGVectorStore implements VectorStore { return ret; } + private getDataToInsert(embeddingResults: BaseNode<Metadata>[]) { + const result = []; + for (let index = 0; index < embeddingResults.length; index++) { + const row = embeddingResults[index]; + + let id: any = row.id_.length ? row.id_ : null; + let meta = row.metadata || {}; + meta.create_date = new Date(); + + const params = [ + id, + "", + this.collection, + row.getContent(MetadataMode.EMBED), + meta, + "[" + row.getEmbedding().join(",") + "]", + ]; + + result.push(params); + } + return result; + } + /** * Adds vector record(s) to the table. * NOTE: Uses the collection property controlled by setCollection/getCollection. @@ -147,34 +180,20 @@ export class PGVectorStore implements VectorStore { return Promise.resolve([]); } - const sql: string = `INSERT INTO ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} + const sql: string = `INSERT INTO ${this.schemaName}.${this.tableName} (id, external_id, collection, document, metadata, embeddings) VALUES ($1, $2, $3, $4, $5, $6)`; const db = (await this.getDb()) as pg.Client; + const data = this.getDataToInsert(embeddingResults); let ret: string[] = []; - for (let index = 0; index < embeddingResults.length; index++) { - const row = embeddingResults[index]; - - let id: any = row.id_.length ? 
row.id_ : null; - let meta = row.metadata || {}; - meta.create_date = new Date(); - - const params = [ - id, - "", - this.collection, - row.getContent(MetadataMode.EMBED), - meta, - "[" + row.getEmbedding().join(",") + "]", - ]; - + for (let index = 0; index < data.length; index++) { + const params = data[index]; try { const result = await db.query(sql, params); - if (result.rows.length) { - id = result.rows[0].id as string; + const id = result.rows[0].id as string; ret.push(id); } } catch (err) { @@ -197,7 +216,7 @@ export class PGVectorStore implements VectorStore { const collectionCriteria = this.collection.length ? "AND collection = $2" : ""; - const sql: string = `DELETE FROM ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} + const sql: string = `DELETE FROM ${this.schemaName}.${this.tableName} WHERE id = $1 ${collectionCriteria}`; const db = (await this.getDb()) as pg.Client; @@ -230,7 +249,7 @@ export class PGVectorStore implements VectorStore { const sql = `SELECT v.*, embeddings <-> $1 s - FROM ${PGVECTOR_SCHEMA}.${PGVECTOR_TABLE} v + FROM ${this.schemaName}.${this.tableName} v ${where} ORDER BY s LIMIT ${max} diff --git a/packages/create-llama/questions.ts b/packages/create-llama/questions.ts index 72c1e0758..b144bfd70 100644 --- a/packages/create-llama/questions.ts +++ b/packages/create-llama/questions.ts @@ -1,9 +1,12 @@ import ciInfo from "ci-info"; +import fs from "fs"; +import path from "path"; import { blue, green } from "picocolors"; import prompts from "prompts"; import { InstallAppArgs } from "./create-app"; import { COMMUNITY_OWNER, COMMUNITY_REPO } from "./helpers/constant"; import { getRepoRootFolders } from "./helpers/repo"; +import { TemplateFramework } from "./templates"; export type QuestionArgs = Omit<InstallAppArgs, "appPath" | "packageManager">; @@ -26,6 +29,31 @@ const handlers = { }, }; +const getVectorDbChoices = (framework: TemplateFramework) => { + const choices = [ + { + title: "No, just store the data in the file system", + value: "none", + 
}, + { title: "MongoDB", value: "mongo" }, + { title: "PostgreSQL", value: "pg" }, + ]; + + const vectodbLang = framework === "fastapi" ? "python" : "typescript"; + const compPath = path.join(__dirname, "components"); + const vectordbPath = path.join(compPath, "vectordbs", vectodbLang); + + const availableChoices = fs + .readdirSync(vectordbPath) + .filter((file) => fs.statSync(path.join(vectordbPath, file)).isDirectory()); + + const displayedChoices = choices.filter((choice) => + availableChoices.includes(choice.value), + ); + + return displayedChoices; +}; + export const onPromptState = (state: any) => { if (state.aborted) { // If we don't re-enable the terminal cursor before exiting @@ -233,30 +261,23 @@ export const askQuestions = async ( program.engine = engine; preferences.engine = engine; } - } - - if (program.engine !== "simple" && !program.vectorDb) { - if (ciInfo.isCI) { - program.vectorDb = getPrefOrDefault("vectorDb"); - } else { - const { vectorDb } = await prompts( - { - type: "select", - name: "vectorDb", - message: "Would you like to use a vector database?", - choices: [ - { - title: "No, just store the data in the file system", - value: "none", - }, - { title: "MongoDB", value: "mongo" }, - ], - initial: 0, - }, - handlers, - ); - program.vectorDb = vectorDb; - preferences.vectorDb = vectorDb; + if (program.engine !== "simple" && !program.vectorDb) { + if (ciInfo.isCI) { + program.vectorDb = getPrefOrDefault("vectorDb"); + } else { + const { vectorDb } = await prompts( + { + type: "select", + name: "vectorDb", + message: "Would you like to use a vector database?", + choices: getVectorDbChoices(program.framework), + initial: 0, + }, + handlers, + ); + program.vectorDb = vectorDb; + preferences.vectorDb = vectorDb; + } } } diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs b/packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs new file mode 100644 index 000000000..3d959c698 --- 
/dev/null +++ b/packages/create-llama/templates/components/vectordbs/typescript/pg/generate.mjs @@ -0,0 +1,45 @@ +/* eslint-disable turbo/no-undeclared-env-vars */ +import * as dotenv from "dotenv"; +import { + PGVectorStore, + SimpleDirectoryReader, + VectorStoreIndex, + storageContextFromDefaults, +} from "llamaindex"; +import { + PGVECTOR_SCHEMA, + PGVECTOR_TABLE, + STORAGE_DIR, + checkRequiredEnvVars, +} from "./shared.mjs"; + +dotenv.config(); + +async function loadAndIndex() { + // load objects from storage and convert them into LlamaIndex Document objects + const documents = await new SimpleDirectoryReader().loadData({ + directoryPath: STORAGE_DIR, + }); + + // create postgres vector store + const vectorStore = new PGVectorStore({ + connectionString: process.env.PG_CONNECTION_STRING, + schemaName: PGVECTOR_SCHEMA, + tableName: PGVECTOR_TABLE, + }); + vectorStore.setCollection(STORAGE_DIR); + await vectorStore.clearCollection(); + + // create index from all the Documents + console.log("Start creating embeddings..."); + const storageContext = await storageContextFromDefaults({ vectorStore }); + await VectorStoreIndex.fromDocuments(documents, { storageContext }); + console.log(`Successfully created embeddings.`); +} + +(async () => { + checkRequiredEnvVars(); + await loadAndIndex(); + console.log("Finished generating storage."); + process.exit(0); +})(); diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts b/packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts new file mode 100644 index 000000000..96a98085a --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/typescript/pg/index.ts @@ -0,0 +1,39 @@ +/* eslint-disable turbo/no-undeclared-env-vars */ +import { + ContextChatEngine, + LLM, + PGVectorStore, + VectorStoreIndex, + serviceContextFromDefaults, +} from "llamaindex"; +import { + CHUNK_OVERLAP, + CHUNK_SIZE, + PGVECTOR_SCHEMA, + PGVECTOR_TABLE, + checkRequiredEnvVars, +} from 
"./shared.mjs"; + +async function getDataSource(llm: LLM) { + checkRequiredEnvVars(); + const pgvs = new PGVectorStore({ + connectionString: process.env.PG_CONNECTION_STRING, + schemaName: PGVECTOR_SCHEMA, + tableName: PGVECTOR_TABLE, + }); + const serviceContext = serviceContextFromDefaults({ + llm, + chunkSize: CHUNK_SIZE, + chunkOverlap: CHUNK_OVERLAP, + }); + return await VectorStoreIndex.fromVectorStore(pgvs, serviceContext); +} + +export async function createChatEngine(llm: LLM) { + const index = await getDataSource(llm); + const retriever = index.asRetriever({ similarityTopK: 5 }); + return new ContextChatEngine({ + chatModel: llm, + retriever, + }); +} diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs b/packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs new file mode 100644 index 000000000..8ad729c0a --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/typescript/pg/shared.mjs @@ -0,0 +1,24 @@ +export const STORAGE_DIR = "./data"; +export const CHUNK_SIZE = 512; +export const CHUNK_OVERLAP = 20; +export const PGVECTOR_SCHEMA = "public"; +export const PGVECTOR_TABLE = "llamaindex_embedding"; + +const REQUIRED_ENV_VARS = ["PG_CONNECTION_STRING", "OPENAI_API_KEY"]; + +export function checkRequiredEnvVars() { + const missingEnvVars = REQUIRED_ENV_VARS.filter((envVar) => { + return !process.env[envVar]; + }); + + if (missingEnvVars.length > 0) { + console.log( + `The following environment variables are required but missing: ${missingEnvVars.join( + ", ", + )}`, + ); + throw new Error( + `Missing environment variables: ${missingEnvVars.join(", ")}`, + ); + } +} diff --git a/packages/create-llama/templates/index.ts b/packages/create-llama/templates/index.ts index 973a65b85..97a3a4b5f 100644 --- a/packages/create-llama/templates/index.ts +++ b/packages/create-llama/templates/index.ts @@ -49,6 +49,11 @@ const createEnvLocalFile = async ( content += `MONGODB_VECTOR_INDEX=\n`; 
break; } + case "pg": { + content += `# For generating a connection URI, see https://docs.timescale.com/use-timescale/latest/services/create-a-service\n`; + content += `PG_CONNECTION_STRING=\n`; + break; + } } if (content) { diff --git a/packages/create-llama/templates/types.ts b/packages/create-llama/templates/types.ts index a0567e10e..e245c8644 100644 --- a/packages/create-llama/templates/types.ts +++ b/packages/create-llama/templates/types.ts @@ -4,7 +4,7 @@ export type TemplateType = "simple" | "streaming" | "community"; export type TemplateFramework = "nextjs" | "express" | "fastapi"; export type TemplateEngine = "simple" | "context"; export type TemplateUI = "html" | "shadcn"; -export type TemplateVectorDB = "none" | "mongo"; +export type TemplateVectorDB = "none" | "mongo" | "pg"; export interface InstallTemplateArgs { appName: string; diff --git a/packages/eslint-config-custom/index.js b/packages/eslint-config-custom/index.js index 37100f452..4383def9e 100644 --- a/packages/eslint-config-custom/index.js +++ b/packages/eslint-config-custom/index.js @@ -44,6 +44,8 @@ module.exports = { "NOTION_TOKEN", "MONGODB_URI", + "PG_CONNECTION_STRING", + "https_proxy", "npm_config_user_agent", "NEXT_PUBLIC_CHAT_API", -- GitLab