diff --git a/examples/pinecone-vector-store/README.md b/examples/pinecone-vector-store/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b0576bc26c4a6a8b4cff0a74150190f48acffe6e --- /dev/null +++ b/examples/pinecone-vector-store/README.md @@ -0,0 +1,33 @@ +# Pinecone Vector Store + +There are two scripts available here: `load-docs.ts` and `query.ts`. + +## Prerequisites + +You'll need a Pinecone account, project, and index. Pinecone does not allow automatic creation of indexes on the free plan, +so this vector store does not check for or create the index (unlike, e.g., the PGVectorStore). + +Set the **PINECONE_API_KEY** and **PINECONE_ENVIRONMENT** environment variables to match your specific values. You will likely also need to set **PINECONE_INDEX_NAME**, unless your +index uses the default name "llama". + +You'll also need a value for **OPENAI_API_KEY** in your environment. + +## Setup and Loading Docs + +Read and follow the instructions in the README.md file located one directory up to make sure your JS/TS dependencies are set up. The commands listed below are also run from that parent directory. + +To import documents and save the embedding vectors to your database: + +> `npx ts-node pinecone-vector-store/load-docs.ts data` + +where `data` is the directory containing your input files. Using the _data_ directory in the example above will read all of the files in that directory using the LlamaIndexTS default readers for each file type. + +**NOTE**: Sending text chunks as part of the Pinecone metadata means that upsert API calls can get arbitrarily large. Set the **PINECONE_CHUNK_SIZE** environment variable to a value smaller than the default of 100 if the load script fails. + +## RAG Querying + +To query using the resulting vector store: + +> `npx ts-node pinecone-vector-store/query.ts` + +The script will prompt for a question, then process and present the answer using the PineconeVectorStore data and your OpenAI API key. 
It will continue to prompt until you enter `q`, `quit` or `exit` as the next query. diff --git a/examples/pinecone-vector-store/load-docs.ts b/examples/pinecone-vector-store/load-docs.ts new file mode 100755 index 0000000000000000000000000000000000000000..518de54f386cb000f0f7d3e3df16f097bb820599 --- /dev/null +++ b/examples/pinecone-vector-store/load-docs.ts @@ -0,0 +1,66 @@ +// load-docs.ts +import fs from "fs/promises"; +import { + SimpleDirectoryReader, + storageContextFromDefaults, + VectorStoreIndex, +} from "llamaindex"; +import { PineconeVectorStore } from "llamaindex"; + +async function getSourceFilenames(sourceDir: string) { + return await fs + .readdir(sourceDir) + .then((fileNames) => fileNames.map((file) => sourceDir + "/" + file)); +} + +function callback( + category: string, + name: string, + status: any, + message: string = "", +): boolean { + console.log(category, name, status, message); + return true; +} + +async function main(args: any) { + const sourceDir: string = args.length > 2 ? args[2] : "../data"; + + console.log(`Finding documents in ${sourceDir}`); + const fileList = await getSourceFilenames(sourceDir); + const count = fileList.length; + console.log(`Found ${count} files`); + + console.log(`Importing contents from ${count} files in ${sourceDir}`); + var fileName = ""; + try { + // Passing callback fn to the ctor here + // will enable logging to console. + // See callback fn, defined above. 
+ const rdr = new SimpleDirectoryReader(callback); + const docs = await rdr.loadData({ directoryPath: sourceDir }); + + const pcvs = new PineconeVectorStore(); + + const ctx = await storageContextFromDefaults({ vectorStore: pcvs }); + + console.debug(" - creating vector store"); + const index = await VectorStoreIndex.fromDocuments(docs, { + storageContext: ctx, + }); + console.debug(" - done."); + } catch (err) { + console.error(fileName, err); + console.log( + "If your PineconeVectorStore connection failed, make sure to set env vars for PINECONE_API_KEY and PINECONE_ENVIRONMENT. If the upserts failed, try setting PINECONE_CHUNK_SIZE to limit the content sent per chunk", + ); + process.exit(1); + } + + console.log( + "Done. Try running query.ts to ask questions against the imported embeddings.", + ); + process.exit(0); +} + +main(process.argv).catch((err) => console.error(err)); diff --git a/examples/pinecone-vector-store/query.ts b/examples/pinecone-vector-store/query.ts new file mode 100755 index 0000000000000000000000000000000000000000..51b7465510cde06cd44f6a2e537e90c1e0843e64 --- /dev/null +++ b/examples/pinecone-vector-store/query.ts @@ -0,0 +1,65 @@ +import { VectorStoreIndex } from "llamaindex"; +import { serviceContextFromDefaults } from "llamaindex"; +import { PineconeVectorStore } from "llamaindex"; + +async function main() { + const readline = require("readline").createInterface({ + input: process.stdin, + output: process.stdout, + }); + + try { + const pcvs = new PineconeVectorStore(); + + const ctx = serviceContextFromDefaults(); + const index = await VectorStoreIndex.fromVectorStore(pcvs, ctx); + + // Build a query engine on top of the index + const queryEngine = await index.asQueryEngine(); + + let question = ""; + while (!isQuit(question)) { + question = await getUserInput(readline); + + if (isQuit(question)) { + readline.close(); + process.exit(0); + } + + try { + const answer = await queryEngine.query(question); + console.log(answer.response); + } catch (error) { 
+ console.error("Error:", error); + } + } + } catch (err) { + console.error(err); + console.log( + "If your PineconeVectorStore connection failed, make sure to set env vars for PINECONE_API_KEY and PINECONE_ENVIRONMENT.", + ); + process.exit(1); + } +} + +function isQuit(question: string) { + return ["q", "quit", "exit"].includes(question.trim().toLowerCase()); +} + +// Prompt on the given readline interface and resolve with the text the user enters +function getUserInput(readline: any): Promise<string> { + return new Promise((resolve) => { + readline.question( + "What would you like to know?\n>", + (userInput: string) => { + resolve(userInput); + }, + ); + }); +} + +main() + .catch(console.error) + .finally(() => { + process.exit(1); + }); diff --git a/packages/core/package.json b/packages/core/package.json index 881f4686d800fe5ac72464077730642ee4afc3b1..a1a1246226f11087847a0c5ca4dc33bbf8e6a621 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -7,6 +7,7 @@ "@datastax/astra-db-ts": "^0.1.2", "@mistralai/mistralai": "^0.0.7", "@notionhq/client": "^2.2.14", + "@pinecone-database/pinecone": "^1.1.2", "@xenova/transformers": "^2.10.0", "assemblyai": "^4.0.0", "compromise": "^14.10.1", diff --git a/packages/core/src/storage/index.ts b/packages/core/src/storage/index.ts index 764ad15fec94e5bdfbc169b6e273ba5e2b29e66b..cbd82e8ce8bc9a39f61c6bf245e20048a2a9e1c4 100644 --- a/packages/core/src/storage/index.ts +++ b/packages/core/src/storage/index.ts @@ -9,6 +9,7 @@ export { SimpleKVStore } from "./kvStore/SimpleKVStore"; export * from "./kvStore/types"; export { AstraDBVectorStore } from "./vectorStore/AstraDBVectorStore"; export { MongoDBAtlasVectorSearch } from "./vectorStore/MongoDBAtlasVectorStore"; +export { PineconeVectorStore } from "./vectorStore/PineconeVectorStore"; export { SimpleVectorStore } from "./vectorStore/SimpleVectorStore"; export { PGVectorStore } from "./vectorStore/PGVectorStore"; export * from "./vectorStore/types"; diff --git 
a/packages/core/src/storage/vectorStore/PineconeVectorStore.ts b/packages/core/src/storage/vectorStore/PineconeVectorStore.ts new file mode 100644 index 0000000000000000000000000000000000000000..ae424c050b1298ba77766ba62797e72bf2534b8c --- /dev/null +++ b/packages/core/src/storage/vectorStore/PineconeVectorStore.ts @@ -0,0 +1,227 @@ +import { VectorStore, + VectorStoreQuery, + VectorStoreQueryResult, + ExactMatchFilter, + MetadataFilters } from "./types"; + +import { BaseNode, Document, Metadata, MetadataMode } from "../../Node"; +import { GenericFileSystem } from "../FileSystem"; + +import { + FetchResponse, + Index, + Pinecone, + ScoredPineconeRecord, +} from "@pinecone-database/pinecone"; + +type PineconeParams = { + indexName?: string; + chunkSize?: number; +}; + +/** + * Provides support for writing and querying vector data in Pinecone. + */ +export class PineconeVectorStore implements VectorStore { + storesText: boolean = true; + + /* + FROM @pinecone-database/pinecone: + PINECONE_API_KEY="your_api_key" + PINECONE_ENVIRONMENT="your_environment" + Our addition: + PINECONE_INDEX_NAME="llama" + PINECONE_CHUNK_SIZE=100 + */ + db?: Pinecone; + indexName: string; + chunkSize: number; + + /** Resolves indexName and chunkSize from params, then environment variables, then the defaults "llama" and 100; a non-numeric or zero PINECONE_CHUNK_SIZE falls back to 100. */ + constructor(params?: PineconeParams) { + this.indexName = + params?.indexName ?? process.env.PINECONE_INDEX_NAME ?? "llama"; + this.chunkSize = + params?.chunkSize ?? + (Number.parseInt(process.env.PINECONE_CHUNK_SIZE ?? "100", 10) || 100); + } + + /** Creates the Pinecone client on first use and reuses it afterwards. */ + private async getDb(): Promise<Pinecone> { + if (!this.db) { + this.db = new Pinecone(); + } + + return this.db; + } + + /** + * Connects to the Pinecone account specified in environment vars. + * The named index is neither checked nor created here; it must already exist. + * @returns Promise resolving to the Pinecone client. 
+ */ + client() { + return this.getDb(); + } + + /** Returns the Pinecone Index handle for the configured index name. */ + async index() { + const db: Pinecone = await this.getDb(); + return db.index(this.indexName); + } + + /** + * Delete all records for the current index. + * NOTE: This operation is not supported by Pinecone for "Starter" (free) indexes. + * @returns The result of the delete query. + */ + async clearIndex() { + const db: Pinecone = await this.getDb(); + return await db.index(this.indexName).deleteAll(); + } + + /** + * Adds vector record(s) to the index, upserting them in chunks of at most chunkSize records. + * @TODO Does not create or insert sparse vectors. + * @param embeddingResults The Nodes to be inserted, optionally including metadata tuples. + * @returns Always an empty array, because the Pinecone client does not report the upserted IDs; rejects with an Error if any chunk fails to upsert. + */ + async add(embeddingResults: BaseNode<Metadata>[]): Promise<string[]> { + if (embeddingResults.length === 0) { + return []; + } + + const idx: Index = await this.index(); + const nodes = embeddingResults.map((node) => this.nodeToRecord(node)); + + /* Guard the step so a zero, negative or NaN chunkSize cannot stall the loop. */ + for (let i = 0, step = Math.max(1, Math.floor(this.chunkSize) || 1); i < nodes.length; i += step) { + const chunk = nodes.slice(i, i + step); + const result = await this.saveChunk(idx, chunk); + if (!result) { + throw new Error("Failed to upsert a chunk of records to Pinecone"); + } + } + return []; + } + + /** Upserts one chunk of records; logs the error and returns false on failure instead of throwing. */ + protected async saveChunk(idx: Index, chunk: any) { + try { + await idx.upsert(chunk); + return true; + } catch (err) { + const msg = `${err}`; + console.log(msg, err); + return false; + } + } + + /** + * Deletes a single record from the database by id. + * NOTE: refDocId is handed to the index's deleteOne call as the record id. + * @param refDocId Unique identifier for the record to delete. + * @param deleteKwargs Required by VectorStore interface. Currently ignored. + * @returns Promise that resolves if the delete query did not throw an error. 
+ */ + async delete(refDocId: string, deleteKwargs?: any): Promise<void> { + const idx = await this.index(); + return idx.deleteOne(refDocId); + } + + /** + * Query the vector store for the closest matching data to the query embeddings + * @TODO QUERY TYPES + * @param query The VectorStoreQuery to be used + * @param options Required by VectorStore interface. Currently ignored. + * @returns Zero or more Document instances with data from the vector store, in ranked order, with matching similarities and ids. + */ + async query( + query: VectorStoreQuery, + options?: any, + ): Promise<VectorStoreQueryResult> { + const filter = this.toPineconeFilter(query.filters); + + const queryOptions: any = { + vector: query.queryEmbedding, + topK: query.similarityTopK, + includeValues: true, + includeMetadata: true, + filter: filter + }; + + const idx = await this.index(); + const results = await idx.query(queryOptions); + + /* includeValues/includeMetadata make every match carry its vector and metadata, + so nodes are built straight from the matches and stay aligned with scores and ids. */ + const rows = results.matches ?? []; + + const nodes = rows.map((row) => { + return new Document({ + id_: row.id, + text: this.textFromResultRow(row), + metadata: this.metaWithoutText(row.metadata ?? {}), + embedding: row.values, + }); + }); + + const ret = { + nodes: nodes, + similarities: rows.map((row) => row.score ?? 999), + ids: rows.map((row) => row.id), + }; + + return ret; + } + + /** + * Required by VectorStore interface. Currently ignored. + * @param persistPath + * @param fs + * @returns Resolved Promise. + */ + persist( + persistPath: string, + fs?: GenericFileSystem | undefined, + ): Promise<void> { + return Promise.resolve(); + } + + /** Flattens exact-match filters into a plain { key: value } object; yields undefined when no filters are given. */ + toPineconeFilter(stdFilters?: MetadataFilters) { + return stdFilters?.filters?.reduce((carry: any, item: ExactMatchFilter) => { + carry[item.key] = item.value; + return carry; + }, {}); + } + + /** Returns the "text" metadata entry of a result row, or "" when it is absent. */ + textFromResultRow(row: ScoredPineconeRecord<Metadata>): string { + return row.metadata?.text ?? 
""; + } + + /** Returns a copy of the metadata without its "text" entry. */ + metaWithoutText(meta: Metadata): any { + return Object.keys(meta ?? {}) + .filter((key) => key !== "text") + .reduce((acc: any, key: string) => { + acc[key] = meta[key]; + return acc; + }, {}); + } + + /** Converts a node into a Pinecone record, copying its metadata so the node itself is not modified. */ + nodeToRecord(node: BaseNode<Metadata>) { + const id: any = node.id_.length ? node.id_ : null; + const meta: any = { ...(node.metadata ?? {}) }; + meta.create_date = new Date().toISOString(); + meta.text = node.getContent(MetadataMode.EMBED); + + return { + id: id, + values: node.getEmbedding(), + metadata: meta, + }; + } +} diff --git a/packages/eslint-config-custom/index.js b/packages/eslint-config-custom/index.js index d46a122e9c09d53301912adb4e6872cf6ac40025..3fc7c9ef1b9650762f599a9c261acb6346bce91a 100644 --- a/packages/eslint-config-custom/index.js +++ b/packages/eslint-config-custom/index.js @@ -28,6 +28,8 @@ module.exports = { "PINECONE_ENVIRONMENT", "PINECONE_PROJECT_ID", "PINECONE_INDEX_NAME", + "PINECONE_CHUNK_SIZE", + "PINECONE_INDEX_NAME", "AZURE_OPENAI_API_KEY", "AZURE_OPENAI_API_INSTANCE_NAME",