diff --git a/.changeset/purple-camels-walk.md b/.changeset/purple-camels-walk.md new file mode 100644 index 0000000000000000000000000000000000000000..6d972ce3ee515aa0c38e491f48fc83c0f40f875d --- /dev/null +++ b/.changeset/purple-camels-walk.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Feat: Add support for Chroma DB as a vector store diff --git a/examples/astradb/load.ts b/examples/astradb/load.ts index 298ad75511ed89f1d01864311f4637acf1dfcd62..6c422fb145920e24563a7af5666285023023e1bc 100644 --- a/examples/astradb/load.ts +++ b/examples/astradb/load.ts @@ -10,7 +10,7 @@ const collectionName = "movie_reviews"; async function main() { try { const reader = new PapaCSVReader(false); - const docs = await reader.loadData("astradb/data/movie_reviews.csv"); + const docs = await reader.loadData("../data/movie_reviews.csv"); const astraVS = new AstraDBVectorStore(); await astraVS.create(collectionName, { diff --git a/examples/chromadb/README.md b/examples/chromadb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5b1c6d7b2810363c05e0b13af3120c6f05adbd66 --- /dev/null +++ b/examples/chromadb/README.md @@ -0,0 +1,13 @@ +# Chroma Vector Store Example + +How to run `examples/chromadb/test.ts`: + +Export your OpenAI API key using `export OPENAI_API_KEY=<your-api-key>`. + +If you haven't installed chromadb, run `pip install chromadb`. Start the server using `chroma run`. + +Now, open a new terminal window and inside `examples`, run `pnpx ts-node chromadb/test.ts`. 
+ +Here's the output for the input query `Tell me about Godfrey Cheshire's rating of La Sapienza.`: + +`Godfrey Cheshire gave La Sapienza a rating of 4 out of 4, describing it as fresh and the most astonishing and important movie to emerge from France in quite some time.` diff --git a/examples/chromadb/test.ts b/examples/chromadb/test.ts new file mode 100644 index 0000000000000000000000000000000000000000..51d2b2692f9025f12d37b027a4a915a82ae29787 --- /dev/null +++ b/examples/chromadb/test.ts @@ -0,0 +1,40 @@
+import {
+  ChromaVectorStore,
+  PapaCSVReader,
+  storageContextFromDefaults,
+  VectorStoreIndex,
+} from "llamaindex";
+
+const collectionName = "movie_reviews";
+
+/**
+ * Loads the movie-review CSV, indexes it through a ChromaVectorStore, runs
+ * one sample query and prints the answer. Any error is logged, not rethrown.
+ */
+async function main() {
+  const sourceFile = "./data/movie_reviews.csv";
+
+  try {
+    console.log(`Loading data from ${sourceFile}`);
+    const csvReader = new PapaCSVReader(false, ", ", "\n", { header: true });
+    const documents = await csvReader.loadData(sourceFile);
+
+    console.log("Creating ChromaDB vector store");
+    const vectorStore = new ChromaVectorStore({ collectionName });
+    const storageContext = await storageContextFromDefaults({ vectorStore });
+
+    console.log("Embedding documents and adding to index");
+    const index = await VectorStoreIndex.fromDocuments(documents, {
+      storageContext,
+    });
+
+    console.log("Querying index");
+    const question = "Tell me about Godfrey Cheshire's rating of La Sapienza.";
+    const answer = await index.asQueryEngine().query(question);
+    console.log(answer.toString());
+  } catch (error) {
+    console.error(error);
+  }
+}
+
+main();
diff --git a/examples/astradb/data/movie_reviews.csv b/examples/data/movie_reviews.csv similarity index 99% rename from examples/astradb/data/movie_reviews.csv rename to examples/data/movie_reviews.csv index d605bdbc0ae9345d56282507076849748b7460ee..eaebe6dc672fcaa58b45f608dd3b17c0a4c61843 100644 Binary files a/examples/astradb/data/movie_reviews.csv and b/examples/data/movie_reviews.csv differ diff --git a/examples/package.json 
b/examples/package.json index 07e116249f7359ed71125ffb58b0257833f982b3..465ea579d7a4baf6b8ceb71be4ebea2b24c05bf7 100644 --- a/examples/package.json +++ b/examples/package.json @@ -3,12 +3,13 @@ "private": true, "name": "examples", "dependencies": { - "@notionhq/client": "^2.2.13", "@datastax/astra-db-ts": "^0.1.2", + "@notionhq/client": "^2.2.13", "@pinecone-database/pinecone": "^1.1.2", + "chromadb": "^1.7.3", "commander": "^11.1.0", - "llamaindex": "latest", "dotenv": "^16.3.1", + "llamaindex": "latest", "mongodb": "^6.2.0" }, "devDependencies": { diff --git a/packages/core/package.json b/packages/core/package.json index f94497dfdda73c634b753101446e686af407f3d1..733474e9c928764707cb472358297d3c894652bb 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -10,6 +10,7 @@ "@pinecone-database/pinecone": "^1.1.2", "@xenova/transformers": "^2.10.0", "assemblyai": "^4.0.0", + "chromadb": "^1.7.3", "file-type": "^18.7.0", "js-tiktoken": "^1.0.8", "lodash": "^4.17.21", diff --git a/packages/core/src/storage/index.ts b/packages/core/src/storage/index.ts index 83cedd3894d69fc9648c693ef17afe772befa7ed..796f8fb9fd3e4f3032228a6335fd8366d855aaa1 100644 --- a/packages/core/src/storage/index.ts +++ b/packages/core/src/storage/index.ts @@ -8,6 +8,7 @@ export * from "./indexStore/types"; export { SimpleKVStore } from "./kvStore/SimpleKVStore"; export * from "./kvStore/types"; export { AstraDBVectorStore } from "./vectorStore/AstraDBVectorStore"; +export { ChromaVectorStore } from "./vectorStore/ChromaVectorStore"; export { MongoDBAtlasVectorSearch } from "./vectorStore/MongoDBAtlasVectorStore"; export { PGVectorStore } from "./vectorStore/PGVectorStore"; export { PineconeVectorStore } from "./vectorStore/PineconeVectorStore"; diff --git a/packages/core/src/storage/vectorStore/ChromaVectorStore.ts b/packages/core/src/storage/vectorStore/ChromaVectorStore.ts new file mode 100644 index 
0000000000000000000000000000000000000000..d6d8d52de11f880b2754fefd74cbfbb8ee6d5ee1 --- /dev/null +++ b/packages/core/src/storage/vectorStore/ChromaVectorStore.ts @@ -0,0 +1,189 @@
+import {
+  AddParams,
+  ChromaClient,
+  ChromaClientParams,
+  Collection,
+  IncludeEnum,
+  QueryResponse,
+  Where,
+  WhereDocument,
+} from "chromadb";
+import { BaseNode, MetadataMode } from "../../Node";
+import {
+  VectorStore,
+  VectorStoreQuery,
+  VectorStoreQueryMode,
+  VectorStoreQueryResult,
+} from "./types";
+import { metadataDictToNode, nodeToMetadata } from "./utils";
+
+type ChromaDeleteOptions = {
+  where?: Where;
+  whereDocument?: WhereDocument;
+};
+
+type ChromaQueryOptions = {
+  whereDocument?: WhereDocument;
+};
+
+const DEFAULT_TEXT_KEY = "text";
+
+/**
+ * `VectorStore` implementation that keeps nodes in one ChromaDB collection.
+ *
+ * The collection handle is fetched on first use and cached on the instance.
+ */
+export class ChromaVectorStore implements VectorStore {
+  storesText: boolean = true;
+  flatMetadata: boolean = true;
+  textKey: string;
+  private chromaClient: ChromaClient;
+  private collection: Collection | null = null;
+  private collectionName: string;
+
+  /**
+   * @param init.collectionName Name of the Chroma collection to read and write.
+   * @param init.textKey Text key handed to `nodeToMetadata`; defaults to "text".
+   * @param init.chromaClientParams Forwarded to the `ChromaClient` constructor.
+   */
+  constructor(init: {
+    collectionName: string;
+    textKey?: string;
+    chromaClientParams?: ChromaClientParams;
+  }) {
+    this.collectionName = init.collectionName;
+    this.chromaClient = new ChromaClient(init.chromaClientParams);
+    this.textKey = init.textKey ?? DEFAULT_TEXT_KEY;
+  }
+
+  /** Returns the underlying Chroma client. */
+  client(): ChromaClient {
+    return this.chromaClient;
+  }
+
+  /**
+   * Returns the collection named `collectionName`, resolving it on first call.
+   *
+   * `getOrCreateCollection` is used instead of `createCollection` so that a
+   * collection left behind by an earlier run is reused rather than making
+   * every later run fail because the name is already taken.
+   */
+  async getCollection(): Promise<Collection> {
+    if (!this.collection) {
+      this.collection = await this.chromaClient.getOrCreateCollection({
+        name: this.collectionName,
+      });
+    }
+    return this.collection;
+  }
+
+  /** Builds the `collection.add` payload: ids, embeddings, metadata and text. */
+  private getDataToInsert(nodes: BaseNode[]): AddParams {
+    const metadatas = nodes.map((node) =>
+      nodeToMetadata(node, true, this.textKey, this.flatMetadata),
+    );
+    return {
+      embeddings: nodes.map((node) => node.getEmbedding()),
+      ids: nodes.map((node) => node.id_),
+      metadatas,
+      documents: nodes.map((node) => node.getContent(MetadataMode.NONE)),
+    };
+  }
+
+  /**
+   * Adds the given nodes to the collection.
+   *
+   * @returns The ids of the added nodes in input order, or `[]` when `nodes`
+   *   is empty or missing.
+   */
+  async add(nodes: BaseNode[]): Promise<string[]> {
+    if (!nodes || nodes.length === 0) {
+      return [];
+    }
+
+    const dataToInsert = this.getDataToInsert(nodes);
+    const collection = await this.getCollection();
+    await collection.add(dataToInsert);
+    return nodes.map((node) => node.id_);
+  }
+
+  /**
+   * Deletes the record whose Chroma id equals `refDocId`, optionally narrowed
+   * by `where` / `whereDocument`.
+   *
+   * NOTE(review): `add` stores records under node ids, so this only matches
+   * when `refDocId` is a node id — confirm that is what callers pass.
+   */
+  async delete(
+    refDocId: string,
+    deleteOptions?: ChromaDeleteOptions,
+  ): Promise<void> {
+    const collection = await this.getCollection();
+    await collection.delete({
+      ids: [refDocId],
+      where: deleteOptions?.where,
+      whereDocument: deleteOptions?.whereDocument,
+    });
+  }
+
+  /**
+   * Runs a Chroma query and converts the hits back into nodes.
+   *
+   * @param query Query embedding/text, `similarityTopK` and optional metadata
+   *   filters; each filter becomes a `key: value` entry of Chroma's `where`.
+   * @param options Optional `whereDocument` filter passed through to Chroma.
+   * @returns The matching nodes, their ids and one similarity score per node.
+   * @throws Error if `query.docIds` is set or `query.mode` is not `DEFAULT`.
+   */
+  async query(
+    query: VectorStoreQuery,
+    options?: ChromaQueryOptions,
+  ): Promise<VectorStoreQueryResult> {
+    if (query.docIds) {
+      throw new Error("ChromaDB does not support querying by docIDs");
+    }
+    if (query.mode !== VectorStoreQueryMode.DEFAULT) {
+      throw new Error("ChromaDB does not support querying by mode");
+    }
+
+    const chromaWhere: { [x: string]: string | number | boolean } = {};
+    // Plain loop: the filters are only copied, no mapped array is needed.
+    for (const filter of query.filters?.filters ?? []) {
+      chromaWhere[filter.key] = filter.value;
+    }
+
+    const collection = await this.getCollection();
+    const queryResponse: QueryResponse = await collection.query({
+      queryEmbeddings: query.queryEmbedding ?? undefined,
+      queryTexts: query.queryStr ?? undefined,
+      nResults: query.similarityTopK,
+      where: Object.keys(chromaWhere).length ? chromaWhere : undefined,
+      whereDocument: options?.whereDocument,
+      //ChromaDB doesn't return the result embeddings by default so we need to include them
+      include: [
+        IncludeEnum.Distances,
+        IncludeEnum.Metadatas,
+        IncludeEnum.Documents,
+        IncludeEnum.Embeddings,
+      ],
+    });
+
+    // Only the first result list is read (index 0 of each response field).
+    const ids = queryResponse.ids[0];
+    const documents = (queryResponse.documents as string[][])[0];
+    const distances = (queryResponse.distances as number[][])[0];
+
+    return {
+      nodes: ids.map((_, index) => {
+        const metaData = queryResponse.metadatas[0][index] ?? {};
+        const node = metadataDictToNode(metaData);
+        node.setContent(documents[index]);
+        return node;
+      }),
+      // NOTE(review): `1 - distance` assumes distances fall within [0, 1];
+      // confirm against the distance metric configured for the collection.
+      similarities: distances.map((distance) => 1 - distance),
+      ids,
+    };
+  }
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cb1ee0bed7f48bf3419f5fbdeeb1188ff8184e30..214a3896d3a9e86066af1caa818c16d2684bddd6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -115,6 +115,9 @@ importers: '@pinecone-database/pinecone': specifier: ^1.1.2 version: 1.1.2 + chromadb: + specifier: ^1.7.3 + version: 1.7.3(openai@4.20.1) commander: specifier: ^11.1.0 version: 11.1.0 @@ -158,6 +161,9 @@ importers: assemblyai: specifier: ^4.0.0 version: 4.0.0 + chromadb: + specifier: ^1.7.3 + version: 1.7.3(openai@4.20.1) file-type: specifier: ^18.7.0 version: 18.7.0 @@ -6252,6 +6258,28 @@ packages: resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} engines: {node: '>=10'} + /chromadb@1.7.3(openai@4.20.1): + resolution: {integrity: sha512-3GgvQjpqgk5C89x5EuTDaXKbfrdqYDJ5UVyLQ3ZmwxnpetNc+HhRDGjkvXa5KSvpQ3lmKoyDoqnN4tZepfFkbw==} + engines: {node: '>=14.17.0'} + peerDependencies: + '@google/generative-ai': ^0.1.1 + cohere-ai: ^5.0.0 || ^6.0.0 || ^7.0.0 + openai: ^3.0.0 || ^4.0.0 + peerDependenciesMeta: + 
'@google/generative-ai': + optional: true + cohere-ai: + optional: true + openai: + optional: true + dependencies: + cliui: 8.0.1 + isomorphic-fetch: 3.0.0 + openai: 4.20.1 + transitivePeerDependencies: + - encoding + dev: false + /chrome-trace-event@1.0.3: resolution: {integrity: sha512-p3KULyQg4S7NIHixdwbGX+nFHkoBiA4YQmyWtjb8XngSKV124nJmRysgAeujbUVb15vh+RvFUfCPqU7rXk+hZg==} engines: {node: '>=6.0'} @@ -6356,7 +6384,6 @@ packages: string-width: 4.2.3 strip-ansi: 6.0.1 wrap-ansi: 7.0.0 - dev: true /clone-deep@4.0.1: resolution: {integrity: sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ==} @@ -10117,6 +10144,15 @@ packages: resolution: {integrity: sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==} engines: {node: '>=0.10.0'} + /isomorphic-fetch@3.0.0: + resolution: {integrity: sha512-qvUtwJ3j6qwsF3jLxkZ72qCgjMysPzDfeV240JHiGZsANBYd+EEuu35v7dfrJ9Up0Ak07D7GGSkGhCHTqg/5wA==} + dependencies: + node-fetch: 2.7.0(encoding@0.1.13) + whatwg-fetch: 3.6.20 + transitivePeerDependencies: + - encoding + dev: false + /isomorphic-timers-promises@1.0.1: resolution: {integrity: sha512-u4sej9B1LPSxTGKB/HiuzvEQnXH0ECYkSVQU39koSwmFAxhlEAFl9RdTvLv4TOTQUgBS5O3O5fwUxk6byBZ+IQ==} engines: {node: '>=10'} @@ -16377,6 +16413,10 @@ packages: engines: {node: '>=0.8.0'} dev: false + /whatwg-fetch@3.6.20: + resolution: {integrity: sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==} + dev: false + /whatwg-url@13.0.0: resolution: {integrity: sha512-9WWbymnqj57+XEuqADHrCJ2eSXzn8WXIW/YSGaZtb2WKAInQ6CHfaUUcTyyver0p8BDg5StLQq8h1vtZuwmOig==} engines: {node: '>=16'}