From 515f2c1e3df2d96228a7d540b0237b01c8b03d74 Mon Sep 17 00:00:00 2001
From: crisjy <cjy1994116@163.com>
Date: Fri, 29 Nov 2024 09:53:09 +0800
Subject: [PATCH] feat: add AzureCosmosDBMongoVectorStore (#1528)

---
Review notes: the AzureCosmosDBMongoVectorStore.ts hunk was revised during
review, so its `index` blob line is omitted (the recorded hash no longer
matches the content).

 .changeset/stale-parents-perform.md           |   5 +
 .../AzureCosmosDBMongoVectorStore.ts          | 375 ++++++++++++++++++
 packages/llamaindex/src/vector-store/index.ts |   1 +
 3 files changed, 381 insertions(+)
 create mode 100644 .changeset/stale-parents-perform.md
 create mode 100644 packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts

diff --git a/.changeset/stale-parents-perform.md b/.changeset/stale-parents-perform.md
new file mode 100644
index 000000000..6c9195aa7
--- /dev/null
+++ b/.changeset/stale-parents-perform.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Add vector store for CosmosDB
diff --git a/packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts b/packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts
new file mode 100644
--- /dev/null
+++ b/packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts
@@ -0,0 +1,375 @@
+import type { BaseNode } from "@llamaindex/core/schema";
+import { MetadataMode } from "@llamaindex/core/schema";
+import { getEnv } from "@llamaindex/env";
+import { Collection, Db, MongoClient } from "mongodb";
+import {
+  BaseVectorStore,
+  type VectorStoreBaseParams,
+  type VectorStoreQuery,
+  type VectorStoreQueryResult,
+} from "./types.js";
+import { metadataDictToNode, nodeToMetadata } from "./utils.js";
+
+/** Azure Cosmos DB for MongoDB vCore Similarity type. */
+export const AzureCosmosDBMongoDBSimilarityType = {
+  /** Cosine similarity */
+  COS: "COS",
+  /** Inner - product */
+  IP: "IP",
+  /** Euclidean distance */
+  L2: "L2",
+} as const;
+
+/** Azure Cosmos DB for MongoDB vCore Similarity type. */
+export type AzureCosmosDBMongoDBSimilarityType =
+  (typeof AzureCosmosDBMongoDBSimilarityType)[keyof typeof AzureCosmosDBMongoDBSimilarityType];
+
+/** Azure Cosmos DB for MongoDB vCore Index Options. */
+export type AzureCosmosDBMongoDBIndexOptions = {
+  readonly indexType?: "ivf" | "hnsw" | "diskann" | undefined;
+  /** Number of clusters that the inverted file (IVF) index uses to group the vector data. */
+  readonly numLists?: number | undefined;
+  /** Number of dimensions for vector similarity. */
+  readonly dimensions?: number | undefined;
+  /** Similarity metric to use with the IVF index. */
+  readonly similarity?: AzureCosmosDBMongoDBSimilarityType | undefined;
+  /** The max number of connections per layer with the HNSW index. */
+  readonly m?: number | undefined;
+  /** The size of the dynamic candidate list for constructing the graph with the HNSW index. */
+  readonly efConstruction?: number | undefined;
+  /** Max number of neighbors with the DiskANN index. */
+  readonly maxDegree?: number | undefined;
+  /** L value for index building with the DiskANN index. */
+  readonly lBuild?: number | undefined;
+  /** L value for index searching with the DiskANN index. */
+  readonly lSearch?: number | undefined;
+};
+
+/** MongoDB server error code reported when a collection already exists. */
+const NAMESPACE_EXISTS_ERROR_CODE = 48;
+
+/** Tells whether `error` carries the `NamespaceExists` server error code. */
+function isNamespaceExistsError(error: unknown): boolean {
+  return (
+    typeof error === "object" &&
+    error !== null &&
+    "code" in error &&
+    error.code === NAMESPACE_EXISTS_ERROR_CODE
+  );
+}
+
+/**
+ * Azure Cosmos DB for MongoDB vCore vector store.
+ * To use this, you should have both:
+ * - the `mongodb` NPM package installed
+ * - a connection string associated with a MongoDB VCore Cluster
+ *
+ * You do not need to create a database or collection, it will be created
+ * automatically.
+ *
+ * You also need a vector index on the collection. It is not created
+ * automatically: call the `createIndex` method to create it.
+ */
+export class AzureCosmosDBMongoDBVectorStore extends BaseVectorStore {
+  storesText: boolean = true;
+  flatMetadata: boolean = true;
+
+  dbName: string;
+
+  collectionName: string;
+
+  indexedMetadataFields: string[];
+
+  /**
+   * The MongoClient in use. If not given, a new MongoClient is created from
+   * the AZURE_COSMOSDB_MONGODB_CONNECTION_STRING env variable.
+   */
+  mongodbClient: MongoClient;
+
+  indexName: string;
+
+  embeddingKey: string;
+
+  idKey: string;
+
+  textKey: string;
+
+  metadataKey: string;
+
+  indexOptions: AzureCosmosDBMongoDBIndexOptions;
+
+  private collection?: Collection;
+
+  private database: Db;
+
+  /**
+   * @param init Store settings. Every field except `mongodbClient` falls back
+   * to a default when omitted; when no `mongodbClient` is passed, one is built
+   * from the AZURE_COSMOSDB_MONGODB_CONNECTION_STRING env variable.
+   * @throws Error if neither a client nor a connection string is available.
+   */
+  constructor(
+    init: Partial<AzureCosmosDBMongoDBVectorStore> & {
+      dbName: string;
+      collectionName: string;
+      indexedMetadataFields?: string[];
+    } & VectorStoreBaseParams,
+  ) {
+    super(init);
+    if (init.mongodbClient) {
+      this.mongodbClient = init.mongodbClient;
+    } else {
+      const mongoUri = getEnv("AZURE_COSMOSDB_MONGODB_CONNECTION_STRING");
+      if (!mongoUri) {
+        throw new Error(
+          "AzureCosmosDBMongoDBVectorStore client or connection string must be set.",
+        );
+      }
+      this.mongodbClient = new MongoClient(mongoUri);
+    }
+
+    this.dbName = init.dbName ?? "documentsDB";
+    this.collectionName = init.collectionName ?? "documents";
+    this.indexedMetadataFields = init.indexedMetadataFields ?? [];
+    this.indexName = init.indexName ?? "vectorSearchIndex";
+    this.embeddingKey = init.embeddingKey ?? "vectorContent";
+    this.idKey = init.idKey ?? "id";
+    this.textKey = init.textKey ?? "text";
+    this.metadataKey = init.metadataKey ?? "metadata";
+    this.indexOptions = init.indexOptions ?? {};
+    this.database = this.mongodbClient.db(this.dbName);
+  }
+
+  /** Returns the MongoClient used by this store. */
+  client() {
+    return this.mongodbClient;
+  }
+
+  /**
+   * Returns the backing collection, creating it on first use and caching it
+   * for later calls.
+   *
+   * If the server rejects the creation with error code 48
+   * (`NamespaceExists`), the already existing collection is used instead.
+   */
+  async ensureCollection() {
+    if (!this.collection) {
+      const db = this.mongodbClient.db(this.dbName);
+      let collection: Collection;
+      try {
+        collection = await db.createCollection(this.collectionName);
+      } catch (error: unknown) {
+        if (!isNamespaceExistsError(error)) {
+          throw error;
+        }
+        collection = db.collection(this.collectionName);
+      }
+
+      this.collection = collection;
+    }
+
+    return this.collection;
+  }
+
+  /**
+   * Inserts the given nodes as documents holding the node id, embedding,
+   * text and metadata. Metadata fields listed in `indexedMetadataFields` are
+   * also copied to the top level of each document (to help filter).
+   * @param nodes Nodes to store.
+   * @returns The inserted ids reported by the driver, as strings, or an
+   * empty array when there is nothing to insert.
+   */
+  async add(nodes: BaseNode[]): Promise<string[]> {
+    if (!nodes || nodes.length === 0) {
+      return [];
+    }
+
+    const dataToInsert = nodes.map((node) => {
+      const metadata = nodeToMetadata(
+        node,
+        true,
+        this.textKey,
+        this.flatMetadata,
+      );
+
+      // Include the specified metadata fields in the top level of the document (to help filter)
+      const populatedMetadata: Record<string, unknown> = {};
+      for (const field of this.indexedMetadataFields) {
+        populatedMetadata[field] = metadata[field];
+      }
+
+      return {
+        [this.idKey]: node.id_,
+        [this.embeddingKey]: node.getEmbedding(),
+        [this.textKey]: node.getContent(MetadataMode.NONE) || "",
+        [this.metadataKey]: metadata,
+        ...populatedMetadata,
+      };
+    });
+
+    const collection = await this.ensureCollection();
+    const insertResult = await collection.insertMany(dataToInsert);
+    return Object.values(insertResult.insertedIds).map((id) => String(id));
+  }
+
+  /**
+   * Removes the documents whose `idKey` field equals the given id.
+   * @param id Value to match against the stored `idKey` field.
+   * @param deleteOptions Options forwarded to the driver's `deleteMany`.
+   * @returns A promise that resolves when the documents have been removed.
+   */
+  async delete(id: string, deleteOptions?: object): Promise<void> {
+    const collection = await this.ensureCollection();
+    await collection.deleteMany({ [this.idKey]: id }, deleteOptions);
+  }
+
+  /**
+   * Runs a `cosmosSearch` vector search for the query embedding.
+   * @param query Query holding the embedding and the number of results
+   * wanted (`similarityTopK`, 4 when unset).
+   * @param options Currently unused.
+   * @returns The matching nodes with their ids and similarity scores.
+   */
+  async query(
+    query: VectorStoreQuery,
+    options?: object,
+  ): Promise<VectorStoreQueryResult> {
+    const pipeline = [
+      {
+        $search: {
+          cosmosSearch: {
+            vector: query.queryEmbedding,
+            path: this.embeddingKey,
+            k: query.similarityTopK ?? 4,
+          },
+          returnStoredSource: true,
+        },
+      },
+      {
+        // The score is only part of the results when projected explicitly.
+        $project: {
+          similarityScore: { $meta: "searchScore" },
+          document: "$$ROOT",
+        },
+      },
+    ];
+
+    const collection = await this.ensureCollection();
+    const cursor = collection.aggregate(pipeline);
+
+    const nodes: BaseNode[] = [];
+    const ids: string[] = [];
+    const similarities: number[] = [];
+
+    for await (const res of cursor) {
+      const { similarityScore, document } = res;
+
+      const node = metadataDictToNode(document[this.metadataKey]);
+      node.setContent(document[this.textKey]);
+
+      ids.push(document[this.idKey]);
+      nodes.push(node);
+      similarities.push(similarityScore);
+    }
+
+    return { nodes, similarities, ids };
+  }
+
+  /**
+   * Creates the vector index named `indexName` on the collection, resolving
+   * the collection first (see `ensureCollection`).
+   *
+   * Setting the numLists parameter correctly is important for achieving good
+   * accuracy and performance.
+   * When IVF is used as the indexing strategy, you should create the index
+   * only after you have loaded a large enough sample of documents to ensure
+   * that the centroids for the respective buckets are fairly distributed.
+   *
+   * Tuning values (numLists, m, efConstruction, maxDegree, lBuild, lSearch)
+   * are read from `indexOptions`.
+   *
+   * @param dimensions Number of dimensions for vector similarity.
+   *    The maximum number of supported dimensions is 2000.
+   *    If no number is provided, `indexOptions.dimensions` is used, or 1536
+   *    when that is not set either.
+   * @param indexType Index Type for Mongo vCore index.
+   * @param similarity Similarity metric to use with the index.
+   *    Possible options are:
+   *    - AzureCosmosDBMongoDBSimilarityType.COS (cosine distance)
+   *    - AzureCosmosDBMongoDBSimilarityType.L2 (Euclidean distance)
+   *    - AzureCosmosDBMongoDBSimilarityType.IP (inner product)
+   * @returns A promise that resolves when the index has been created.
+   */
+  async createIndex(
+    dimensions: number | undefined = undefined,
+    indexType: "ivf" | "hnsw" | "diskann" = "ivf",
+    similarity: AzureCosmosDBMongoDBSimilarityType = AzureCosmosDBMongoDBSimilarityType.COS,
+  ): Promise<void> {
+    // The `createIndexes` command needs the collection name, so resolve the
+    // collection before building the command.
+    const collection = await this.ensureCollection();
+    const vectorLength = dimensions ?? this.indexOptions.dimensions ?? 1536;
+
+    let kind: string;
+    let tuning: Record<string, number>;
+    if (indexType === "hnsw") {
+      kind = "vector-hnsw";
+      tuning = {
+        m: this.indexOptions.m ?? 16,
+        efConstruction: this.indexOptions.efConstruction ?? 200,
+      };
+    } else if (indexType === "diskann") {
+      kind = "vector-diskann";
+      tuning = {
+        maxDegree: this.indexOptions.maxDegree ?? 40,
+        lBuild: this.indexOptions.lBuild ?? 50,
+        lSearch: this.indexOptions.lSearch ?? 40,
+      };
+    } else {
+      // Default to IVF index
+      kind = "vector-ivf";
+      tuning = { numLists: this.indexOptions.numLists ?? 100 };
+    }
+
+    await this.database.command({
+      createIndexes: collection.collectionName,
+      indexes: [
+        {
+          name: this.indexName,
+          key: { [this.embeddingKey]: "cosmosSearch" },
+          cosmosSearchOptions: {
+            kind,
+            similarity,
+            dimensions: vectorLength,
+            ...tuning,
+          },
+        },
+      ],
+    });
+  }
+
+  /**
+   * Checks if the specified index name during instance construction exists
+   * on the collection.
+   * @returns A promise that resolves to a boolean indicating if the index exists.
+   */
+  async checkIndexExists(): Promise<boolean> {
+    const collection = await this.ensureCollection();
+    const indexes = await collection.listIndexes().toArray();
+    return indexes.some((index) => index.name === this.indexName);
+  }
+
+  /**
+   * Deletes an index from the collection if it exists.
+   * @param indexName Name of the index to delete. Defaults to the index name
+   * specified during instance construction.
+   * @returns A promise that resolves when the index has been deleted.
+   */
+  async deleteIndex(indexName: string = this.indexName): Promise<void> {
+    const collection = await this.ensureCollection();
+    const indexes = await collection.listIndexes().toArray();
+    if (indexes.some((index) => index.name === indexName)) {
+      await collection.dropIndex(indexName);
+    }
+  }
+}
diff --git a/packages/llamaindex/src/vector-store/index.ts b/packages/llamaindex/src/vector-store/index.ts
index 90eac71d0..dc64a4d56 100644
--- a/packages/llamaindex/src/vector-store/index.ts
+++ b/packages/llamaindex/src/vector-store/index.ts
@@ -1,4 +1,5 @@
 export * from "./AstraDBVectorStore.js";
+export * from "./AzureCosmosDBMongoVectorStore.js";
 export * from "./AzureCosmosDBNoSqlVectorStore.js";
 export * from "./ChromaVectorStore.js";
 export * from "./MilvusVectorStore.js";
-- 
GitLab