From 515f2c1e3df2d96228a7d540b0237b01c8b03d74 Mon Sep 17 00:00:00 2001
From: crisjy <cjy1994116@163.com>
Date: Fri, 29 Nov 2024 09:53:09 +0800
Subject: [PATCH] feat: add AzureCosmosDBMongoVectorStore (#1528)

---
 .changeset/stale-parents-perform.md           |   5 +
 .../AzureCosmosDBMongoVectorStore.ts          | 328 ++++++++++++++++++
 packages/llamaindex/src/vector-store/index.ts |   1 +
 3 files changed, 334 insertions(+)
 create mode 100644 .changeset/stale-parents-perform.md
 create mode 100644 packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts

diff --git a/.changeset/stale-parents-perform.md b/.changeset/stale-parents-perform.md
new file mode 100644
index 000000000..6c9195aa7
--- /dev/null
+++ b/.changeset/stale-parents-perform.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Add vector store for Azure Cosmos DB for MongoDB vCore
diff --git a/packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts b/packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts
new file mode 100644
index 000000000..051bdac40
--- /dev/null
+++ b/packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts
@@ -0,0 +1,328 @@
+import type { BaseNode } from "@llamaindex/core/schema";
+import { MetadataMode } from "@llamaindex/core/schema";
+import { getEnv } from "@llamaindex/env";
+import { Collection, Db, MongoClient } from "mongodb";
+import {
+  BaseVectorStore,
+  type VectorStoreBaseParams,
+  type VectorStoreQuery,
+  type VectorStoreQueryResult,
+} from "./types.js";
+import { metadataDictToNode, nodeToMetadata } from "./utils.js";
+
+/** Similarity metric identifiers for Azure Cosmos DB for MongoDB vCore vector indexes. */
+export const AzureCosmosDBMongoDBSimilarityType = {
+  /** Cosine similarity */
+  COS: "COS",
+  /** Inner product */
+  IP: "IP",
+  /** Euclidean distance */
+  L2: "L2",
+} as const;
+
+/** Union of the values of the `AzureCosmosDBMongoDBSimilarityType` object ("COS" | "IP" | "L2"). */
+export type AzureCosmosDBMongoDBSimilarityType =
+  (typeof AzureCosmosDBMongoDBSimilarityType)[keyof typeof AzureCosmosDBMongoDBSimilarityType];
+
+/** Azure Cosmos DB for MongoDB vCore index options; every field is optional. */
+export type AzureCosmosDBMongoDBIndexOptions = {
+  readonly indexType?: "ivf" | "hnsw" | "diskann" | undefined; // index algorithm
+  /** Number of clusters that the inverted file (IVF) index uses to group the vector data. */
+  readonly numLists?: number | undefined;
+  /** Number of dimensions for vector similarity. */
+  readonly dimensions?: number | undefined;
+  /** Similarity metric to use with the index. */
+  readonly similarity?: AzureCosmosDBMongoDBSimilarityType | undefined;
+  /** The max number of connections per layer with the HNSW index. */
+  readonly m?: number | undefined;
+  /** The size of the dynamic candidate list for constructing the graph with the HNSW index. */
+  readonly efConstruction?: number | undefined;
+  /** Max number of neighbors with the DiskANN index. */
+  readonly maxDegree?: number | undefined;
+  /** L value for index building with the DiskANN index. */
+  readonly lBuild?: number | undefined;
+  /** L value for index searching with the DiskANN index. */
+  readonly lSearch?: number | undefined;
+};
+
+/**
+ * Azure Cosmos DB for MongoDB vCore vector store.
+ * To use this, you should have both:
+ * - the `mongodb` NPM package installed
+ * - a connection string associated with a MongoDB VCore Cluster
+ *
+ * You do not need to create a database or collection, it will be created
+ * automatically.
+ *
+ * You also need a vector index on the collection; none is created
+ * automatically, so call the `createIndex` method to create one.
+ */
+export class AzureCosmosDBMongoDBVectorStore extends BaseVectorStore {
+  storesText: boolean = true;
+  flatMetadata: boolean = true;
+
+  dbName: string;
+
+  collectionName: string;
+
+  indexedMetadataFields: string[];
+
+  /**
+   * The MongoClient in use. If not given, a new MongoClient is created from the AZURE_COSMOSDB_MONGODB_CONNECTION_STRING env variable.
+   */
+  mongodbClient: MongoClient;
+
+  indexName: string;
+
+  embeddingKey: string;
+
+  idKey: string;
+
+  textKey: string;
+
+  metadataKey: string;
+
+  indexOptions: AzureCosmosDBMongoDBIndexOptions;
+
+  private collection?: Collection;
+
+  private database: Db;
+
+  constructor(
+    init: Partial<AzureCosmosDBMongoDBVectorStore> & {
+      dbName: string;
+      collectionName: string;
+      indexedMetadataFields?: string[];
+    } & VectorStoreBaseParams,
+  ) {
+    super(init);
+    this.indexName = init.indexName ?? "vectorSearchIndex";
+    this.embeddingKey = init.embeddingKey ?? "vectorContent";
+    this.idKey = init.idKey ?? "id";
+    this.textKey = init.textKey ?? "text";
+    this.metadataKey = init.metadataKey ?? "metadata";
+    this.indexOptions = init.indexOptions ?? {};
+    this.indexedMetadataFields = init.indexedMetadataFields ?? [];
+    this.collectionName = init.collectionName ?? "documents";
+    this.dbName = init.dbName ?? "documentsDB";
+    // Reuse an injected client; otherwise build one from the environment.
+    let client = init.mongodbClient;
+    if (!client) {
+      const uri = getEnv("AZURE_COSMOSDB_MONGODB_CONNECTION_STRING");
+      if (!uri) {
+        throw new Error(
+          "AzureCosmosDBMongoDBVectorStore client or connection string must be set.",
+        );
+      }
+      client = new MongoClient(uri);
+    }
+    this.mongodbClient = client;
+    this.database = client.db(this.dbName);
+  }
+
+  client() {
+    return this.mongodbClient; // the store's current MongoClient instance
+  }
+
+  /** Returns the cached collection handle, creating the collection on first use. */
+  async ensureCollection() {
+    if (this.collection) {
+      return this.collection;
+    }
+
+    const db = this.mongodbClient.db(this.dbName);
+    const created = await db.createCollection(this.collectionName);
+    this.collection = created;
+    return created;
+  }
+
+  /** Inserts one document per node and returns the inserted ids as strings. */
+  async add(nodes: BaseNode[]): Promise<string[]> {
+    if (!nodes?.length) {
+      return [];
+    }
+
+    const documents = nodes.map((node) => {
+      const meta = nodeToMetadata(
+        node,
+        true,
+        this.textKey,
+        this.flatMetadata,
+      );
+      const doc: Record<string, unknown> = {
+        [this.idKey]: node.id_,
+        [this.embeddingKey]: node.getEmbedding(),
+        [this.textKey]: node.getContent(MetadataMode.NONE) || "",
+        [this.metadataKey]: meta,
+      };
+
+      // Mirror the indexed metadata fields at the top level of the document
+      // (to help filter); applied last, so they win on a key collision.
+      this.indexedMetadataFields.forEach((field) => {
+        doc[field] = meta[field];
+      });
+      return doc;
+    });
+
+    const target = await this.ensureCollection();
+    const { insertedIds } = await target.insertMany(documents);
+    return Object.values(insertedIds).map(String);
+  }
+
+  /**
+   * Removes the documents whose id field (`idKey`) equals the given id.
+   * @param id Node id to remove, as stored under `idKey` by `add`.
+   * @param deleteOptions Options forwarded unchanged to `deleteMany`.
+   * @returns A promise that resolves when the documents have been removed.
+   */
+  async delete(id: string, deleteOptions?: object): Promise<void> {
+    const collection = await this.ensureCollection();
+    await collection.deleteMany(
+      // Filter on the configured key; a literal "id" misses a custom idKey.
+      { [this.idKey]: id },
+      deleteOptions,
+    );
+  }
+
+  /**
+   * Vector search via `cosmosSearch`; returns nodes, ids and similarity scores.
+   * @param query Supplies `queryEmbedding` and `similarityTopK` (default 4).
+   * @param options Currently unused.
+   */
+  async query(
+    query: VectorStoreQuery,
+    options?: object,
+  ): Promise<VectorStoreQueryResult> {
+    const pipeline = [
+      {
+        $search: {
+          cosmosSearch: {
+            vector: query.queryEmbedding,
+            path: this.embeddingKey,
+            k: query.similarityTopK ?? 4,
+          },
+          returnStoredSource: true,
+        },
+      },
+      // $search alone yields the raw documents without a score, so project
+      // the "searchScore" metadata next to the stored document.
+      {
+        $project: {
+          similarityScore: { $meta: "searchScore" },
+          document: "$$ROOT",
+        },
+      },
+    ];
+
+    const collection = await this.ensureCollection();
+    const nodes: BaseNode[] = [];
+    const ids: string[] = [];
+    const similarities: number[] = [];
+
+    for await (const res of collection.aggregate(pipeline)) {
+      const stored = res.document;
+      const node = metadataDictToNode(stored[this.metadataKey]);
+      node.setContent(stored[this.textKey]);
+      ids.push(stored[this.idKey]);
+      nodes.push(node);
+      similarities.push(res.similarityScore);
+    }
+
+    return { nodes, similarities, ids };
+  }
+
+  /**
+   * Creates the vector index named `indexName` (set at construction) on the
+   * embedding field of the collection.
+   *
+   * Tuning values (`numLists`, `m`, `efConstruction`, `maxDegree`, `lBuild`,
+   * `lSearch`) are read from `indexOptions`, with defaults when unset.
+   *
+   * Setting the numLists parameter correctly is important for achieving good
+   * accuracy and performance with an IVF index: create the index only after
+   * you have loaded a large enough sample of documents to ensure that the
+   * centroids for the respective buckets are fairly distributed.
+   *
+   * @param dimensions Number of dimensions for vector similarity.
+   *    NOTE: 2000 is cited as the maximum supported; confirm for your index type.
+   *    Defaults to 1536 when omitted.
+   * @param indexType Index type for the Mongo vCore index; defaults to "ivf".
+   * @param similarity Similarity metric to use with the index.
+   *    Possible options are:
+   *    - AzureCosmosDBMongoDBSimilarityType.COS (cosine distance, default)
+   *    - AzureCosmosDBMongoDBSimilarityType.L2 (Euclidean distance)
+   *    - AzureCosmosDBMongoDBSimilarityType.IP (inner product)
+   * @returns A promise that resolves when the index has been created.
+   */
+  async createIndex(
+    dimensions: number | undefined = undefined,
+    indexType: "ivf" | "hnsw" | "diskann" = "ivf",
+    similarity: AzureCosmosDBMongoDBSimilarityType = AzureCosmosDBMongoDBSimilarityType.COS,
+  ): Promise<void> {
+    // `this.collection` stays unset until ensureCollection() has run, so resolve
+    // it here instead of sending `createIndexes: undefined` to the server.
+    const collection = await this.ensureCollection();
+
+    const vectorLength = dimensions ?? 1536;
+
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const cosmosSearchOptions: any = {
+      kind: "",
+      similarity,
+      dimensions: vectorLength,
+    };
+
+    if (indexType === "hnsw") {
+      cosmosSearchOptions.kind = "vector-hnsw";
+      cosmosSearchOptions.m = this.indexOptions.m ?? 16;
+      cosmosSearchOptions.efConstruction =
+        this.indexOptions.efConstruction ?? 200;
+    } else if (indexType === "diskann") {
+      cosmosSearchOptions.kind = "vector-diskann";
+      cosmosSearchOptions.maxDegree = this.indexOptions.maxDegree ?? 40;
+      cosmosSearchOptions.lBuild = this.indexOptions.lBuild ?? 50;
+      cosmosSearchOptions.lSearch = this.indexOptions.lSearch ?? 40;
+    } else {
+      // Default to IVF index
+      cosmosSearchOptions.kind = "vector-ivf";
+      cosmosSearchOptions.numLists = this.indexOptions.numLists ?? 100;
+    }
+
+    const createIndexCommands = {
+      createIndexes: collection.collectionName,
+      indexes: [
+        {
+          name: this.indexName,
+          key: { [this.embeddingKey]: "cosmosSearch" },
+          cosmosSearchOptions,
+        },
+      ],
+    };
+
+    await this.database.command(createIndexCommands);
+  }
+
+  /**
+   * Reports whether the collection already has an index whose name matches
+   * the `indexName` given at construction.
+   * @returns A promise that resolves to true when such an index is listed.
+   */
+  async checkIndexExists(): Promise<boolean> {
+    const collection = await this.ensureCollection();
+    const existing = await collection.listIndexes().toArray();
+    return existing.map((index) => index.name).includes(this.indexName);
+  }
+
+  /**
+   * Drops the index with the given name from the collection, if it exists.
+   * @param indexName Name of the index to delete.
+   * @returns A promise that resolves when the index has been deleted.
+   */
+  async deleteIndex(indexName: string): Promise<void> {
+    const collection = await this.ensureCollection();
+    const existing = await collection.listIndexes().toArray();
+    if (existing.some((index) => index.name === indexName)) {
+      await collection.dropIndex(indexName);
+    }
+  }
+}
diff --git a/packages/llamaindex/src/vector-store/index.ts b/packages/llamaindex/src/vector-store/index.ts
index 90eac71d0..dc64a4d56 100644
--- a/packages/llamaindex/src/vector-store/index.ts
+++ b/packages/llamaindex/src/vector-store/index.ts
@@ -1,4 +1,5 @@
 export * from "./AstraDBVectorStore.js";
+export * from "./AzureCosmosDBMongoVectorStore.js";
 export * from "./AzureCosmosDBNoSqlVectorStore.js";
 export * from "./ChromaVectorStore.js";
 export * from "./MilvusVectorStore.js";
-- 
GitLab