diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js index d1eec9f7116b646729846929381fe1026f5aefe1..3dac303feb49ba439b26fa0c9cc76d99ef6d63a1 100644 --- a/server/utils/vectorDbProviders/milvus/index.js +++ b/server/utils/vectorDbProviders/milvus/index.js @@ -15,6 +15,12 @@ const { const Milvus = { name: "Milvus", + // Milvus/Zilliz only allows letters, numbers, and underscores in collection names + // so we need to enforce that by re-normalizing the names when communicating with + // the DB. + normalize: function (inputString) { + return inputString.replace(/[^a-zA-Z0-9_]/g, "_"); + }, connect: async function () { if (process.env.VECTOR_DB !== "milvus") throw new Error("Milvus::Invalid ENV settings"); @@ -42,7 +48,7 @@ const Milvus = { const { collection_names } = await client.listCollections(); const total = collection_names.reduce(async (acc, collection_name) => { const statistics = await client.getCollectionStatistics({ - collection_name, + collection_name: this.normalize(collection_name), }); return Number(acc) + Number(statistics?.data?.row_count ?? 0); }, 0); @@ -51,14 +57,14 @@ const Milvus = { namespaceCount: async function (_namespace = null) { const { client } = await this.connect(); const statistics = await client.getCollectionStatistics({ - collection_name: _namespace, + collection_name: this.normalize(_namespace), }); return Number(statistics?.data?.row_count ?? 0); }, namespace: async function (client, namespace = null) { if (!namespace) throw new Error("No namespace value provided."); const collection = await client - .getCollectionStatistics({ collection_name: namespace }) + .getCollectionStatistics({ collection_name: this.normalize(namespace) }) .catch(() => null); return collection; }, @@ -70,7 +76,7 @@ const Milvus = { namespaceExists: async function (client, namespace = null) { if (!namespace) throw new Error("No namespace value provided."); const { value } = await client - .hasCollection({ collection_name: namespace }) + .hasCollection({ collection_name: this.normalize(namespace) }) .catch((e) => { console.error("MilvusDB::namespaceExists", e.message); return { value: false }; @@ -78,7 +84,7 @@ const Milvus = { return value; }, deleteVectorsInNamespace: async function (client, namespace = null) { - await client.dropCollection({ collection_name: namespace }); + await client.dropCollection({ collection_name: this.normalize(namespace) }); return true; }, // Milvus requires a dimension aspect for collection creation @@ -93,7 +99,7 @@ const Milvus = { ); await client.createCollection({ - collection_name: namespace, + collection_name: this.normalize(namespace), fields: [ { name: "id", @@ -116,13 +122,13 @@ const Milvus = { ], }); await client.createIndex({ - collection_name: namespace, + collection_name: this.normalize(namespace), field_name: "vector", index_type: IndexType.AUTOINDEX, metric_type: MetricType.COSINE, }); await client.loadCollectionSync({ - collection_name: namespace, + collection_name: this.normalize(namespace), }); } }, @@ -155,7 +161,7 @@ const Milvus = { return { id, vector: chunk.values, metadata: chunk.metadata }; }); const insertResult = await client.insert({ - collection_name: namespace, + collection_name: this.normalize(namespace), data: newChunks, }); @@ -166,7 +172,9 @@ const Milvus = { } } await DocumentVectors.bulkInsert(documentVectors); - await client.flushSync({ collection_names: [namespace] }); + await client.flushSync({ + collection_names: [this.normalize(namespace)], + }); return { vectorized: true, error: null }; } @@ -212,7 +220,7 @@ const Milvus = { for (const chunk of toChunks(vectors, 100)) { chunks.push(chunk); const insertResult = await client.insert({ - collection_name: namespace, + collection_name: this.normalize(namespace), data: chunk.map((item) => ({ id: item.id, vector: item.values, @@ -227,7 +235,9 @@ const Milvus = { } } await storeVectorResult(chunks, fullFilePath); - await client.flushSync({ collection_names: [namespace] }); + await client.flushSync({ + collection_names: [this.normalize(namespace)], + }); } await DocumentVectors.bulkInsert(documentVectors); @@ -247,7 +257,7 @@ const Milvus = { const vectorIds = knownDocuments.map((doc) => doc.vectorId); const queryIn = vectorIds.map((v) => `'${v}'`).join(","); await client.deleteEntities({ - collection_name: namespace, + collection_name: this.normalize(namespace), expr: `id in [${queryIn}]`, }); @@ -257,7 +267,7 @@ const Milvus = { // Even after flushing Milvus can take some time to re-calc the count // so all we can hope to do is flushSync so that the count can be correct // on a later call. - await client.flushSync({ collection_names: [namespace] }); + await client.flushSync({ collection_names: [this.normalize(namespace)] }); return true; }, performSimilaritySearch: async function ({ @@ -310,7 +320,7 @@ const Milvus = { scores: [], }; const response = await client.search({ - collection_name: namespace, + collection_name: this.normalize(namespace), vectors: queryVector, limit: topN, }); diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js index 159ea809475449f84f104e22759c07fc4854865a..9b1d5a221ea676e4f080223494c6b5ed07bfe052 100644 --- a/server/utils/vectorDbProviders/zilliz/index.js +++ b/server/utils/vectorDbProviders/zilliz/index.js @@ -17,6 +17,12 @@ const { // to connect to the cloud const Zilliz = { name: "Zilliz", + // Milvus/Zilliz only allows letters, numbers, and underscores in collection names + // so we need to enforce that by re-normalizing the names when communicating with + // the DB. + normalize: function (inputString) { + return inputString.replace(/[^a-zA-Z0-9_]/g, "_"); + }, connect: async function () { if (process.env.VECTOR_DB !== "zilliz") throw new Error("Zilliz::Invalid ENV settings"); @@ -43,7 +49,7 @@ const Zilliz = { const { collection_names } = await client.listCollections(); const total = collection_names.reduce(async (acc, collection_name) => { const statistics = await client.getCollectionStatistics({ - collection_name, + collection_name: this.normalize(collection_name), }); return Number(acc) + Number(statistics?.data?.row_count ?? 0); }, 0); @@ -52,14 +58,14 @@ const Zilliz = { namespaceCount: async function (_namespace = null) { const { client } = await this.connect(); const statistics = await client.getCollectionStatistics({ - collection_name: _namespace, + collection_name: this.normalize(_namespace), }); return Number(statistics?.data?.row_count ?? 0); }, namespace: async function (client, namespace = null) { if (!namespace) throw new Error("No namespace value provided."); const collection = await client - .getCollectionStatistics({ collection_name: namespace }) + .getCollectionStatistics({ collection_name: this.normalize(namespace) }) .catch(() => null); return collection; }, @@ -71,7 +77,7 @@ const Zilliz = { namespaceExists: async function (client, namespace = null) { if (!namespace) throw new Error("No namespace value provided."); const { value } = await client - .hasCollection({ collection_name: namespace }) + .hasCollection({ collection_name: this.normalize(namespace) }) .catch((e) => { console.error("Zilliz::namespaceExists", e.message); return { value: false }; @@ -79,7 +85,7 @@ const Zilliz = { return value; }, deleteVectorsInNamespace: async function (client, namespace = null) { - await client.dropCollection({ collection_name: namespace }); + await client.dropCollection({ collection_name: this.normalize(namespace) }); return true; }, // Zilliz requires a dimension aspect for collection creation @@ -94,7 +100,7 @@ const Zilliz = { ); await client.createCollection({ - collection_name: namespace, + collection_name: this.normalize(namespace), fields: [ { name: "id", @@ -117,13 +123,13 @@ const Zilliz = { ], }); await client.createIndex({ - collection_name: namespace, + collection_name: this.normalize(namespace), field_name: "vector", index_type: IndexType.AUTOINDEX, metric_type: MetricType.COSINE, }); await client.loadCollectionSync({ - collection_name: namespace, + collection_name: this.normalize(namespace), }); } }, @@ -156,7 +162,7 @@ const Zilliz = { return { id, vector: chunk.values, metadata: chunk.metadata }; }); const insertResult = await client.insert({ - collection_name: namespace, + collection_name: this.normalize(namespace), data: newChunks, }); @@ -167,7 +173,9 @@ const Zilliz = { } } await DocumentVectors.bulkInsert(documentVectors); - await client.flushSync({ collection_names: [namespace] }); + await client.flushSync({ + collection_names: [this.normalize(namespace)], + }); return { vectorized: true, error: null }; } @@ -213,7 +221,7 @@ const Zilliz = { for (const chunk of toChunks(vectors, 100)) { chunks.push(chunk); const insertResult = await client.insert({ - collection_name: namespace, + collection_name: this.normalize(namespace), data: chunk.map((item) => ({ id: item.id, vector: item.values, @@ -228,7 +236,9 @@ const Zilliz = { } } await storeVectorResult(chunks, fullFilePath); - await client.flushSync({ collection_names: [namespace] }); + await client.flushSync({ + collection_names: [this.normalize(namespace)], + }); } await DocumentVectors.bulkInsert(documentVectors); @@ -248,7 +258,7 @@ const Zilliz = { const vectorIds = knownDocuments.map((doc) => doc.vectorId); const queryIn = vectorIds.map((v) => `'${v}'`).join(","); await client.deleteEntities({ - collection_name: namespace, + collection_name: this.normalize(namespace), expr: `id in [${queryIn}]`, }); @@ -258,7 +268,7 @@ const Zilliz = { // Even after flushing Zilliz can take some time to re-calc the count // so all we can hope to do is flushSync so that the count can be correct // on a later call. - await client.flushSync({ collection_names: [namespace] }); + await client.flushSync({ collection_names: [this.normalize(namespace)] }); return true; }, performSimilaritySearch: async function ({ @@ -311,7 +321,7 @@ const Zilliz = { scores: [], }; const response = await client.search({ - collection_name: namespace, + collection_name: this.normalize(namespace), vectors: queryVector, limit: topN, });