From 050cd53450ad7bafeff332a1a5c959a31c5edeb2 Mon Sep 17 00:00:00 2001 From: ANKIT VARSHNEY <132201033+AVtheking@users.noreply.github.com> Date: Tue, 18 Mar 2025 21:33:42 +0530 Subject: [PATCH] fix: delete by id in pinecone vector store (#1758) --- .changeset/itchy-seas-enjoy.md | 5 ++ .../tests/node-parser/text-splitter.test.ts | 1 + .../pinecone/src/PineconeVectorStore.ts | 49 +++++++++++++++++-- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 .changeset/itchy-seas-enjoy.md diff --git a/.changeset/itchy-seas-enjoy.md b/.changeset/itchy-seas-enjoy.md new file mode 100644 index 000000000..a97fec417 --- /dev/null +++ b/.changeset/itchy-seas-enjoy.md @@ -0,0 +1,5 @@ +--- +"@llamaindex/pinecone": minor +--- + +Fix deleting of document by id in PineconeVectorStore diff --git a/packages/core/tests/node-parser/text-splitter.test.ts b/packages/core/tests/node-parser/text-splitter.test.ts index 531896492..89e17fb57 100644 --- a/packages/core/tests/node-parser/text-splitter.test.ts +++ b/packages/core/tests/node-parser/text-splitter.test.ts @@ -126,6 +126,7 @@ describe("sentence splitter", () => { id_: docId, text: "This is a test sentence. This is another test sentence.", }); + const nodes = sentenceSplitter.getNodesFromDocuments([doc]); nodes.forEach((node) => { // test node id should match uuid regex diff --git a/packages/providers/storage/pinecone/src/PineconeVectorStore.ts b/packages/providers/storage/pinecone/src/PineconeVectorStore.ts index 83e335296..e32956838 100644 --- a/packages/providers/storage/pinecone/src/PineconeVectorStore.ts +++ b/packages/providers/storage/pinecone/src/PineconeVectorStore.ts @@ -117,7 +117,15 @@ export class PineconeVectorStore extends BaseVectorStore { } const idx: Index = await this.index(); - const nodes = embeddingResults.map(this.nodeToRecord); + const nodes = embeddingResults.map((node) => { + const nodeRecord = this.nodeToRecord(node); + + if (nodeRecord.metadata.ref_doc_id) { + // adding refDoc id as prefix to the chunk to find them using refDoc id + nodeRecord.id = `${nodeRecord.metadata.ref_doc_id}_chunk_${nodeRecord.id}`; + } + return nodeRecord; + }); for (let i = 0; i < nodes.length; i += this.chunkSize) { const chunk = nodes.slice(i, i + this.chunkSize); @@ -148,8 +156,43 @@ export class PineconeVectorStore extends BaseVectorStore { * @returns Promise that resolves if the delete query did not throw an error. */ async delete(refDocId: string, deleteKwargs?: object): Promise<void> { - const idx = await this.index(); - return idx.deleteOne(refDocId); + const [idx, index] = await Promise.all([ + this.index(), + //to get the information about the index + this.db?.describeIndex(this.indexName), + ]); + + if (index?.spec?.pod) { + //if the index is a pod, delete the document by the metadata + await idx.deleteMany({ + metadata: { + ref_doc_id: refDocId, + }, + }); + } else if (index?.spec?.serverless) { + // filtering on metadata is not supported in serverless indexes + // for serverless indexes, we can delete document by ID prefix + // ref:https://docs.pinecone.io/guides/data/delete-data#delete-records-by-metadata + // get the list of ids with the prefix (not supportered in non serverless indexes) + let list = await idx.listPaginated({ + prefix: refDocId, + }); + //do while loop to delete the document if there is no next paginationToken + do { + const ids = list?.vectors?.map((v) => v.id); + + if (ids && ids.length > 0) { + await idx.deleteMany(ids); + } + + if (list.pagination?.next) { + list = await idx.listPaginated({ + prefix: refDocId, + paginationToken: list.pagination?.next, + }); + } + } while (list.pagination?.next); + } } /** -- GitLab