Skip to content
Snippets Groups Projects
Unverified Commit 3553f56f authored by Javier Torres's avatar Javier Torres Committed by GitHub
Browse files

Refactor kvdocstore delete methods (#12681)

* refactor docstore delete

* sanity check

* sanity check
parent 32e3c1f6
No related branches found
No related tags found
No related merge requests found
......@@ -374,90 +374,94 @@ class KVDocumentStore(BaseDocumentStore):
"""Check if document exists."""
return await self._kvstore.aget(doc_id, self._node_collection) is not None
def _remove_ref_doc_node(self, doc_id: str) -> None:
"""Helper function to remove node doc_id from ref_doc_collection."""
def _get_ref_doc_id(self, doc_id: str) -> Optional[str]:
"""Helper function to get ref_doc_info for a given doc_id."""
metadata = self._kvstore.get(doc_id, collection=self._metadata_collection)
if metadata is None:
return
return None
return metadata.get("ref_doc_id", None)
async def _aget_ref_doc_id(self, doc_id: str) -> Optional[str]:
"""Helper function to get ref_doc_info for a given doc_id."""
metadata = await self._kvstore.aget(
doc_id, collection=self._metadata_collection
)
if metadata is None:
return None
ref_doc_id = metadata.get("ref_doc_id", None)
return metadata.get("ref_doc_id", None)
def _remove_from_ref_doc_node(self, doc_id: str) -> None:
"""
Helper function to remove node doc_id from ref_doc_collection.
If ref_doc has no more doc_ids, delete it from the collection.
"""
ref_doc_id = self._get_ref_doc_id(doc_id)
if ref_doc_id is None:
return
ref_doc_info = self._kvstore.get(
ref_doc_id, collection=self._ref_doc_collection
)
if ref_doc_info is not None:
ref_doc_obj = RefDocInfo(**ref_doc_info)
if ref_doc_info is None:
return
ref_doc_obj = RefDocInfo(**ref_doc_info)
if doc_id in ref_doc_obj.node_ids: # sanity check
ref_doc_obj.node_ids.remove(doc_id)
# delete ref_doc from collection if it has no more doc_ids
if len(ref_doc_obj.node_ids) > 0:
self._kvstore.put(
ref_doc_id,
ref_doc_obj.to_dict(),
collection=self._ref_doc_collection,
)
# delete ref_doc from collection if it has no more doc_ids
if len(ref_doc_obj.node_ids) > 0:
self._kvstore.put(
ref_doc_id,
ref_doc_obj.to_dict(),
collection=self._ref_doc_collection,
)
else:
self._kvstore.delete(ref_doc_id, collection=self._metadata_collection)
self._kvstore.delete(ref_doc_id, collection=self._node_collection)
self._kvstore.delete(ref_doc_id, collection=self._ref_doc_collection)
async def _aremove_ref_doc_node(self, doc_id: str) -> None:
"""Helper function to remove node doc_id from ref_doc_collection."""
metadata = await self._kvstore.aget(
doc_id, collection=self._metadata_collection
)
if metadata is None:
return
ref_doc_id = metadata.get("ref_doc_id", None)
async def _aremove_from_ref_doc_node(self, doc_id: str) -> None:
"""
Helper function to remove node doc_id from ref_doc_collection.
If ref_doc has no more doc_ids, delete it from the collection.
"""
ref_doc_id = await self._aget_ref_doc_id(doc_id)
if ref_doc_id is None:
return
ref_doc_info = await self._kvstore.aget(
ref_doc_id, collection=self._ref_doc_collection
)
if ref_doc_info is not None:
ref_doc_obj = RefDocInfo(**ref_doc_info)
if ref_doc_info is None:
return
ref_doc_obj = RefDocInfo(**ref_doc_info)
if doc_id in ref_doc_obj.node_ids: # sanity check
ref_doc_obj.node_ids.remove(doc_id)
# delete ref_doc from collection if it has no more doc_ids
if len(ref_doc_obj.node_ids) > 0:
await self._kvstore.aput(
ref_doc_id,
ref_doc_obj.to_dict(),
collection=self._ref_doc_collection,
)
# delete ref_doc from collection if it has no more doc_ids
if len(ref_doc_obj.node_ids) > 0:
await self._kvstore.aput(
ref_doc_id,
ref_doc_obj.to_dict(),
collection=self._ref_doc_collection,
)
else:
await self._kvstore.adelete(
ref_doc_id, collection=self._metadata_collection
)
await self._kvstore.adelete(ref_doc_id, collection=self._node_collection)
await self._kvstore.adelete(ref_doc_id, collection=self._ref_doc_collection)
def delete_document(
self, doc_id: str, raise_error: bool = True, remove_ref_doc_node: bool = True
) -> None:
def delete_document(self, doc_id: str, raise_error: bool = True) -> None:
"""Delete a document from the store."""
if remove_ref_doc_node:
self._remove_ref_doc_node(doc_id)
self._remove_from_ref_doc_node(doc_id)
delete_success = self._kvstore.delete(doc_id, collection=self._node_collection)
_ = self._kvstore.delete(doc_id, collection=self._metadata_collection)
if not delete_success and raise_error:
raise ValueError(f"doc_id {doc_id} not found.")
async def adelete_document(
self, doc_id: str, raise_error: bool = True, remove_ref_doc_node: bool = True
) -> None:
async def adelete_document(self, doc_id: str, raise_error: bool = True) -> None:
"""Delete a document from the store."""
if remove_ref_doc_node:
await self._aremove_ref_doc_node(doc_id)
await self._aremove_from_ref_doc_node(doc_id)
delete_success = await self._kvstore.adelete(
doc_id, collection=self._node_collection
)
......@@ -475,11 +479,16 @@ class KVDocumentStore(BaseDocumentStore):
else:
return
for doc_id in ref_doc_info.node_ids:
self.delete_document(doc_id, raise_error=False, remove_ref_doc_node=False)
self._kvstore.delete(doc_id, collection=self._metadata_collection)
original_node_ids = (
ref_doc_info.node_ids.copy()
) # copy to avoid mutation during iteration
for doc_id in original_node_ids:
self.delete_document(doc_id, raise_error=False)
# Deleting all the nodes should already delete the ref_doc, but just to be sure
self._kvstore.delete(ref_doc_id, collection=self._ref_doc_collection)
self._kvstore.delete(ref_doc_id, collection=self._metadata_collection)
self._kvstore.delete(ref_doc_id, collection=self._node_collection)
async def adelete_ref_doc(self, ref_doc_id: str, raise_error: bool = True) -> None:
"""Delete a ref_doc and all it's associated nodes."""
......@@ -490,13 +499,16 @@ class KVDocumentStore(BaseDocumentStore):
else:
return
for doc_id in ref_doc_info.node_ids:
await self.adelete_document(
doc_id, raise_error=False, remove_ref_doc_node=False
)
await self._kvstore.adelete(doc_id, collection=self._metadata_collection)
original_node_ids = (
ref_doc_info.node_ids.copy()
) # copy to avoid mutation during iteration
for doc_id in original_node_ids:
await self.adelete_document(doc_id, raise_error=False)
# Deleting all the nodes should already delete the ref_doc, but just to be sure
await self._kvstore.adelete(ref_doc_id, collection=self._ref_doc_collection)
await self._kvstore.adelete(ref_doc_id, collection=self._metadata_collection)
await self._kvstore.adelete(ref_doc_id, collection=self._node_collection)
def set_document_hash(self, doc_id: str, doc_hash: str) -> None:
"""Set the hash for a given doc_id."""
......
......@@ -4,7 +4,7 @@
from pathlib import Path
import pytest
from llama_index.core.schema import Document, TextNode
from llama_index.core.schema import Document, TextNode, NodeRelationship
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.kvstore.simple_kvstore import SimpleKVStore
......@@ -62,3 +62,94 @@ def test_docstore_dict() -> None:
assert gd1 == doc
gd2 = new_docstore.get_document("d2")
assert gd2 == node
def test_docstore_delete_document() -> None:
doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
node = TextNode(text="my node", id_="d2", metadata={"node": "info"})
docstore = SimpleDocumentStore()
docstore.add_documents([doc, node])
docstore.delete_document("d1")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d2", docstore._node_collection) is not None
assert docstore._kvstore.get("d2", docstore._metadata_collection) is not None
def test_docstore_delete_ref_doc() -> None:
ref_doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
doc = Document(text="hello world", id_="d2", metadata={"foo": "bar"})
doc.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
node = TextNode(text="my node", id_="d3", metadata={"node": "info"})
node.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
docstore = SimpleDocumentStore()
docstore.add_documents([ref_doc, doc, node])
docstore.delete_ref_doc("d1")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d2", docstore._node_collection) is None
assert docstore._kvstore.get("d2", docstore._metadata_collection) is None
assert docstore._kvstore.get("d2", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d3", docstore._node_collection) is None
assert docstore._kvstore.get("d3", docstore._metadata_collection) is None
assert docstore._kvstore.get("d3", docstore._ref_doc_collection) is None
def test_docstore_delete_ref_doc_not_in_docstore() -> None:
ref_doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
doc = Document(text="hello world", id_="d2", metadata={"foo": "bar"})
doc.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
node = TextNode(text="my node", id_="d3", metadata={"node": "info"})
node.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
docstore = SimpleDocumentStore()
docstore.add_documents([doc, node])
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is not None
docstore.delete_ref_doc("d1")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d2", docstore._node_collection) is None
assert docstore._kvstore.get("d2", docstore._metadata_collection) is None
assert docstore._kvstore.get("d2", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d3", docstore._node_collection) is None
assert docstore._kvstore.get("d3", docstore._metadata_collection) is None
assert docstore._kvstore.get("d3", docstore._ref_doc_collection) is None
def test_docstore_delete_all_ref_doc_nodes() -> None:
ref_doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
doc = Document(text="hello world", id_="d2", metadata={"foo": "bar"})
doc.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
node = TextNode(text="my node", id_="d3", metadata={"node": "info"})
node.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
docstore = SimpleDocumentStore()
docstore.add_documents([ref_doc, doc, node])
assert docstore._kvstore.get("d1", docstore._ref_doc_collection)["node_ids"] == [
"d2",
"d3",
]
docstore.delete_document("d2")
assert docstore._kvstore.get("d1", docstore._node_collection) is not None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is not None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is not None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection)["node_ids"] == [
"d3"
]
docstore.delete_document("d3")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment