Skip to content
Snippets Groups Projects
Unverified Commit 3553f56f authored by Javier Torres's avatar Javier Torres Committed by GitHub
Browse files

Refactor kvdocstore delete methods (#12681)

* refactor docstore delete

* sanity check

* sanity check
parent 32e3c1f6
No related branches found
No related tags found
No related merge requests found
...@@ -374,90 +374,94 @@ class KVDocumentStore(BaseDocumentStore): ...@@ -374,90 +374,94 @@ class KVDocumentStore(BaseDocumentStore):
"""Check if document exists.""" """Check if document exists."""
return await self._kvstore.aget(doc_id, self._node_collection) is not None return await self._kvstore.aget(doc_id, self._node_collection) is not None
def _remove_ref_doc_node(self, doc_id: str) -> None: def _get_ref_doc_id(self, doc_id: str) -> Optional[str]:
"""Helper function to remove node doc_id from ref_doc_collection.""" """Helper function to get ref_doc_info for a given doc_id."""
metadata = self._kvstore.get(doc_id, collection=self._metadata_collection) metadata = self._kvstore.get(doc_id, collection=self._metadata_collection)
if metadata is None: if metadata is None:
return return None
return metadata.get("ref_doc_id", None)
async def _aget_ref_doc_id(self, doc_id: str) -> Optional[str]:
"""Helper function to get ref_doc_info for a given doc_id."""
metadata = await self._kvstore.aget(
doc_id, collection=self._metadata_collection
)
if metadata is None:
return None
ref_doc_id = metadata.get("ref_doc_id", None) return metadata.get("ref_doc_id", None)
def _remove_from_ref_doc_node(self, doc_id: str) -> None:
"""
Helper function to remove node doc_id from ref_doc_collection.
If ref_doc has no more doc_ids, delete it from the collection.
"""
ref_doc_id = self._get_ref_doc_id(doc_id)
if ref_doc_id is None: if ref_doc_id is None:
return return
ref_doc_info = self._kvstore.get( ref_doc_info = self._kvstore.get(
ref_doc_id, collection=self._ref_doc_collection ref_doc_id, collection=self._ref_doc_collection
) )
if ref_doc_info is None:
if ref_doc_info is not None: return
ref_doc_obj = RefDocInfo(**ref_doc_info) ref_doc_obj = RefDocInfo(**ref_doc_info)
if doc_id in ref_doc_obj.node_ids: # sanity check
ref_doc_obj.node_ids.remove(doc_id) ref_doc_obj.node_ids.remove(doc_id)
# delete ref_doc from collection if it has no more doc_ids
# delete ref_doc from collection if it has no more doc_ids if len(ref_doc_obj.node_ids) > 0:
if len(ref_doc_obj.node_ids) > 0: self._kvstore.put(
self._kvstore.put( ref_doc_id,
ref_doc_id, ref_doc_obj.to_dict(),
ref_doc_obj.to_dict(), collection=self._ref_doc_collection,
collection=self._ref_doc_collection, )
) else:
self._kvstore.delete(ref_doc_id, collection=self._metadata_collection) self._kvstore.delete(ref_doc_id, collection=self._metadata_collection)
self._kvstore.delete(ref_doc_id, collection=self._node_collection)
self._kvstore.delete(ref_doc_id, collection=self._ref_doc_collection)
async def _aremove_ref_doc_node(self, doc_id: str) -> None: async def _aremove_from_ref_doc_node(self, doc_id: str) -> None:
"""Helper function to remove node doc_id from ref_doc_collection.""" """
metadata = await self._kvstore.aget( Helper function to remove node doc_id from ref_doc_collection.
doc_id, collection=self._metadata_collection If ref_doc has no more doc_ids, delete it from the collection.
) """
if metadata is None: ref_doc_id = await self._aget_ref_doc_id(doc_id)
return
ref_doc_id = metadata.get("ref_doc_id", None)
if ref_doc_id is None: if ref_doc_id is None:
return return
ref_doc_info = await self._kvstore.aget( ref_doc_info = await self._kvstore.aget(
ref_doc_id, collection=self._ref_doc_collection ref_doc_id, collection=self._ref_doc_collection
) )
if ref_doc_info is None:
if ref_doc_info is not None: return
ref_doc_obj = RefDocInfo(**ref_doc_info) ref_doc_obj = RefDocInfo(**ref_doc_info)
if doc_id in ref_doc_obj.node_ids: # sanity check
ref_doc_obj.node_ids.remove(doc_id) ref_doc_obj.node_ids.remove(doc_id)
# delete ref_doc from collection if it has no more doc_ids
# delete ref_doc from collection if it has no more doc_ids if len(ref_doc_obj.node_ids) > 0:
if len(ref_doc_obj.node_ids) > 0: await self._kvstore.aput(
await self._kvstore.aput( ref_doc_id,
ref_doc_id, ref_doc_obj.to_dict(),
ref_doc_obj.to_dict(), collection=self._ref_doc_collection,
collection=self._ref_doc_collection, )
) else:
await self._kvstore.adelete( await self._kvstore.adelete(
ref_doc_id, collection=self._metadata_collection ref_doc_id, collection=self._metadata_collection
) )
await self._kvstore.adelete(ref_doc_id, collection=self._node_collection)
await self._kvstore.adelete(ref_doc_id, collection=self._ref_doc_collection)
def delete_document( def delete_document(self, doc_id: str, raise_error: bool = True) -> None:
self, doc_id: str, raise_error: bool = True, remove_ref_doc_node: bool = True
) -> None:
"""Delete a document from the store.""" """Delete a document from the store."""
if remove_ref_doc_node: self._remove_from_ref_doc_node(doc_id)
self._remove_ref_doc_node(doc_id)
delete_success = self._kvstore.delete(doc_id, collection=self._node_collection) delete_success = self._kvstore.delete(doc_id, collection=self._node_collection)
_ = self._kvstore.delete(doc_id, collection=self._metadata_collection) _ = self._kvstore.delete(doc_id, collection=self._metadata_collection)
if not delete_success and raise_error: if not delete_success and raise_error:
raise ValueError(f"doc_id {doc_id} not found.") raise ValueError(f"doc_id {doc_id} not found.")
async def adelete_document( async def adelete_document(self, doc_id: str, raise_error: bool = True) -> None:
self, doc_id: str, raise_error: bool = True, remove_ref_doc_node: bool = True
) -> None:
"""Delete a document from the store.""" """Delete a document from the store."""
if remove_ref_doc_node: await self._aremove_from_ref_doc_node(doc_id)
await self._aremove_ref_doc_node(doc_id)
delete_success = await self._kvstore.adelete( delete_success = await self._kvstore.adelete(
doc_id, collection=self._node_collection doc_id, collection=self._node_collection
) )
...@@ -475,11 +479,16 @@ class KVDocumentStore(BaseDocumentStore): ...@@ -475,11 +479,16 @@ class KVDocumentStore(BaseDocumentStore):
else: else:
return return
for doc_id in ref_doc_info.node_ids: original_node_ids = (
self.delete_document(doc_id, raise_error=False, remove_ref_doc_node=False) ref_doc_info.node_ids.copy()
self._kvstore.delete(doc_id, collection=self._metadata_collection) ) # copy to avoid mutation during iteration
for doc_id in original_node_ids:
self.delete_document(doc_id, raise_error=False)
# Deleting all the nodes should already delete the ref_doc, but just to be sure
self._kvstore.delete(ref_doc_id, collection=self._ref_doc_collection) self._kvstore.delete(ref_doc_id, collection=self._ref_doc_collection)
self._kvstore.delete(ref_doc_id, collection=self._metadata_collection)
self._kvstore.delete(ref_doc_id, collection=self._node_collection)
async def adelete_ref_doc(self, ref_doc_id: str, raise_error: bool = True) -> None: async def adelete_ref_doc(self, ref_doc_id: str, raise_error: bool = True) -> None:
"""Delete a ref_doc and all it's associated nodes.""" """Delete a ref_doc and all it's associated nodes."""
...@@ -490,13 +499,16 @@ class KVDocumentStore(BaseDocumentStore): ...@@ -490,13 +499,16 @@ class KVDocumentStore(BaseDocumentStore):
else: else:
return return
for doc_id in ref_doc_info.node_ids: original_node_ids = (
await self.adelete_document( ref_doc_info.node_ids.copy()
doc_id, raise_error=False, remove_ref_doc_node=False ) # copy to avoid mutation during iteration
) for doc_id in original_node_ids:
await self._kvstore.adelete(doc_id, collection=self._metadata_collection) await self.adelete_document(doc_id, raise_error=False)
# Deleting all the nodes should already delete the ref_doc, but just to be sure
await self._kvstore.adelete(ref_doc_id, collection=self._ref_doc_collection) await self._kvstore.adelete(ref_doc_id, collection=self._ref_doc_collection)
await self._kvstore.adelete(ref_doc_id, collection=self._metadata_collection)
await self._kvstore.adelete(ref_doc_id, collection=self._node_collection)
def set_document_hash(self, doc_id: str, doc_hash: str) -> None: def set_document_hash(self, doc_id: str, doc_hash: str) -> None:
"""Set the hash for a given doc_id.""" """Set the hash for a given doc_id."""
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
from pathlib import Path from pathlib import Path
import pytest import pytest
from llama_index.core.schema import Document, TextNode from llama_index.core.schema import Document, TextNode, NodeRelationship
from llama_index.core.storage.docstore import SimpleDocumentStore from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.kvstore.simple_kvstore import SimpleKVStore from llama_index.core.storage.kvstore.simple_kvstore import SimpleKVStore
...@@ -62,3 +62,94 @@ def test_docstore_dict() -> None: ...@@ -62,3 +62,94 @@ def test_docstore_dict() -> None:
assert gd1 == doc assert gd1 == doc
gd2 = new_docstore.get_document("d2") gd2 = new_docstore.get_document("d2")
assert gd2 == node assert gd2 == node
def test_docstore_delete_document() -> None:
doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
node = TextNode(text="my node", id_="d2", metadata={"node": "info"})
docstore = SimpleDocumentStore()
docstore.add_documents([doc, node])
docstore.delete_document("d1")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d2", docstore._node_collection) is not None
assert docstore._kvstore.get("d2", docstore._metadata_collection) is not None
def test_docstore_delete_ref_doc() -> None:
ref_doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
doc = Document(text="hello world", id_="d2", metadata={"foo": "bar"})
doc.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
node = TextNode(text="my node", id_="d3", metadata={"node": "info"})
node.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
docstore = SimpleDocumentStore()
docstore.add_documents([ref_doc, doc, node])
docstore.delete_ref_doc("d1")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d2", docstore._node_collection) is None
assert docstore._kvstore.get("d2", docstore._metadata_collection) is None
assert docstore._kvstore.get("d2", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d3", docstore._node_collection) is None
assert docstore._kvstore.get("d3", docstore._metadata_collection) is None
assert docstore._kvstore.get("d3", docstore._ref_doc_collection) is None
def test_docstore_delete_ref_doc_not_in_docstore() -> None:
ref_doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
doc = Document(text="hello world", id_="d2", metadata={"foo": "bar"})
doc.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
node = TextNode(text="my node", id_="d3", metadata={"node": "info"})
node.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
docstore = SimpleDocumentStore()
docstore.add_documents([doc, node])
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is not None
docstore.delete_ref_doc("d1")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d2", docstore._node_collection) is None
assert docstore._kvstore.get("d2", docstore._metadata_collection) is None
assert docstore._kvstore.get("d2", docstore._ref_doc_collection) is None
assert docstore._kvstore.get("d3", docstore._node_collection) is None
assert docstore._kvstore.get("d3", docstore._metadata_collection) is None
assert docstore._kvstore.get("d3", docstore._ref_doc_collection) is None
def test_docstore_delete_all_ref_doc_nodes() -> None:
ref_doc = Document(text="hello world", id_="d1", metadata={"foo": "bar"})
doc = Document(text="hello world", id_="d2", metadata={"foo": "bar"})
doc.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
node = TextNode(text="my node", id_="d3", metadata={"node": "info"})
node.relationships[NodeRelationship.SOURCE] = ref_doc.as_related_node_info()
docstore = SimpleDocumentStore()
docstore.add_documents([ref_doc, doc, node])
assert docstore._kvstore.get("d1", docstore._ref_doc_collection)["node_ids"] == [
"d2",
"d3",
]
docstore.delete_document("d2")
assert docstore._kvstore.get("d1", docstore._node_collection) is not None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is not None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is not None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection)["node_ids"] == [
"d3"
]
docstore.delete_document("d3")
assert docstore._kvstore.get("d1", docstore._node_collection) is None
assert docstore._kvstore.get("d1", docstore._metadata_collection) is None
assert docstore._kvstore.get("d1", docstore._ref_doc_collection) is None
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment