From 11786caad3bce35e369a7c18be2ff58e3d596567 Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Wed, 3 Apr 2024 07:27:27 +0800
Subject: [PATCH] docs: document duplicate strategies (#12457)

---
 llama-index-core/llama_index/core/ingestion/pipeline.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llama-index-core/llama_index/core/ingestion/pipeline.py b/llama-index-core/llama_index/core/ingestion/pipeline.py
index 874e1d4dd7..25ab6382e4 100644
--- a/llama-index-core/llama_index/core/ingestion/pipeline.py
+++ b/llama-index-core/llama_index/core/ingestion/pipeline.py
@@ -192,15 +192,16 @@ def arun_transformations_wrapper(
 
 
 class DocstoreStrategy(str, Enum):
-    """Document de-duplication strategy.
+    """Document de-duplication strategies work by comparing the hashes or ids stored in the document store.
+    They require a document store to be set, and it must be persisted across pipeline runs.
 
     Attributes:
         UPSERTS:
-            ('upserts') Use upserts to handle duplicates.
+            ('upserts') Use upserts to handle duplicates. Checks if a document is already in the doc store based on its id. If it is not, or if the hash of the document has changed, it will update the document in the doc store and run the transformations.
         DUPLICATES_ONLY:
-            ('duplicates_only') Only handle duplicates.
+            ('duplicates_only') Only handle duplicates. Checks if the hash of a document is already in the doc store. If it is not, it will add the document to the doc store and run the transformations.
         UPSERTS_AND_DELETE:
-            ('upserts_and_delete') Use upserts and delete to handle duplicates.
+            ('upserts_and_delete') Use upserts and delete to handle duplicates. Like the upserts strategy, but it will also delete documents that are no longer part of the input from the doc store.
     """
 
     UPSERTS = "upserts"
-- 
GitLab