From 61011d7721c5c95b15abfb840630be4b98a9beb5 Mon Sep 17 00:00:00 2001
From: Rana Banerjee <rana1224@gmail.com>
Date: Mon, 5 Feb 2024 22:11:37 +0530
Subject: [PATCH] Fixing issue of ambiguous/duplicate knowledge triplet
 generation because of mismatched case or appended special character (double
 quotes) in generated entities (subject, predicate, object) (#10409)

* While generating triplets, many ambiguous duplicates are created because of a mismatch in case or the presence of double-quote characters appended to the subject or the object. A line of code that strips double quotes and capitalizes the entities helps with disambiguation.

* Format code with black

* Updated test_base.py with expected output for capitalized text

* Revert "Format code with black"

This reverts commit dd87dc46d7d7630e5bb611a469d6ec14f2e2e194.

* Fix end of files

* Remove unnecessary files
---
 llama_index/indices/knowledge_graph/base.py | 6 ++++++
 tests/indices/knowledge_graph/test_base.py  | 7 ++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/llama_index/indices/knowledge_graph/base.py b/llama_index/indices/knowledge_graph/base.py
index 468019202f..e38a178fee 100644
--- a/llama_index/indices/knowledge_graph/base.py
+++ b/llama_index/indices/knowledge_graph/base.py
@@ -156,6 +156,12 @@ class KnowledgeGraphIndex(BaseIndex[KG]):
             if not subj or not pred or not obj:
                 # skip partial triplets
                 continue
+
+            # Strip double quotes and Capitalize triplets for disambiguation
+            subj, pred, obj = (
+                entity.strip('"').capitalize() for entity in [subj, pred, obj]
+            )
+
             results.append((subj, pred, obj))
         return results
 
diff --git a/tests/indices/knowledge_graph/test_base.py b/tests/indices/knowledge_graph/test_base.py
index 72f322ffbf..c0a0ea365e 100644
--- a/tests/indices/knowledge_graph/test_base.py
+++ b/tests/indices/knowledge_graph/test_base.py
@@ -232,6 +232,7 @@ def test__parse_triplet_response(
         )
     assert len(parsed_triplets) == 1
     assert len(parsed_triplets[0]) == 3
-    assert ("foo", "is", "bar") in parsed_triplets[0]
-    assert ("hello", "is not", "world") in parsed_triplets[0]
-    assert ("Jane", "is mother of", "Bob") in parsed_triplets[0]
+    # Expecting Capitalized triplet Outputs
+    assert ("Foo", "Is", "Bar") in parsed_triplets[0]
+    assert ("Hello", "Is not", "World") in parsed_triplets[0]
+    assert ("Jane", "Is mother of", "Bob") in parsed_triplets[0]
-- 
GitLab