From 61011d7721c5c95b15abfb840630be4b98a9beb5 Mon Sep 17 00:00:00 2001 From: Rana Banerjee <rana1224@gmail.com> Date: Mon, 5 Feb 2024 22:11:37 +0530 Subject: [PATCH] Fixing issue of ambiguous/duplicate knowledge triplet generation because of mismatched case or appended special character (double quotes) in generated entities (subject, predicate, object) (#10409) * While generating triplets, many ambiguous duplicates are created because of mismatch in case or presence of double-quote characters appended to the subject or the object. A line of code that strips double quotes and capitalizes the entities helps with disambiguation * Format code with black * Updated test_base.py with expected output for capitalized text * Revert "Format code with black" This reverts commit dd87dc46d7d7630e5bb611a469d6ec14f2e2e194. * Fix end of files * Remove unnecessary files --- llama_index/indices/knowledge_graph/base.py | 6 ++++++ tests/indices/knowledge_graph/test_base.py | 7 ++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llama_index/indices/knowledge_graph/base.py b/llama_index/indices/knowledge_graph/base.py index 468019202f..e38a178fee 100644 --- a/llama_index/indices/knowledge_graph/base.py +++ b/llama_index/indices/knowledge_graph/base.py @@ -156,6 +156,12 @@ class KnowledgeGraphIndex(BaseIndex[KG]): if not subj or not pred or not obj: # skip partial triplets continue + + # Strip double quotes and Capitalize triplets for disambiguation + subj, pred, obj = ( + entity.strip('"').capitalize() for entity in [subj, pred, obj] + ) + results.append((subj, pred, obj)) return results diff --git a/tests/indices/knowledge_graph/test_base.py b/tests/indices/knowledge_graph/test_base.py index 72f322ffbf..c0a0ea365e 100644 --- a/tests/indices/knowledge_graph/test_base.py +++ b/tests/indices/knowledge_graph/test_base.py @@ -232,6 +232,7 @@ def test__parse_triplet_response( ) assert len(parsed_triplets) == 1 assert len(parsed_triplets[0]) == 3 - assert ("foo", "is", "bar") 
in parsed_triplets[0] - assert ("hello", "is not", "world") in parsed_triplets[0] - assert ("Jane", "is mother of", "Bob") in parsed_triplets[0] + # Expecting Capitalized triplet Outputs + assert ("Foo", "Is", "Bar") in parsed_triplets[0] + assert ("Hello", "Is not", "World") in parsed_triplets[0] + assert ("Jane", "Is mother of", "Bob") in parsed_triplets[0] -- GitLab