diff --git a/gpt_index/data_structs/node_v2.py b/gpt_index/data_structs/node_v2.py
index 3face7f5c9f87480b5750944f70f9162ba79d193..d5e78be1da3db9af60863bed7c6a58d8d5ef3cff 100644
--- a/gpt_index/data_structs/node_v2.py
+++ b/gpt_index/data_structs/node_v2.py
@@ -94,8 +94,9 @@ class Node(BaseDocument):
     def get_text(self) -> str:
         """Get text."""
         text = super().get_text()
+        extra_info_exists = self.extra_info is not None and len(self.extra_info) > 0
         result_text = (
-            text if self.extra_info_str is None else f"{self.extra_info_str}\n\n{text}"
+            text if not extra_info_exists else f"{self.extra_info_str}\n\n{text}"
         )
         return result_text
diff --git a/gpt_index/indices/vector_store/base.py b/gpt_index/indices/vector_store/base.py
index 360d776b127604eafdb5d054fcd3edb60348460f..46999a20049614b4d4d6cc0226d9070e17ee708e 100644
--- a/gpt_index/indices/vector_store/base.py
+++ b/gpt_index/indices/vector_store/base.py
@@ -17,7 +17,6 @@ from gpt_index.indices.vector_store.base_query import GPTVectorStoreIndexQuery
 from gpt_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT
 from gpt_index.prompts.prompts import QuestionAnswerPrompt
 from gpt_index.token_counter.token_counter import llm_token_counter
-from gpt_index.utils import get_new_id
 from gpt_index.vector_stores.simple import SimpleVectorStore
 from gpt_index.vector_stores.types import NodeEmbeddingResult, VectorStore

@@ -83,7 +82,7 @@ class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]):

         id_to_embed_map: Dict[str, List[float]] = {}
         for n in nodes:
-            new_id = get_new_id(existing_node_ids.union(id_to_node_map.keys()))
+            new_id = n.get_doc_id()
             if n.embedding is None:
                 self._service_context.embed_model.queue_text_for_embeddding(
                     new_id, n.get_text()
@@ -127,7 +126,7 @@ class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]):

         text_queue: List[Tuple[str, str]] = []
         for n in nodes:
-            new_id = get_new_id(existing_node_ids.union(id_to_node_map.keys()))
+            new_id = n.get_doc_id()
             if n.embedding is None:
                 text_queue.append((new_id, n.get_text()))
             else:
diff --git a/gpt_index/vector_stores/chatgpt_plugin.py b/gpt_index/vector_stores/chatgpt_plugin.py
index 383cae9ea5b88529fa1f0333a89ae93e7a03851e..6055087e70f11f1f40fcb9616db02e6ece7fa314 100644
--- a/gpt_index/vector_stores/chatgpt_plugin.py
+++ b/gpt_index/vector_stores/chatgpt_plugin.py
@@ -7,7 +7,7 @@ import requests
 from requests.adapters import HTTPAdapter, Retry
 from tqdm.auto import tqdm

-from gpt_index.data_structs.data_structs_v2 import Node
+from gpt_index.data_structs.node_v2 import Node, DocumentRelationship
 from gpt_index.vector_stores.types import (
     NodeEmbeddingResult,
     VectorStore,
@@ -26,7 +26,8 @@ def convert_docs_to_json(embedding_results: List[NodeEmbeddingResult]) -> List[D
             "id": embedding_result.id,
             "text": embedding_result.node.get_text(),
             # "source": embedding_result.node.source,
-            # "source_id": ...,
+            # NOTE: this is the doc_id to reference document
+            "source_id": embedding_result.doc_id,
             # "url": "...",
             # "created_at": ...,
             # "author": "..."",
@@ -142,9 +143,11 @@ class ChatGPTRetrievalPluginClient(VectorStore):
             result_id = result["id"]
             result_txt = result["text"]
             result_score = result["score"]
+            result_ref_doc_id = result["source_id"]
             node = Node(
                 doc_id=result_id,
                 text=result_txt,
+                relationships={DocumentRelationship.SOURCE: result_ref_doc_id},
             )
             nodes.append(node)
             similarities.append(result_score)
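The three files above carry the core behavioral change: a node's own doc_id is now reused verbatim as the vector-store id (instead of a fresh random id from get_new_id), and the id of the source document rides along as a SOURCE relationship. A minimal sketch of the node a caller builds under the new contract, using only constructs that appear in this diff (the id values are the hypothetical ones from the tests below):

from gpt_index.data_structs.node_v2 import Node, DocumentRelationship

node = Node(
    "Hello world.",
    doc_id="node1",  # reused verbatim as the vector store id by the index
    relationships={DocumentRelationship.SOURCE: "ref_doc_id_test"},
)
# Before this change the index generated a fresh id via
# get_new_id(existing_node_ids.union(id_to_node_map.keys())),
# so "node1" never appeared in the store; now it is the lookup key.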
diff --git a/gpt_index/vector_stores/pinecone.py b/gpt_index/vector_stores/pinecone.py
index e3929b58dcee676644aed4398fc03f240ccfbdcb..d0d56e98f89579e0b6c6bc72de8a4a9487d7d143 100644
--- a/gpt_index/vector_stores/pinecone.py
+++ b/gpt_index/vector_stores/pinecone.py
@@ -6,7 +6,7 @@ An index that that is built on top of an existing vector store.

 from typing import Any, Dict, List, Optional, cast

-from gpt_index.data_structs.node_v2 import Node
+from gpt_index.data_structs.node_v2 import Node, DocumentRelationship
 from gpt_index.vector_stores.types import (
     NodeEmbeddingResult,
     VectorStore,
@@ -120,7 +120,9 @@ class PineconeVectorStore(VectorStore):

             metadata = {
                 "text": node.get_text(),
+                # NOTE: this is the reference to source doc
                 "doc_id": result.doc_id,
+                "id": new_id,
             }
             if node.extra_info:
                 # TODO: check if overlap with default metadata keys
@@ -197,9 +199,14 @@ class PineconeVectorStore(VectorStore):
             extra_info = get_node_info_from_metadata(match.metadata, "extra_info")
             node_info = get_node_info_from_metadata(match.metadata, "node_info")
             doc_id = match.metadata["doc_id"]
+            id = match.metadata["id"]
             node = Node(
-                text=text, extra_info=extra_info, node_info=node_info, doc_id=doc_id
+                text=text,
+                extra_info=extra_info,
+                node_info=node_info,
+                doc_id=id,
+                relationships={DocumentRelationship.SOURCE: doc_id},
             )
             top_k_ids.append(match.id)
             top_k_nodes.append(node)

diff --git a/gpt_index/vector_stores/qdrant.py b/gpt_index/vector_stores/qdrant.py
index 1ba6c366ae064650299b9736c9e6f4ddfb379cd7..0b2fd8efdc63a93e41c8eaddb3f60b279c6e702d 100644
--- a/gpt_index/vector_stores/qdrant.py
+++ b/gpt_index/vector_stores/qdrant.py
@@ -207,6 +207,7 @@ class QdrantVectorStore(VectorStore):
         for point in response:
             payload = cast(Payload, point.payload)
             node = Node(
+                doc_id=str(point.id),
                 text=payload.get("text"),
                 extra_info=payload.get("extra_info"),
                 relationships={
diff --git a/gpt_index/vector_stores/types.py b/gpt_index/vector_stores/types.py
index 5e1c84409a944f0c7931f6ea471d4c73bfb65be5..9b7bad5a258573aed0604259da112632e8b4899b 100644
--- a/gpt_index/vector_stores/types.py
+++ b/gpt_index/vector_stores/types.py
@@ -15,6 +15,7 @@ class NodeEmbeddingResult:
         id (str): Node id
         node (Node): Node
         embedding (List[float]): Embedding
+        doc_id (str): Document id

     """
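For stores that persist only flat metadata (Pinecone above), the node id and the source-document id are now written side by side and unpacked again at query time. A hedged sketch of the per-vector metadata shape implied by the diff (keys as in the code; the concrete values are hypothetical):

metadata = {
    "text": "This is another test.",  # node text
    "doc_id": "ref_doc_id_test",      # reference to the source document
    "id": "node3",                    # the node's own id
}
# At query time the diff rebuilds the node as roughly:
# Node(text=metadata["text"], doc_id=metadata["id"],
#      relationships={DocumentRelationship.SOURCE: metadata["doc_id"]})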
diff --git a/tests/indices/vector_store/test_base.py b/tests/indices/vector_store/test_base.py
index a47f6808a9b5df7fe3d2a877914b7b4ad6927118..e2c729bcb1f898616bb1d555f5c74b0d45fa5170 100644
--- a/tests/indices/vector_store/test_base.py
+++ b/tests/indices/vector_store/test_base.py
@@ -7,6 +7,7 @@ from unittest.mock import MagicMock, patch
 import numpy as np
 import pytest

+from gpt_index.data_structs.node_v2 import Node
 from gpt_index.embeddings.openai import OpenAIEmbedding
 from gpt_index.indices.vector_store.vector_indices import (
     GPTFaissIndex,
@@ -16,6 +17,7 @@ from gpt_index.readers.schema.base import Document
 from gpt_index.vector_stores.simple import SimpleVectorStore
 from tests.mock_utils.mock_decorator import patch_common
 from tests.mock_utils.mock_prompts import MOCK_REFINE_PROMPT, MOCK_TEXT_QA_PROMPT
+from gpt_index.data_structs.node_v2 import DocumentRelationship


 @pytest.fixture
@@ -632,3 +634,100 @@ def test_simple_async(
     vector_store = cast(SimpleVectorStore, index._vector_store)
     embedding = vector_store.get(text_id)
     assert (node.text, embedding) in actual_node_tups
+
+
+@patch_common
+@patch.object(
+    OpenAIEmbedding, "_get_text_embedding", side_effect=mock_get_text_embedding
+)
+@patch.object(
+    OpenAIEmbedding, "_get_text_embeddings", side_effect=mock_get_text_embeddings
+)
+@patch.object(
+    OpenAIEmbedding, "get_query_embedding", side_effect=mock_get_query_embedding
+)
+def test_simple_check_ids(
+    _mock_query_embed: Any,
+    _mock_text_embeds: Any,
+    _mock_text_embed: Any,
+    _mock_init: Any,
+    _mock_predict: Any,
+    _mock_total_tokens_used: Any,
+    _mock_split_text_overlap: Any,
+    _mock_split_text: Any,
+    documents: List[Document],
+    struct_kwargs: Dict,
+) -> None:
+    """Test build GPTSimpleVectorIndex."""
+    index_kwargs, query_kwargs = struct_kwargs
+
+    ref_doc_id = "ref_doc_id_test"
+    source_rel = {DocumentRelationship.SOURCE: ref_doc_id}
+    nodes = [
+        Node("Hello world.", doc_id="node1", relationships=source_rel),
+        Node("This is a test.", doc_id="node2", relationships=source_rel),
+        Node("This is another test.", doc_id="node3", relationships=source_rel),
+        Node("This is a test v2.", doc_id="node4", relationships=source_rel),
+    ]
+    index = GPTSimpleVectorIndex(nodes, **index_kwargs)
+
+    # test query
+    query_str = "What is?"
+    response = index.query(query_str, **query_kwargs)
+    assert str(response) == ("What is?:This is another test.")
+    assert len(response.source_nodes) == 1
+    assert response.source_nodes[0].node.ref_doc_id == "ref_doc_id_test"
+    assert response.source_nodes[0].node.doc_id == "node3"
+    vector_store = cast(SimpleVectorStore, index._vector_store)
+    assert "node3" in vector_store._data.embedding_dict
+    assert "node3" in vector_store._data.text_id_to_doc_id
+
+
+@patch_common
+@patch.object(
+    OpenAIEmbedding, "_get_text_embedding", side_effect=mock_get_text_embedding
+)
+@patch.object(
+    OpenAIEmbedding, "_get_text_embeddings", side_effect=mock_get_text_embeddings
+)
+@patch.object(
+    OpenAIEmbedding, "get_query_embedding", side_effect=mock_get_query_embedding
+)
+def test_faiss_check_ids(
+    _mock_query_embed: Any,
+    _mock_texts_embed: Any,
+    _mock_text_embed: Any,
+    _mock_init: Any,
+    _mock_predict: Any,
+    _mock_total_tokens_used: Any,
+    _mock_split_text_overlap: Any,
+    _mock_split_text: Any,
+    documents: List[Document],
+    struct_kwargs: Dict,
+) -> None:
+    """Test embedding query."""
+    # NOTE: mock faiss import
+    sys.modules["faiss"] = MagicMock()
+    # NOTE: mock faiss index
+    faiss_index = MockFaissIndex()
+
+    index_kwargs, query_kwargs = struct_kwargs
+
+    ref_doc_id = "ref_doc_id_test"
+    source_rel = {DocumentRelationship.SOURCE: ref_doc_id}
+    nodes = [
+        Node("Hello world.", doc_id="node1", relationships=source_rel),
+        Node("This is a test.", doc_id="node2", relationships=source_rel),
+        Node("This is another test.", doc_id="node3", relationships=source_rel),
+        Node("This is a test v2.", doc_id="node4", relationships=source_rel),
+    ]
+
+    index = GPTFaissIndex(nodes, faiss_index=faiss_index, **index_kwargs)
+
+    # test query
+    query_str = "What is?"
+    response = index.query(query_str, **query_kwargs)
+    assert str(response) == ("What is?:This is another test.")
+    assert len(response.source_nodes) == 1
+    assert response.source_nodes[0].node.ref_doc_id == "ref_doc_id_test"
+    assert response.source_nodes[0].node.doc_id == "node3"
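Taken together, the two tests pin down the new contract end to end: a query returns source nodes whose doc_id is exactly the id the caller supplied, and whose ref_doc_id resolves the SOURCE relationship back to the originating document. Condensed from the assertions above (same mocked embedding setup; not an independent example):

index = GPTSimpleVectorIndex(nodes, **index_kwargs)
response = index.query("What is?", **query_kwargs)
source = response.source_nodes[0].node
assert source.doc_id == "node3"                # caller-supplied node id survives
assert source.ref_doc_id == "ref_doc_id_test"  # SOURCE relationship round-trips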