diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f675d2080361c5bb47133a8dad0dea019f2a65a7..01d1e34f3a1bc4c7748970b9c235e53100ff669d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Our vector store classes store embeddings and support lookup via similiarity sea These serve as the main data store and retrieval engine for our vector index. **Interface**: -* `add` takes in a sequence of `NodeEmbeddingResults` and insert the embeddings (and possibly the node contents & metadata) into the vector store. +* `add` takes in a sequence of `NodeWithEmbedding` objects and inserts the embeddings (and possibly the node contents & metadata) into the vector store. * `delete` removes entries given document IDs. * `query` retrieves top-k most similar entries given a query embedding. diff --git a/llama_index/indices/vector_store/base.py b/llama_index/indices/vector_store/base.py index 24bd09b32d4e1d6308d3f8f1b449ad89a586adad..18354ddfe4493e403e3df8cec80b4aebdcf5b8aa 100644 --- a/llama_index/indices/vector_store/base.py +++ b/llama_index/indices/vector_store/base.py @@ -4,7 +4,7 @@ An index that that is built on top of an existing vector store. 
""" -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple from llama_index.callbacks.schema import CBEventType from llama_index.async_utils import run_async_tasks @@ -15,7 +15,10 @@ from llama_index.indices.base_retriever import BaseRetriever from llama_index.indices.service_context import ServiceContext from llama_index.storage.storage_context import StorageContext from llama_index.token_counter.token_counter import llm_token_counter -from llama_index.vector_stores.types import NodeEmbeddingResult, VectorStore +from llama_index.vector_stores.types import ( + NodeWithEmbedding, + VectorStore, +) class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]): @@ -57,29 +60,24 @@ class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]): return VectorIndexRetriever(self, **kwargs) def _get_node_embedding_results( - self, nodes: Sequence[Node], existing_node_ids: Set - ) -> List[NodeEmbeddingResult]: + self, nodes: Sequence[Node] + ) -> List[NodeWithEmbedding]: """Get tuples of id, node, and embedding. Allows us to store these nodes in a vector store. Embeddings are called in batches. 
""" - id_to_node_map: Dict[str, Node] = {} id_to_embed_map: Dict[str, List[float]] = {} - nodes_embedded = 0 for n in nodes: - new_id = n.get_doc_id() if n.embedding is None: - nodes_embedded += 1 self._service_context.embed_model.queue_text_for_embedding( - new_id, n.get_text() + n.get_doc_id(), n.get_text() ) else: - id_to_embed_map[new_id] = n.embedding + id_to_embed_map[n.get_doc_id()] = n.embedding - id_to_node_map[new_id] = n event_id = self._service_context.callback_manager.on_event_start( CBEventType.EMBEDDING ) @@ -91,45 +89,37 @@ class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]): ) = self._service_context.embed_model.get_queued_text_embeddings() self._service_context.callback_manager.on_event_end( CBEventType.EMBEDDING, - payload={"num_nodes": nodes_embedded}, + payload={"num_nodes": len(result_ids)}, event_id=event_id, ) for new_id, text_embedding in zip(result_ids, result_embeddings): id_to_embed_map[new_id] = text_embedding - result_tups = [] - for id, embed in id_to_embed_map.items(): - doc_id = id_to_node_map[id].ref_doc_id - if doc_id is None: - raise ValueError("Reference doc id is None.") - result_tups.append( - NodeEmbeddingResult(id, id_to_node_map[id], embed, doc_id=doc_id) - ) - return result_tups + results = [] + for node in nodes: + embedding = id_to_embed_map[node.get_doc_id()] + result = NodeWithEmbedding(node=node, embedding=embedding) + results.append(result) + return results async def _aget_node_embedding_results( self, nodes: Sequence[Node], - existing_node_ids: Set, - ) -> List[NodeEmbeddingResult]: + ) -> List[NodeWithEmbedding]: """Asynchronously get tuples of id, node, and embedding. Allows us to store these nodes in a vector store. Embeddings are called in batches. 
""" - id_to_node_map: Dict[str, Node] = {} id_to_embed_map: Dict[str, List[float]] = {} text_queue: List[Tuple[str, str]] = [] for n in nodes: - new_id = n.get_doc_id() if n.embedding is None: - text_queue.append((new_id, n.get_text())) + text_queue.append((n.get_doc_id(), n.get_text())) else: - id_to_embed_map[new_id] = n.embedding - - id_to_node_map[new_id] = n + id_to_embed_map[n.get_doc_id()] = n.embedding event_id = self._service_context.callback_manager.on_event_start( CBEventType.EMBEDDING @@ -151,25 +141,18 @@ class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]): for new_id, text_embedding in zip(result_ids, result_embeddings): id_to_embed_map[new_id] = text_embedding - result_tups = [] - for id, embed in id_to_embed_map.items(): - doc_id = id_to_node_map[id].ref_doc_id - if doc_id is None: - raise ValueError("Reference doc id is None.") - result_tups.append( - NodeEmbeddingResult(id, id_to_node_map[id], embed, doc_id=doc_id) - ) - return result_tups + results = [] + for node in nodes: + embedding = id_to_embed_map[node.get_doc_id()] + result = NodeWithEmbedding(node=node, embedding=embedding) + results.append(result) + return results async def _async_add_nodes_to_index( self, index_struct: IndexDict, nodes: Sequence[Node] ) -> None: """Asynchronously add nodes to index.""" - embedding_results = await self._aget_node_embedding_results( - nodes, - set(), - ) - + embedding_results = await self._aget_node_embedding_results(nodes) new_ids = self._vector_store.add(embedding_results) # if the vector store doesn't store text, we need to add the nodes to the @@ -185,11 +168,7 @@ class GPTVectorStoreIndex(BaseGPTIndex[IndexDict]): nodes: Sequence[Node], ) -> None: """Add document to index.""" - embedding_results = self._get_node_embedding_results( - nodes, - set(), - ) - + embedding_results = self._get_node_embedding_results(nodes) new_ids = self._vector_store.add(embedding_results) if not self._vector_store.stores_text: diff --git 
a/llama_index/vector_stores/chatgpt_plugin.py b/llama_index/vector_stores/chatgpt_plugin.py index 99f5edb4668ab610586a1bc9a98723f53c4ca7c9..628d4658ed6708ec2ba79d0ab716d7a9476e1882 100644 --- a/llama_index/vector_stores/chatgpt_plugin.py +++ b/llama_index/vector_stores/chatgpt_plugin.py @@ -9,14 +9,14 @@ from tqdm.auto import tqdm from llama_index.data_structs.node import Node, DocumentRelationship from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, ) -def convert_docs_to_json(embedding_results: List[NodeEmbeddingResult]) -> List[Dict]: +def convert_docs_to_json(embedding_results: List[NodeWithEmbedding]) -> List[Dict]: """Convert docs to JSON.""" docs = [] for embedding_result in embedding_results: @@ -26,9 +26,8 @@ def convert_docs_to_json(embedding_results: List[NodeEmbeddingResult]) -> List[D doc_dict = { "id": embedding_result.id, "text": embedding_result.node.get_text(), - # "source": embedding_result.node.source, # NOTE: this is the doc_id to reference document - "source_id": embedding_result.doc_id, + "source_id": embedding_result.ref_doc_id, # "url": "...", # "created_at": ..., # "author": "..."", @@ -89,7 +88,7 @@ class ChatGPTRetrievalPluginClient(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding_results to index.""" headers = {"Authorization": f"Bearer {self._bearer_token}"} diff --git a/llama_index/vector_stores/chroma.py b/llama_index/vector_stores/chroma.py index 10f446a09d4a376130065a9f470427487c45d4f8..9e078eac18afefff927005fcf0e7f33038177ec9 100644 --- a/llama_index/vector_stores/chroma.py +++ b/llama_index/vector_stores/chroma.py @@ -6,7 +6,7 @@ from typing import Any, List, cast from llama_index.data_structs.node import DocumentRelationship, Node from llama_index.utils import truncate_text from llama_index.vector_stores.types import ( - 
NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryResult, @@ -44,11 +44,11 @@ class ChromaVectorStore(VectorStore): self._collection = cast(Collection, chroma_collection) - def add(self, embedding_results: List[NodeEmbeddingResult]) -> List[str]: + def add(self, embedding_results: List[NodeWithEmbedding]) -> List[str]: """Add embedding results to index. Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ if not self._collection: @@ -61,7 +61,7 @@ class ChromaVectorStore(VectorStore): for result in embedding_results: embeddings.append(result.embedding) extra_info = result.node.extra_info or {} - metadatas.append({**extra_info, **{"document_id": result.doc_id}}) + metadatas.append({**extra_info, **{"document_id": result.ref_doc_id}}) ids.append(result.id) documents.append(result.node.get_text()) diff --git a/llama_index/vector_stores/deeplake.py b/llama_index/vector_stores/deeplake.py index b8f82edf209102139ac269ab2603ebd747a6e8e3..3789ff77f9ce0ca94616250c824bb2071f753c88 100644 --- a/llama_index/vector_stores/deeplake.py +++ b/llama_index/vector_stores/deeplake.py @@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional, cast import numpy as np from llama_index.indices.query.embedding_utils import get_top_k_embeddings from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryResult, @@ -139,11 +139,11 @@ class DeepLakeVectorStore(VectorStore): """Get client.""" return self.ds - def add(self, embedding_results: List[NodeEmbeddingResult]) -> List[str]: + def add(self, embedding_results: List[NodeWithEmbedding]) -> List[str]: """Add the embeddings and their nodes into DeepLake. Args: - embedding_results (List[NodeEmbeddingResult]): The embeddings and their data + embedding_results (List[NodeWithEmbedding]): The embeddings and their data to insert. 
Raises: @@ -162,7 +162,7 @@ class DeepLakeVectorStore(VectorStore): for result in embedding_results: embedding = result.embedding extra_info = result.node.extra_info or {} - metadata = {**extra_info, **{"document_id": result.doc_id}} + metadata = {**extra_info, **{"document_id": result.ref_doc_id}} id = result.id text = result.node.get_text() diff --git a/llama_index/vector_stores/faiss.py b/llama_index/vector_stores/faiss.py index 465adbe3ac4e4b1b0621007ae718ca29242bbdf8..39e529dd32fdc8984fb7eafe266179b1b475c27c 100644 --- a/llama_index/vector_stores/faiss.py +++ b/llama_index/vector_stores/faiss.py @@ -12,7 +12,7 @@ import numpy as np from llama_index.vector_stores.types import ( DEFAULT_PERSIST_DIR, DEFAULT_PERSIST_FNAME, - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -79,14 +79,14 @@ class FaissVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to index. NOTE: in the Faiss vector store, we do not store text in Faiss. 
Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ new_ids = [] diff --git a/llama_index/vector_stores/lancedb.py b/llama_index/vector_stores/lancedb.py index 6a4e02b036b90924db31a49008694ec1a81179db..35a7f8bff9621a9ea5eabf4614dc69d477949b41 100644 --- a/llama_index/vector_stores/lancedb.py +++ b/llama_index/vector_stores/lancedb.py @@ -3,7 +3,7 @@ from typing import Any, List, Optional from llama_index.data_structs.node import DocumentRelationship, Node from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -65,7 +65,7 @@ class LanceDBVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: data = [] ids = [] @@ -73,7 +73,7 @@ class LanceDBVectorStore(VectorStore): data.append( { "id": result.id, - "doc_id": result.doc_id, + "doc_id": result.ref_doc_id, "vector": result.embedding, "text": result.node.get_text(), } diff --git a/llama_index/vector_stores/metal.py b/llama_index/vector_stores/metal.py index c5cfdc2f190a850a5c6f760dc3cbe2fc25cb865e..e1a3d4378baf86f51c98647e955950b507c3c59b 100644 --- a/llama_index/vector_stores/metal.py +++ b/llama_index/vector_stores/metal.py @@ -1,11 +1,23 @@ +import json import math from typing import Any, Dict, List, Optional from llama_index.data_structs.node import Node, DocumentRelationship -from llama_index.vector_stores.types import VectorStore, NodeEmbeddingResult, VectorStoreQuery, VectorStoreQueryResult +from llama_index.vector_stores.types import ( + NodeWithEmbedding, + VectorStore, + VectorStoreQuery, + VectorStoreQueryResult, +) class MetalVectorStore(VectorStore): - def __init__(self, api_key: str, client_id: str, index_id: str, filters: Optional[Dict[str, Any]] = None): + def __init__( + self, + api_key: str, + client_id: str, + 
index_id: str, + filters: Optional[Dict[str, Any]] = None, + ): """Init params.""" import_err_msg = ( "`metal_sdk` package not found, please run `pip install metal_sdk`" @@ -14,8 +26,7 @@ class MetalVectorStore(VectorStore): import metal_sdk # noqa: F401 except ImportError: raise ImportError(import_err_msg) - from metal_sdk.metal import Metal # noqa: F401 - + from metal_sdk.metal import Metal # noqa: F401 self.api_key = api_key self.client_id = client_id @@ -39,10 +50,17 @@ class MetalVectorStore(VectorStore): for item in response["data"]: text = item["text"] - extra_info = item["metadata"] - doc_id = item["metadata"]["doc_id"] + metadata = item["metadata"] + ref_doc_id = metadata["doc_id"] + if "extra_info" in metadata: + extra_info = json.loads(metadata["extra_info"]) id = item["id"] - node = Node(text=text, extra_info=extra_info, doc_id=id, relationships={DocumentRelationship.SOURCE: doc_id}) + node = Node( + text=text, + extra_info=extra_info, + doc_id=id, + relationships={DocumentRelationship.SOURCE: ref_doc_id}, + ) nodes.append(node) ids.append(item["id"]) @@ -56,8 +74,7 @@ class MetalVectorStore(VectorStore): """Return Metal client.""" return self.metal_client - - def add(self, embedding_results: List[NodeEmbeddingResult]) -> List[str]: + def add(self, embedding_results: List[NodeWithEmbedding]) -> List[str]: """Add embedding results to index. 
Args @@ -67,27 +84,27 @@ class MetalVectorStore(VectorStore): if not self.metal_client: raise ValueError("metal_client not initialized") - ids = [] for result in embedding_results: ids.append(result.id) + metadata = {} + metadata["doc_id"] = result.ref_doc_id + metadata["text"] = result.node.get_text() + if result.node.extra_info is not None: + metadata["extra_info"] = json.dumps(result.node.extra_info) + payload = { "embedding": result.embedding, - "metadata": result.node.extra_info or {}, + "metadata": metadata, "id": result.id, } - payload["metadata"]["doc_id"] = result.doc_id - - if result.node.get_text(): - payload["metadata"]["text"] = result.node.get_text() - self.metal_client.index(payload) return ids - def delete(self, doc_id: str) -> None: + def delete(self, doc_id: str, **delete_kwargs: Any) -> None: """Delete nodes from index. Args: diff --git a/llama_index/vector_stores/milvus.py b/llama_index/vector_stores/milvus.py index 637cd4abd89212e7d100cae8f9bb0e6a5a61d06f..c3b9c323b3b6abcd42e913ed9e4283ea8cfe3a57 100644 --- a/llama_index/vector_stores/milvus.py +++ b/llama_index/vector_stores/milvus.py @@ -9,7 +9,7 @@ from uuid import uuid4 from llama_index.data_structs.node import DocumentRelationship, Node from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryMode, @@ -287,11 +287,11 @@ class MilvusVectorStore(VectorStore): """Get client.""" return self.collection - def add(self, embedding_results: List[NodeEmbeddingResult]) -> List[str]: + def add(self, embedding_results: List[NodeWithEmbedding]) -> List[str]: """Add the embeddings and their nodes into Milvus. Args: - embedding_results (List[NodeEmbeddingResult]): The embeddings and their data + embedding_results (List[NodeWithEmbedding]): The embeddings and their data to insert. 
Raises: @@ -318,7 +318,7 @@ class MilvusVectorStore(VectorStore): # Process that data we are going to insert for result in embedding_results: ids.append(result.id) - doc_ids.append(result.doc_id) + doc_ids.append(result.ref_doc_id) texts.append(result.node.get_text()) embeddings.append(result.embedding) diff --git a/llama_index/vector_stores/myscale.py b/llama_index/vector_stores/myscale.py index 436ab14f3bd40d4bc8f8063c9728eb1079a74acc..7f1e38ca546a9f176dc628dcca0ce01d137326ac 100644 --- a/llama_index/vector_stores/myscale.py +++ b/llama_index/vector_stores/myscale.py @@ -16,7 +16,7 @@ from llama_index.readers.myscale import ( ) from llama_index.utils import iter_batch from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryResult, @@ -99,10 +99,10 @@ class MyScaleVectorStore(VectorStore): # schema column name, type, and construct format method self.column_config: Dict = { "id": {"type": "String", "extract_func": lambda x: x.id}, - "doc_id": {"type": "String", "extract_func": lambda x: x.doc_id}, + "doc_id": {"type": "String", "extract_func": lambda x: x.ref_doc_id}, "text": { "type": "String", - "extract_func": lambda x: escape_str(x.node.text), + "extract_func": lambda x: escape_str(x.node.get_text()), }, "vector": { "type": "Array(Float32)", @@ -151,7 +151,7 @@ class MyScaleVectorStore(VectorStore): def _build_insert_statement( self, - values: List[NodeEmbeddingResult], + values: List[NodeWithEmbedding], ) -> str: _data = [] for item in values: @@ -173,12 +173,12 @@ class MyScaleVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to index. 
Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ diff --git a/llama_index/vector_stores/opensearch.py b/llama_index/vector_stores/opensearch.py index 11b6d2367a97034640b2ed5402ee2b0a9244d4eb..ef2f8d40a29ff7b9178d915337faa3b8c09f3c8a 100644 --- a/llama_index/vector_stores/opensearch.py +++ b/llama_index/vector_stores/opensearch.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, cast from llama_index.data_structs import Node from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -96,14 +96,14 @@ class OpensearchVectorClient: # will 400 if the index already existed, so allow 400 errors right here assert res.status_code == 200 or res.status_code == 400 - def index_results(self, results: List[NodeEmbeddingResult]) -> List[str]: + def index_results(self, results: List[NodeWithEmbedding]) -> List[str]: """Store results in the index.""" bulk_req: List[Dict[Any, Any]] = [] for result in results: bulk_req.append({"index": {"_index": self._index, "_id": result.id}}) bulk_req.append( { - self._text_field: result.node.text, + self._text_field: result.node.get_text(), self._embedding_field: result.embedding, } ) @@ -180,12 +180,12 @@ class OpensearchVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to index. 
Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ self._client.index_results(embedding_results) diff --git a/llama_index/vector_stores/pinecone.py b/llama_index/vector_stores/pinecone.py index 5c5d5e233b85963cc3ea57fa2a722bd16160bfc9..081b015d3a7e859ecfa5c53f8ae3807314ac5833 100644 --- a/llama_index/vector_stores/pinecone.py +++ b/llama_index/vector_stores/pinecone.py @@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, cast from llama_index.data_structs.node import DocumentRelationship, Node from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryMode, @@ -190,25 +190,24 @@ class PineconeVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to index. Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ ids = [] for result in embedding_results: - new_id = result.id + node_id = result.id node = result.node - text_embedding = result.embedding metadata = { "text": node.get_text(), # NOTE: this is the reference to source doc - "doc_id": result.doc_id, - "id": new_id, + "doc_id": node.ref_doc_id, + "id": node_id, } if node.extra_info: # TODO: check if overlap with default metadata keys @@ -233,8 +232,8 @@ class PineconeVectorStore(VectorStore): metadata.update(self._metadata_filters) entry = { - "id": new_id, - "values": text_embedding, + "id": node_id, + "values": result.embedding, "metadata": metadata, } if self._add_sparse_vector: @@ -245,7 +244,7 @@ class PineconeVectorStore(VectorStore): self._pinecone_index.upsert( [entry], namespace=self._namespace, **self._pinecone_kwargs ) - ids.append(new_id) + ids.append(node_id) return ids def 
delete(self, doc_id: str, **delete_kwargs: Any) -> None: diff --git a/llama_index/vector_stores/qdrant.py b/llama_index/vector_stores/qdrant.py index 7f767057ed1c768f0a9602f27567f7c5564f417a..591047e3fbe3f678fe78e7292e5ad456959f0053 100644 --- a/llama_index/vector_stores/qdrant.py +++ b/llama_index/vector_stores/qdrant.py @@ -9,7 +9,7 @@ from typing import Any, List, Optional, cast from llama_index.data_structs.node import DocumentRelationship, Node from llama_index.utils import iter_batch from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -55,11 +55,11 @@ class QdrantVectorStore(VectorStore): self._batch_size = kwargs.get("batch_size", 100) - def add(self, embedding_results: List[NodeEmbeddingResult]) -> List[str]: + def add(self, embedding_results: List[NodeWithEmbedding]) -> List[str]: """Add embedding results to index. Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ from qdrant_client.http import models as rest @@ -72,16 +72,16 @@ class QdrantVectorStore(VectorStore): ids = [] for result_batch in iter_batch(embedding_results, self._batch_size): - new_ids = [] + node_ids = [] vectors = [] payloads = [] for result in result_batch: - new_ids.append(result.id) + node_ids.append(result.id) vectors.append(result.embedding) node = result.node payloads.append( { - "doc_id": result.doc_id, + "doc_id": result.ref_doc_id, "text": node.get_text(), "extra_info": node.extra_info, } @@ -90,12 +90,12 @@ class QdrantVectorStore(VectorStore): self._client.upsert( collection_name=self._collection_name, points=rest.Batch.construct( - ids=new_ids, + ids=node_ids, vectors=vectors, payloads=payloads, ), ) - ids.extend(new_ids) + ids.extend(node_ids) return ids def delete(self, doc_id: str, **delete_kwargs: Any) -> None: diff --git a/llama_index/vector_stores/simple.py 
b/llama_index/vector_stores/simple.py index 7748e333e21adc074a018b3f417f0ef97529f38a..3df5649ed0454f7f1539a353c35f180ea7c05e69 100644 --- a/llama_index/vector_stores/simple.py +++ b/llama_index/vector_stores/simple.py @@ -14,7 +14,7 @@ from llama_index.indices.query.embedding_utils import ( from llama_index.vector_stores.types import ( DEFAULT_PERSIST_DIR, DEFAULT_PERSIST_FNAME, - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -85,13 +85,12 @@ class SimpleVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding_results to index.""" for result in embedding_results: - text_id = result.id - self._data.embedding_dict[text_id] = result.embedding - self._data.text_id_to_doc_id[text_id] = result.doc_id + self._data.embedding_dict[result.id] = result.embedding + self._data.text_id_to_doc_id[result.id] = result.ref_doc_id return [result.id for result in embedding_results] def delete(self, doc_id: str, **delete_kwargs: Any) -> None: diff --git a/llama_index/vector_stores/types.py b/llama_index/vector_stores/types.py index 7891f67f7ac39df55498cd9a59772c0563f34b85..c38ed99e626962cc12bca3a3926213b969796c48 100644 --- a/llama_index/vector_stores/types.py +++ b/llama_index/vector_stores/types.py @@ -13,21 +13,25 @@ DEFAULT_PERSIST_FNAME = "vector_store.json" @dataclass -class NodeEmbeddingResult: - """Node embedding result. +class NodeWithEmbedding: + """Node with embedding. 
Args: - id (str): Node id node (Node): Node embedding (List[float]): Embedding - doc_id (str): Document id """ - id: str node: Node embedding: List[float] - doc_id: str + + @property + def id(self) -> str: + return self.node.get_doc_id() + + @property + def ref_doc_id(self) -> str: + return self.node.ref_doc_id or "None" @dataclass @@ -83,7 +87,7 @@ class VectorStore(Protocol): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to vector store.""" ... diff --git a/llama_index/vector_stores/weaviate.py b/llama_index/vector_stores/weaviate.py index 5db29827c90d556186589fd148bcc9de1aa5f8a5..d1f863470c9c77e99883427fdfebb49c5d403668 100644 --- a/llama_index/vector_stores/weaviate.py +++ b/llama_index/vector_stores/weaviate.py @@ -14,7 +14,7 @@ from llama_index.readers.weaviate.client import ( ) from llama_index.readers.weaviate.utils import get_default_class_prefix from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -75,12 +75,12 @@ class WeaviateVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to index. 
Args - embedding_results: List[NodeEmbeddingResult]: list of embedding results + embedding_results: List[NodeWithEmbedding]: list of embedding results """ for result in embedding_results: diff --git a/tests/indices/composability/test_utils.py b/tests/indices/composability/test_utils.py index ac1d523161bf636eadab24dcb54215bc5e975494..0f4eefe9257007aef0b9c06be415aadced56597e 100644 --- a/tests/indices/composability/test_utils.py +++ b/tests/indices/composability/test_utils.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQueryResult, VectorStoreQuery, @@ -24,7 +24,7 @@ class MockVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: """Add embedding results to vector store.""" raise NotImplementedError() diff --git a/tests/indices/vector_store/test_faiss.py b/tests/indices/vector_store/test_faiss.py index 7b2bb6a67f4fe20c28460054f6cb7dcdddb64560..37a1676290de8495c9e9a9e9367262d8e6e06254 100644 --- a/tests/indices/vector_store/test_faiss.py +++ b/tests/indices/vector_store/test_faiss.py @@ -13,7 +13,7 @@ from llama_index.indices.vector_store.base import GPTVectorStoreIndex from llama_index.readers.schema.base import Document from llama_index.storage.storage_context import StorageContext from llama_index.vector_stores.faiss import FaissVectorStore -from llama_index.vector_stores.types import NodeEmbeddingResult, VectorStoreQuery +from llama_index.vector_stores.types import NodeWithEmbedding, VectorStoreQuery @pytest.mark.skipif("CI" in os.environ, reason="no FAISS in CI") @@ -71,11 +71,9 @@ def test_persist(tmp_path: Path) -> None: vector_store.add( [ - NodeEmbeddingResult( - id="test id", + NodeWithEmbedding( node=Node("test text"), embedding=[0, 0, 0, 1, 1], - doc_id="test_doc", ) ] ) diff --git a/tests/indices/vector_store/test_lancedb.py 
b/tests/indices/vector_store/test_lancedb.py index 275d92e3ea43cad673b51db971ce3b0ce574f97c..f7a153020321021b1e431c4a2b1f79c7b599101e 100644 --- a/tests/indices/vector_store/test_lancedb.py +++ b/tests/indices/vector_store/test_lancedb.py @@ -6,7 +6,7 @@ from llama_index.indices.service_context import ServiceContext from llama_index.storage.storage_context import StorageContext from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryResult, @@ -37,7 +37,7 @@ class MockLanceDBVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: return [] diff --git a/tests/indices/vector_store/test_milvus.py b/tests/indices/vector_store/test_milvus.py index 7454adbf45398108e931ef90272431b420f618b0..c69cec0a5a2d6832c930fe3dcff60e5a98ec4058 100644 --- a/tests/indices/vector_store/test_milvus.py +++ b/tests/indices/vector_store/test_milvus.py @@ -7,7 +7,7 @@ from llama_index.indices.vector_store import GPTVectorStoreIndex from llama_index.storage.storage_context import StorageContext from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryResult, @@ -48,7 +48,7 @@ class MockMilvusVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: return [] diff --git a/tests/indices/vector_store/test_weaviate.py b/tests/indices/vector_store/test_weaviate.py index fcf35b9d0c355401dc982c86d00f83dca292423b..8286d31f63316fa1088f216a688965bd05479fc0 100644 --- a/tests/indices/vector_store/test_weaviate.py +++ b/tests/indices/vector_store/test_weaviate.py @@ -5,7 +5,7 @@ from llama_index.indices.service_context import ServiceContext from llama_index.indices.vector_store import GPTVectorStoreIndex from llama_index.storage.storage_context import 
StorageContext from llama_index.vector_stores.types import ( - NodeEmbeddingResult, + NodeWithEmbedding, VectorStore, VectorStoreQuery, VectorStoreQueryResult, @@ -30,7 +30,7 @@ class MockWeaviateVectorStore(VectorStore): def add( self, - embedding_results: List[NodeEmbeddingResult], + embedding_results: List[NodeWithEmbedding], ) -> List[str]: return [] diff --git a/tests/vector_stores/test_qdrant.py b/tests/vector_stores/test_qdrant.py index 6ebcae25c131859ff3c97b53e633cf3790856656..9dd280ce420292a7ef8660c85ba61c720a3d875f 100644 --- a/tests/vector_stores/test_qdrant.py +++ b/tests/vector_stores/test_qdrant.py @@ -7,36 +7,35 @@ try: except ImportError: qdrant_client = None # type: ignore -from llama_index.data_structs import Node +from llama_index.data_structs.node import Node, DocumentRelationship from llama_index.vector_stores import QdrantVectorStore -from llama_index.vector_stores.types import NodeEmbeddingResult, VectorStoreQuery +from llama_index.vector_stores.types import NodeWithEmbedding, VectorStoreQuery @pytest.fixture -def node() -> Node: - return Node(text="lorem ipsum") - - -@pytest.fixture -def node_embeddings(node: Node) -> List[NodeEmbeddingResult]: +def node_embeddings() -> List[NodeWithEmbedding]: return [ - NodeEmbeddingResult( - id="c330d77f-90bd-4c51-9ed2-57d8d693b3b0", + NodeWithEmbedding( embedding=[1.0, 0.0], - doc_id="test-0", - node=node, + node=Node( + text="lorem ipsum", + doc_id="c330d77f-90bd-4c51-9ed2-57d8d693b3b0", + relationships={DocumentRelationship.SOURCE: "test-0"}, + ), ), - NodeEmbeddingResult( - id="c3d1e1dd-8fb4-4b8f-b7ea-7fa96038d39d", + NodeWithEmbedding( embedding=[0.0, 1.0], - doc_id="test-1", - node=node, + node=Node( + text="lorem ipsum", + doc_id="c3d1e1dd-8fb4-4b8f-b7ea-7fa96038d39d", + relationships={DocumentRelationship.SOURCE: "test-1"}, + ), ), ] @pytest.mark.skipif(qdrant_client is None, reason="qdrant-client not installed") -def test_add_stores_data(node_embeddings: List[NodeEmbeddingResult]) -> None: +def 
test_add_stores_data(node_embeddings: List[NodeWithEmbedding]) -> None: client = qdrant_client.QdrantClient(":memory:") qdrant_vector_store = QdrantVectorStore(collection_name="test", client=client) diff --git a/tests/vector_stores/test_weaviate.py b/tests/vector_stores/test_weaviate.py index 05d1b4ebc778b0b434c75f0febc7e12f43f6e18f..89a4fc583df65345c5b9966417e974efead60a8c 100644 --- a/tests/vector_stores/test_weaviate.py +++ b/tests/vector_stores/test_weaviate.py @@ -1,7 +1,7 @@ import sys from unittest.mock import MagicMock -from llama_index.data_structs.node import Node -from llama_index.vector_stores.types import NodeEmbeddingResult +from llama_index.data_structs.node import DocumentRelationship, Node +from llama_index.vector_stores.types import NodeWithEmbedding from llama_index.vector_stores.weaviate import WeaviateVectorStore @@ -17,11 +17,13 @@ def test_weaviate_add() -> None: vector_store.add( [ - NodeEmbeddingResult( - id="test node id", - node=Node(text="test node text"), + NodeWithEmbedding( + node=Node( + text="test node text", + doc_id="test node id", + relationships={DocumentRelationship.SOURCE: "test doc id"}, + ), embedding=[0.5, 0.5], - doc_id="test doc id", ) ] )