diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d37f38e1c85b81d8c47fad1e262d40d389b39c1..51e2565242c379b02036e1c20dfd605c1e15c1e2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,11 @@
 
 ## Unreleased
 
+### New Features
+- Added `Konko` LLM support (#7775)
+
 ### Bug Fixes / Nits
+- Normalize scores returned from ElasticSearch vector store (#7792)
 - Fixed `refresh_ref_docs()` bug with order of operations (#7664)
 
 ## [0.8.33] - 2023-09-25
diff --git a/llama_index/vector_stores/elasticsearch.py b/llama_index/vector_stores/elasticsearch.py
index 7133792984b7ad37ae8a5218bcb0f4ee7645ae94..46d0dda4657fb0d0a5e483ad701f3327552f511c 100644
--- a/llama_index/vector_stores/elasticsearch.py
+++ b/llama_index/vector_stores/elasticsearch.py
@@ -5,6 +5,8 @@ import uuid
 from logging import getLogger
 from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
 
+import numpy as np
+
 from llama_index.schema import BaseNode, MetadataMode, TextNode
 from llama_index.vector_stores.types import (
     MetadataFilters,
@@ -124,6 +126,11 @@ def _to_elasticsearch_filter(standard_filters: MetadataFilters) -> Dict[str, Any
     return {"bool": {"must": operands}}
 
 
+def _to_llama_similarities(scores: List[float]) -> List[float]:
+    scores_to_norm: np.ndarray = np.array(scores)
+    return np.exp(scores_to_norm - np.max(scores_to_norm)).tolist()
+
+
 class ElasticsearchStore(VectorStore):
     """Elasticsearch vector store.
 
@@ -546,5 +553,7 @@ class ElasticsearchStore(VectorStore):
                 top_k_ids.append(node_id)
                 top_k_scores.append(hit["_score"])
         return VectorStoreQueryResult(
-            nodes=top_k_nodes, ids=top_k_ids, similarities=top_k_scores
+            nodes=top_k_nodes,
+            ids=top_k_ids,
+            similarities=_to_llama_similarities(top_k_scores),
         )
diff --git a/tests/vector_stores/test_elasticsearch.py b/tests/vector_stores/test_elasticsearch.py
index 7425c527fae3e466b8e00030bca1bb4ed1330568..a54c2c27e4bf39e14a747a8982169a63dd957883 100644
--- a/tests/vector_stores/test_elasticsearch.py
+++ b/tests/vector_stores/test_elasticsearch.py
@@ -3,6 +3,7 @@ import os
 import uuid
 from typing import Any, Dict, Generator, List, Union
 
+import pandas as pd
 import pytest
 
 from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode
@@ -25,8 +26,12 @@
     es_client.info()
 
     elasticsearch_not_available = False
+
+    es_license = es_client.license.get()
+    basic_license: bool = "basic" == es_license["license"]["type"]
 except (ImportError, Exception):
     elasticsearch_not_available = True
+    basic_license = True
 
 
 @pytest.fixture(scope="session")
@@ -110,6 +115,39 @@ def node_embeddings() -> List[TextNode]:
             },
             embedding=[0.0, 0.0, 1.0],
         ),
+        TextNode(
+            text="I was taught that the way of progress was neither swift nor easy.",
+            id_="0b31ae71-b797-4e88-8495-031371a7752e",
+            relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="text-3")},
+            metadata={
+                "author": "Marie Curie",
+            },
+            embedding=[0.0, 0.0, 0.9],
+        ),
+        TextNode(
+            text=(
+                "The important thing is not to stop questioning."
+                + " Curiosity has its own reason for existing."
+            ),
+            id_="bd2e080b-159a-4030-acc3-d98afd2ba49b",
+            relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="text-4")},
+            metadata={
+                "author": "Albert Einstein",
+            },
+            embedding=[0.0, 0.0, 0.5],
+        ),
+        TextNode(
+            text=(
+                "I am no bird; and no net ensnares me;"
+                + " I am a free human being with an independent will."
+            ),
+            id_="f658de3b-8cef-4d1c-8bed-9a263c907251",
+            relationships={NodeRelationship.SOURCE: RelatedNodeInfo(node_id="text-5")},
+            metadata={
+                "author": "Charlotte Bronte",
+            },
+            embedding=[0.0, 0.0, 0.3],
+        ),
     ]
 
 
@@ -124,22 +162,25 @@ def test_instance_creation(index_name: str, elasticsearch_connection: Dict) -> N
     assert isinstance(es_store, ElasticsearchStore)
 
 
+@pytest.fixture(scope="function")
+def es_store(index_name: str, elasticsearch_connection: Dict) -> ElasticsearchStore:
+    return ElasticsearchStore(
+        **elasticsearch_connection,
+        index_name=index_name,
+        distance_strategy="EUCLIDEAN_DISTANCE",
+    )
+
+
 @pytest.mark.skipif(
     elasticsearch_not_available, reason="elasticsearch is not available"
 )
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [True, False])
 async def test_add_to_es_and_query(
-    index_name: str,
-    elasticsearch_connection: Dict,
+    es_store: ElasticsearchStore,
     node_embeddings: List[TextNode],
     use_async: bool,
 ) -> None:
-    es_store = ElasticsearchStore(
-        **elasticsearch_connection,
-        index_name=index_name,
-        distance_strategy="COSINE",
-    )
     if use_async:
         await es_store.async_add(node_embeddings)
         res = await es_store.aquery(
@@ -160,16 +201,10 @@
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [True, False])
 async def test_add_to_es_and_text_query(
-    index_name: str,
-    elasticsearch_connection: Dict,
+    es_store: ElasticsearchStore,
     node_embeddings: List[TextNode],
     use_async: bool,
 ) -> None:
-    es_store = ElasticsearchStore(
-        **elasticsearch_connection,
-        index_name=index_name,
-        distance_strategy="COSINE",
-    )
     if use_async:
         await es_store.async_add(node_embeddings)
         res = await es_store.aquery(
@@ -193,21 +228,17 @@
 
 
 @pytest.mark.skipif(
-    elasticsearch_not_available, reason="elasticsearch is not available"
+    elasticsearch_not_available,
+    basic_license,
+    reason="elasticsearch is not available or license is basic",
 )
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [True, False])
 async def test_add_to_es_and_hybrid_query(
-    index_name: str,
-    elasticsearch_connection: Dict,
+    es_store: ElasticsearchStore,
     node_embeddings: List[TextNode],
     use_async: bool,
 ) -> None:
-    es_store = ElasticsearchStore(
-        **elasticsearch_connection,
-        index_name=index_name,
-        distance_strategy="COSINE",
-    )
     if use_async:
         await es_store.async_add(node_embeddings)
         res = await es_store.aquery(
@@ -238,16 +269,10 @@
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [True, False])
 async def test_add_to_es_query_with_filters(
-    index_name: str,
-    elasticsearch_connection: Dict,
+    es_store: ElasticsearchStore,
     node_embeddings: List[TextNode],
     use_async: bool,
 ) -> None:
-    es_store = ElasticsearchStore(
-        **elasticsearch_connection,
-        index_name=index_name,
-        distance_strategy="COSINE",
-    )
     filters = MetadataFilters(
         filters=[ExactMatchFilter(key="author", value="Stephen King")]
     )
@@ -271,16 +296,10 @@
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [True, False])
 async def test_add_to_es_query_with_es_filters(
-    index_name: str,
-    elasticsearch_connection: Dict,
+    es_store: ElasticsearchStore,
     node_embeddings: List[TextNode],
     use_async: bool,
 ) -> None:
-    es_store = ElasticsearchStore(
-        **elasticsearch_connection,
-        index_name=index_name,
-        distance_strategy="COSINE",
-    )
     q = VectorStoreQuery(query_embedding=[1.0, 0.0, 0.0], similarity_top_k=10)
     if use_async:
         await es_store.async_add(node_embeddings)
@@ -303,16 +322,10 @@ async def test_add_to_es_query_with_es_filters(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [True, False])
 async def test_add_to_es_query_and_delete(
-    index_name: str,
-    elasticsearch_connection: Dict,
+    es_store: ElasticsearchStore,
     node_embeddings: List[TextNode],
     use_async: bool,
 ) -> None:
-    es_store = ElasticsearchStore(
-        **elasticsearch_connection,
-        index_name=index_name,
-        distance_strategy="COSINE",
-    )
     q = VectorStoreQuery(query_embedding=[1.0, 0.0, 0.0], similarity_top_k=1)
 
     if use_async:
@@ -333,4 +346,82 @@
         res = es_store.query(q)
     assert res.nodes
     assert len(res.nodes) == 1
-    assert res.nodes[0].node_id == "c3d1e1dd-8fb4-4b8f-b7ea-7fa96038d39d"
+    assert res.nodes[0].node_id == "f658de3b-8cef-4d1c-8bed-9a263c907251"
+
+
+@pytest.mark.skipif(
+    elasticsearch_not_available, reason="elasticsearch is not available"
+)
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [True, False])
+async def test_add_to_es_and_embed_query_ranked(
+    es_store: ElasticsearchStore,
+    node_embeddings: List[TextNode],
+    use_async: bool,
+) -> None:
+    einstein_bronte_curie = [
+        "bd2e080b-159a-4030-acc3-d98afd2ba49b",
+        "f658de3b-8cef-4d1c-8bed-9a263c907251",
+        "0b31ae71-b797-4e88-8495-031371a7752e",
+    ]
+    query_get_1_first = VectorStoreQuery(
+        query_embedding=[0.0, 0.0, 0.5], similarity_top_k=3
+    )
+    await check_top_match(
+        es_store, node_embeddings, use_async, query_get_1_first, *einstein_bronte_curie
+    )
+
+
+@pytest.mark.skipif(
+    elasticsearch_not_available, reason="elasticsearch is not available"
+)
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [True, False])
+async def test_add_to_es_and_text_query_ranked(
+    es_store: ElasticsearchStore,
+    node_embeddings: List[TextNode],
+    use_async: bool,
+) -> None:
+    node1 = "0b31ae71-b797-4e88-8495-031371a7752e"
+    node2 = "f658de3b-8cef-4d1c-8bed-9a263c907251"
+
+    query_get_1_first = VectorStoreQuery(
+        query_str="I was", mode=VectorStoreQueryMode.TEXT_SEARCH, similarity_top_k=2
+    )
+    await check_top_match(
+        es_store, node_embeddings, use_async, query_get_1_first, node1, node2
+    )
+
+    query_get_2_first = VectorStoreQuery(
+        query_str="I am", mode=VectorStoreQueryMode.TEXT_SEARCH, similarity_top_k=2
+    )
+    await check_top_match(
+        es_store, node_embeddings, use_async, query_get_2_first, node2, node1
+    )
+
+
+async def check_top_match(
+    es_store: ElasticsearchStore,
+    node_embeddings: List[TextNode],
+    use_async: bool,
+    query: VectorStoreQuery,
+    *expected_nodes: str,
+) -> None:
+    if use_async:
+        await es_store.async_add(node_embeddings)
+        res = await es_store.aquery(query)
+    else:
+        es_store.add(node_embeddings)
+        res = es_store.query(query)
+    assert res.nodes
+    # test that the nodes are returned in the expected order
+    for i, node in enumerate(expected_nodes):
+        assert res.nodes[i].node_id == node
+    # test the returned order is in descending order w.r.t. similarities
+    # test similarities are normalized to (0, 1]
+    df = pd.DataFrame({"node": res.nodes, "sim": res.similarities, "id": res.ids})
+    sorted_by_sim = df.sort_values(by="sim", ascending=False)
+    for idx, item in enumerate(sorted_by_sim.itertuples()):
+        res_node = res.nodes[idx]
+        assert res_node.node_id == item.id
+        assert 0 <= item.sim <= 1
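A quick sanity check of the new `_to_llama_similarities` helper (a minimal sketch; the sample scores below are illustrative, not taken from the tests above). Each raw Elasticsearch `_score` is shifted by the batch maximum and exponentiated, so the top hit maps to 1.0 and every similarity lands in (0, 1], which is what `check_top_match` asserts:

    import numpy as np

    def to_llama_similarities(scores):
        # mirrors _to_llama_similarities added in llama_index/vector_stores/elasticsearch.py
        scores_to_norm = np.array(scores)
        return np.exp(scores_to_norm - np.max(scores_to_norm)).tolist()

    print(to_llama_similarities([1.5, 1.2, 0.3]))
    # approximately [1.0, 0.7408, 0.3012]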