Unverified Commit 53095d8b authored by Adam Schiller, committed by GitHub

Allow using different runtime kwargs for different evaluators in BatchEvalRunner (#11727)

* update batch_runner to allow multiple runtime kwargs when using multiple evaluators while maintaining backwards compatibility

* add tests for batch_runner

* remove unused mocker argument from tests

* mark tests as async to appease the CI/CD gods

* Combine two tests into one to appease the CI/CD gods
parent 529b5f6a
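For context, the runner can now take evaluator kwargs in either the original flat format or the new per-evaluator nested format. A minimal usage sketch (the evaluator names and data are illustrative, and CorrectnessEvaluator is just one example of an evaluator that accepts a reference kwarg; it assumes a default LLM is configured via Settings):

from llama_index.core.evaluation import BatchEvalRunner, CorrectnessEvaluator

runner = BatchEvalRunner(
    evaluators={
        "correctness_a": CorrectnessEvaluator(),
        "correctness_b": CorrectnessEvaluator(),
    }
)

# Old format (still supported): one kwarg list shared by every evaluator.
results = runner.evaluate_response_strs(
    queries=["q1", "q2"],
    response_strs=["r1", "r2"],
    contexts_list=[["ctx1"], ["ctx2"]],
    reference=["ref1", "ref2"],
)

# New format: kwarg lists nested under each evaluator's name, so each
# evaluator can receive different references.
results = runner.evaluate_response_strs(
    queries=["q1", "q2"],
    response_strs=["r1", "r2"],
    contexts_list=[["ctx1"], ["ctx2"]],
    correctness_a={"reference": ["ref1", "ref2"]},
    correctness_b={"reference": ["alt1", "alt2"]},
)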
@@ -57,7 +57,8 @@ async def response_worker(
 class BatchEvalRunner:
-    """Batch evaluation runner.
+    """
+    Batch evaluation runner.
 
     Args:
         evaluators (Dict[str, BaseEvaluator]): Dictionary of evaluators.
@@ -96,7 +97,8 @@ class BatchEvalRunner:
         self,
         *inputs_list: Any,
     ) -> List[Any]:
-        """Validate and clean input lists.
+        """
+        Validate and clean input lists.
 
         Enforce that at least one of the inputs is not None.
         Make sure that all inputs have the same length.
@@ -123,10 +125,50 @@ class BatchEvalRunner:
             new_inputs_list.append(inputs)
         return new_inputs_list
 
+    def _validate_nested_eval_kwargs_types(
+        self, eval_kwargs_lists: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """
+        Ensure eval kwargs are in an acceptable format:
+        either a Dict[str, List] or a Dict[str, Dict[str, List]].
+
+        Allows use of different kwargs (e.g. references) with different
+        evaluators while keeping backwards compatibility for single evaluators.
+        """
+        if not isinstance(eval_kwargs_lists, dict):
+            raise ValueError(
+                f"eval_kwargs_lists must be a dict. Got {eval_kwargs_lists}"
+            )
+        for evaluator, eval_kwargs in eval_kwargs_lists.items():
+            if isinstance(eval_kwargs, list):
+                # maintain backwards compatibility - for use with single evaluator
+                eval_kwargs_lists[evaluator] = self._validate_and_clean_inputs(
+                    eval_kwargs
+                )[0]
+            elif isinstance(eval_kwargs, dict):
+                # for use with multiple evaluators
+                for k in eval_kwargs:
+                    v = eval_kwargs[k]
+                    if not isinstance(v, list):
+                        raise ValueError(
+                            f"nested inner values in eval_kwargs must be a list. Got {evaluator}: {k}: {v}"
+                        )
+                    eval_kwargs_lists[evaluator][k] = self._validate_and_clean_inputs(
+                        v
+                    )[0]
+            else:
+                raise ValueError(
+                    f"eval_kwargs must be a list or a dict. Got {evaluator}: {eval_kwargs}"
+                )
+        return eval_kwargs_lists
+
     def _get_eval_kwargs(
         self, eval_kwargs_lists: Dict[str, Any], idx: int
     ) -> Dict[str, Any]:
-        """Get eval kwargs from eval_kwargs_lists at a given idx.
+        """
+        Get eval kwargs from eval_kwargs_lists at a given idx.
 
         Since eval_kwargs_lists is a dict of lists, we need to get the
         value at idx for each key.
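To make the accepted shapes concrete, here is a standalone sketch of the two kwarg formats this validator normalizes (the names are illustrative):

# Flat format (single evaluator, backwards compatible): every value
# is a list with one entry per example.
flat_kwargs = {"reference": ["ref1", "ref2"]}

# Nested format (multiple evaluators): top-level keys are evaluator
# names, each mapping to its own dict of kwarg lists.
nested_kwargs = {
    "evaluator1": {"reference": ["ref1", "ref2"]},
    "evaluator2": {"reference": ["alt1", "alt2"]},
}

# Any other value shape (e.g. a bare string instead of a list) is
# rejected with a ValueError.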
@@ -139,9 +181,10 @@ class BatchEvalRunner:
         queries: Optional[List[str]] = None,
         response_strs: Optional[List[str]] = None,
         contexts_list: Optional[List[List[str]]] = None,
-        **eval_kwargs_lists: List,
+        **eval_kwargs_lists: Dict[str, Any],
     ) -> Dict[str, List[EvaluationResult]]:
-        """Evaluate query, response pairs.
+        """
+        Evaluate query, response pairs.
 
         This evaluates queries, responses, contexts as string inputs.
         Can supply additional kwargs to the evaluator in eval_kwargs_lists.
@@ -152,28 +195,30 @@ class BatchEvalRunner:
                 Defaults to None.
             contexts_list (Optional[List[List[str]]]): List of context lists.
                 Defaults to None.
-            **eval_kwargs_lists (Dict[str, Any]): Dict of lists of kwargs to
-                pass to evaluator. Defaults to None.
+            **eval_kwargs_lists (Dict[str, Any]): Dict of either dicts or lists
+                of kwargs to pass to evaluator. Defaults to None.
+                multiple evaluators: {evaluator: {kwarg: [list of values]}, ...}
+                single evaluator: {kwarg: [list of values]}
         """
         queries, response_strs, contexts_list = self._validate_and_clean_inputs(
            queries, response_strs, contexts_list
         )
-        for k in eval_kwargs_lists:
-            v = eval_kwargs_lists[k]
-            if not isinstance(v, list):
-                raise ValueError(
-                    f"Each value in eval_kwargs must be a list. Got {k}: {v}"
-                )
-            eval_kwargs_lists[k] = self._validate_and_clean_inputs(v)[0]
+        eval_kwargs_lists = self._validate_nested_eval_kwargs_types(eval_kwargs_lists)
 
         # run evaluations
         eval_jobs = []
         for idx, query in enumerate(cast(List[str], queries)):
             response_str = cast(List, response_strs)[idx]
             contexts = cast(List, contexts_list)[idx]
-            eval_kwargs = self._get_eval_kwargs(eval_kwargs_lists, idx)
             for name, evaluator in self.evaluators.items():
+                if name in eval_kwargs_lists:
+                    # multi-evaluator
+                    kwargs = eval_kwargs_lists[name]
+                else:
+                    # single evaluator (maintain backwards compatibility)
+                    kwargs = eval_kwargs_lists
+                eval_kwargs = self._get_eval_kwargs(kwargs, idx)
                 eval_jobs.append(
                     eval_worker(
                         self.semaphore,
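The per-index lookup the loop relies on reduces to indexing each kwarg list; a standalone sketch of that behavior (a re-implementation for illustration, not the library code itself):

from typing import Any, Dict

def get_eval_kwargs(eval_kwargs_lists: Dict[str, Any], idx: int) -> Dict[str, Any]:
    # Pick the idx-th value out of each kwarg list.
    return {k: v[idx] for k, v in eval_kwargs_lists.items()}

assert get_eval_kwargs({"reference": ["ref1", "ref2"]}, 1) == {"reference": "ref2"}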
@@ -196,7 +241,8 @@ class BatchEvalRunner:
         responses: Optional[List[Response]] = None,
         **eval_kwargs_lists: Dict[str, Any],
     ) -> Dict[str, List[EvaluationResult]]:
-        """Evaluate query, response pairs.
+        """
+        Evaluate query, response pairs.
 
         This evaluates queries and response objects.
@@ -204,25 +250,27 @@ class BatchEvalRunner:
             queries (Optional[List[str]]): List of query strings. Defaults to None.
             responses (Optional[List[Response]]): List of response objects.
                 Defaults to None.
-            **eval_kwargs_lists (Dict[str, Any]): Dict of lists of kwargs to
-                pass to evaluator. Defaults to None.
+            **eval_kwargs_lists (Dict[str, Any]): Dict of either dicts or lists
+                of kwargs to pass to evaluator. Defaults to None.
+                multiple evaluators: {evaluator: {kwarg: [list of values]}, ...}
+                single evaluator: {kwarg: [list of values]}
         """
         queries, responses = self._validate_and_clean_inputs(queries, responses)
-        for k in eval_kwargs_lists:
-            v = eval_kwargs_lists[k]
-            if not isinstance(v, list):
-                raise ValueError(
-                    f"Each value in eval_kwargs must be a list. Got {k}: {v}"
-                )
-            eval_kwargs_lists[k] = self._validate_and_clean_inputs(v)[0]
+        eval_kwargs_lists = self._validate_nested_eval_kwargs_types(eval_kwargs_lists)
 
         # run evaluations
         eval_jobs = []
         for idx, query in enumerate(cast(List[str], queries)):
             response = cast(List, responses)[idx]
-            eval_kwargs = self._get_eval_kwargs(eval_kwargs_lists, idx)
             for name, evaluator in self.evaluators.items():
+                if name in eval_kwargs_lists:
+                    # multi-evaluator
+                    kwargs = eval_kwargs_lists[name]
+                else:
+                    # single evaluator (maintain backwards compatibility)
+                    kwargs = eval_kwargs_lists
+                eval_kwargs = self._get_eval_kwargs(kwargs, idx)
                 eval_jobs.append(
                     eval_response_worker(
                         self.semaphore,
@@ -244,7 +292,8 @@ class BatchEvalRunner:
         queries: Optional[List[str]] = None,
         **eval_kwargs_lists: Dict[str, Any],
     ) -> Dict[str, List[EvaluationResult]]:
-        """Evaluate queries.
+        """
+        Evaluate queries.
 
         Args:
             query_engine (BaseQueryEngine): Query engine.
@@ -275,7 +324,8 @@ class BatchEvalRunner:
         contexts_list: Optional[List[List[str]]] = None,
         **eval_kwargs_lists: List,
     ) -> Dict[str, List[EvaluationResult]]:
-        """Evaluate query, response pairs.
+        """
+        Evaluate query, response pairs.
 
         Sync version of aevaluate_response_strs.
@@ -295,7 +345,8 @@ class BatchEvalRunner:
         responses: Optional[List[Response]] = None,
         **eval_kwargs_lists: Dict[str, Any],
     ) -> Dict[str, List[EvaluationResult]]:
-        """Evaluate query, response objs.
+        """
+        Evaluate query, response objs.
 
         Sync version of aevaluate_responses.
@@ -314,7 +365,8 @@ class BatchEvalRunner:
         queries: Optional[List[str]] = None,
         **eval_kwargs_lists: Dict[str, Any],
     ) -> Dict[str, List[EvaluationResult]]:
-        """Evaluate queries.
+        """
+        Evaluate queries.
 
         Sync version of aevaluate_queries.
from typing import Any, Optional, Sequence
from llama_index.core.base.response.schema import Response
from llama_index.core.evaluation import BaseEvaluator
from llama_index.core.evaluation.base import EvaluationResult
from llama_index.core.prompts.mixin import PromptDictType
from llama_index.core.evaluation.batch_runner import BatchEvalRunner
class MockEvaluator(BaseEvaluator):
    def __init__(
        self,
        mock_score: float = 1.0,
        mock_passing: bool = True,
        mock_feedback: str = "test feedback",
    ) -> None:
        self._mock_score = mock_score
        self._mock_passing = mock_passing
        self._mock_feedback = mock_feedback

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {}

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        return EvaluationResult(
            query=query,
            contexts=contexts,
            response=response,
            passing=(
                str(response) == str(reference) if reference else self._mock_passing
            ),
            score=self._mock_score,
            feedback=self._mock_feedback,
        )
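# Note: MockEvaluator passes an example only when str(response) equals
# str(reference); with no reference it falls back to the mock_passing
# default. For example, aevaluate(response="response1", reference="response1")
# yields passing=True, while aevaluate(response="response2",
# reference="response1") yields passing=False.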
def get_eval_results(key, eval_results):
    results = eval_results[key]
    correct = 0
    for result in results:
        if result.passing:
            correct += 1
    return correct / len(results)
def test_batch_runner() -> None:
    # single evaluator
    runner = BatchEvalRunner(evaluators={"evaluator1": MockEvaluator()})
    exp_queries = ["query1", "query2"]
    exp_response_strs = ["response1", "response2"]
    exp_responses = [
        Response(response="response1", source_nodes=[]),
        Response(response="response2", source_nodes=[]),
    ]
    # original eval_kwargs_lists format - Dict[str, List]
    exp_kwargs = {"reference": ["response1", "response1"]}

    # test evaluate_response_strs()
    results = runner.evaluate_response_strs(
        queries=exp_queries, response_strs=exp_response_strs, **exp_kwargs
    )
    assert get_eval_results("evaluator1", results) == 0.5

    # test evaluate_responses()
    results = runner.evaluate_responses(
        queries=exp_queries, responses=exp_responses, **exp_kwargs
    )
    assert get_eval_results("evaluator1", results) == 0.5

    # multiple evaluators
    runner.evaluators = {
        "evaluator1": MockEvaluator(),
        "evaluator2": MockEvaluator(),
    }
    exp_queries = ["query1", "query2"]
    exp_response_strs = ["response1", "response2"]
    exp_responses = [
        Response(response="response1", source_nodes=[]),
        Response(response="response2", source_nodes=[]),
    ]
    # updated eval_kwargs_lists format - Dict[str, Dict[str, List]]
    exp_kwargs = {
        "evaluator1": {"reference": ["response1", "response1"]},
        "evaluator2": {"reference": ["response1", "response2"]},
    }

    # test evaluate_response_strs()
    results = runner.evaluate_response_strs(
        queries=exp_queries, response_strs=exp_response_strs, **exp_kwargs
    )
    assert get_eval_results("evaluator1", results) == 0.5
    assert get_eval_results("evaluator2", results) == 1.0

    # test evaluate_responses()
    results = runner.evaluate_responses(
        queries=exp_queries, responses=exp_responses, **exp_kwargs
    )
    assert get_eval_results("evaluator1", results) == 0.5
    assert get_eval_results("evaluator2", results) == 1.0
assert get_eval_results("evaluator1", results) == 0.5
assert get_eval_results("evaluator2", results) == 1.0