diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000000000000000000000000000000..e9e6a805e7a729113356b60277e0afe47416e436 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,11 @@ +{ + "python.testing.unittestArgs": [ + "-v", + "-s", + "./tests", + "-p", + "test_*.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true +} \ No newline at end of file diff --git a/src/llama_recipes/data/llama_guard/__init__.py b/src/llama_recipes/data/llama_guard/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..472f75bc0c11cff27b14880095eb57d79e9e1c48 --- /dev/null +++ b/src/llama_recipes/data/llama_guard/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama Guard License Agreement. \ No newline at end of file diff --git a/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py b/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py new file mode 100644 index 0000000000000000000000000000000000000000..a6a316eb1005fd702c486cc75eb6b213a069804f --- /dev/null +++ b/src/llama_recipes/data/llama_guard/finetuning_data_formatter.py @@ -0,0 +1,481 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama Guard License Agreement. 
+ +import copy +import random +from dataclasses import dataclass +from enum import Enum +from typing import Dict, List, Literal, Optional, Sequence + + +@dataclass +class Category: + name: str + description: str + + +@dataclass +class Guidelines: + categories: Sequence[Category] + category_code_prefix: str = "O" + + +class ExplanationPosition(Enum): + BEFORE_DECISION = 0 + AFTER_DECISION = 1 + + +@dataclass +class LlamaGuardPromptConfigs: + instructions_format_string: str + should_include_category_descriptions: bool + should_shuffle_category_codes: bool = True + + +@dataclass +class LlamaGuardGenerationConfigs: + should_list_violated_codes: bool + explanation_position: Optional[ExplanationPosition] + + +@dataclass +class AugmentationConfigs: + probability_to_add_safe_examples_with_empty_responses: float = 0 + explanation_for_augmentation_with_safe_example_with_empty_response: Optional[ + str + ] = None + should_add_examples_with_dropped_nonviolated_prompt_categories: bool = True + should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories: bool = ( + False + ) + explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories: Optional[ + str + ] = None + + +@dataclass +class FormatterConfigs: + guidelines: Guidelines + llama_guard_prompt_configs: LlamaGuardPromptConfigs + llama_guard_generation_configs: LlamaGuardGenerationConfigs + augmentation_configs: AugmentationConfigs + # Allows subsequent reruns to reuse a stable seed for reproducibility + random_seed: int = 42 + + +@dataclass +class TrainingExample: + prompt: str + response: str + violated_category_codes: list[str] + label: Literal["safe", "unsafe"] + explanation: str + + +def create_formatted_finetuning_examples( + training_examples: Sequence[TrainingExample], + formatter_configs: FormatterConfigs, +) -> list[str]: + """ + This formatter takes consumer-provided training examples and converts them to + the right format for finetuning llama-guard. 
+ + There are various configuration options available. + + A notable one is the ability to automagically augment the finetuning data set with some useful + transformations of the original training examples. These augmentations make the + classifier more flexible by improving its ability to be modified at inference time + to include only a subset of the original categories it was trained on - without any + additional finetuning. + + Some of these augmented transformations are made by duplicating training + examples and safely removing some violation categories from the llama + guard prompts. Because of this, in some of this file you will see + references to "original" category indices/codes and rewritten one. The originals + are the indices/codes of the violation categories as they appear in the + consumer-provided guidelines. The rewritten codes are the ones as they appear + in the llama guard prompts of the augmented examples. We occasionally need to + convert between the two. + """ + _verify_formatter_configs(formatter_configs) + + random.seed(formatter_configs.random_seed) + + indices_of_all_categories = range(len(formatter_configs.guidelines.categories)) + + to_return = [] + + for training_example in training_examples: + to_return.append( + _create_formatted_finetuning_example( + training_example, + formatter_configs, + category_indeces_to_include_in_llama_guard_prompt=list( + indices_of_all_categories + ), + ) + ) + + _maybe_add_data_augmentations_for_example( + training_example, to_return, indices_of_all_categories, formatter_configs + ) + + return to_return + + +def _verify_formatter_configs( + formatter_configs: FormatterConfigs, +) -> None: + if ( + formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses + > 0 + and formatter_configs.llama_guard_generation_configs.explanation_position + is not None + and formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response + is None + 
): + raise ValueError( + """The configuration setup requires you to specify + explanation_for_augmentation_with_safe_example_with_empty_response. This is an + explanation that we use for dynamically-created safe augmentation examples. + Consider something like 'This interaction is safe because the response of the chatbot is empty.'""" + ) + + if ( + formatter_configs.augmentation_configs.should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories + > 0 + and formatter_configs.llama_guard_generation_configs.explanation_position + is not None + and formatter_configs.augmentation_configs.explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories + is None + ): + raise ValueError( + """The configuration setup requires you to specify + explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories. + This is an explanation that we use for dynamically-created safe augmentation examples. + Consider something like 'This interaction is safe because any riskiness it contains + is related to violation categories that we're explicitly not trying to detect here.'""" + ) + + +def _create_formatted_finetuning_example( + training_example: TrainingExample, + formatter_configs: FormatterConfigs, + category_indeces_to_include_in_llama_guard_prompt: List[int], +) -> str: + if formatter_configs.llama_guard_prompt_configs.should_shuffle_category_codes: + random.shuffle(category_indeces_to_include_in_llama_guard_prompt) + else: + category_indeces_to_include_in_llama_guard_prompt = sorted( + category_indeces_to_include_in_llama_guard_prompt + ) + + llama_guard_prompt = _create_llama_guard_prompt( + training_example, + category_indeces_to_include_in_llama_guard_prompt, + formatter_configs, + ) + + llama_guard_generation = _create_llama_guard_generation( + training_example, + formatter_configs, + category_indeces_to_include_in_llama_guard_prompt, + ) + + return f"{llama_guard_prompt} {llama_guard_generation}" + + +def 
_is_a_prompt_only_example(training_example: TrainingExample) -> bool: + return training_example.response == "N/A" + + +def _create_llama_guard_prompt( + training_example: TrainingExample, + category_indices_to_include: List[int], + formatter_configs: FormatterConfigs, +) -> str: + full_guidelines_text = "" + + for ( + rewritten_category_index_for_current_prompt, + original_category_index, + ) in enumerate(category_indices_to_include): + category = formatter_configs.guidelines.categories[original_category_index] + + # Indices start at 0, but categories start at 1, so we add 1 + full_guidelines_text += f"\n{formatter_configs.guidelines.category_code_prefix}{rewritten_category_index_for_current_prompt + 1}: {category.name}. " + + if ( + formatter_configs.llama_guard_prompt_configs.should_include_category_descriptions + ): + full_guidelines_text += f"\n{category.description}" + + conversation = {"human": training_example.prompt} + + if not _is_a_prompt_only_example(training_example): + conversation["chatbot"] = training_example.response + + return formatter_configs.llama_guard_prompt_configs.instructions_format_string.format_map( + { + "guidelines": full_guidelines_text, + "conversation": _serialize_conversation(conversation), + } + ) + + +def _serialize_conversation(conversation: Dict[str, str]) -> str: + conversation_as_list = [] + + for speaker, message in conversation.items(): + conversation_as_list.append(f"{speaker}: {message}") + + return "\n\n".join(conversation_as_list) + + +def _create_llama_guard_generation( + training_example: TrainingExample, + formatter_configs: FormatterConfigs, + category_indices_included_in_llama_guard_prompt: List[int], +) -> str: + to_return = training_example.label + + if ( + training_example.label == "unsafe" + and formatter_configs.llama_guard_generation_configs.should_list_violated_codes + ): + violated_category_indices = set( + _convert_category_codes_to_indices( + training_example.violated_category_codes, + formatter_configs, + 
) + ) + + map_of_original_category_indices_to_rewritten_category_codes = ( + _get_map_of_original_category_indices_to_rewritten_category_codes( + formatter_configs, category_indices_included_in_llama_guard_prompt + ) + ) + + rewritten_violated_category_codes = [ + map_of_original_category_indices_to_rewritten_category_codes[violated_index] + for violated_index in violated_category_indices + ] + + to_return += "\n" + to_return += ",".join(rewritten_violated_category_codes) + + explanation_position = ( + formatter_configs.llama_guard_generation_configs.explanation_position + ) + + if explanation_position == ExplanationPosition.BEFORE_DECISION: + to_return = f"Explanation: {training_example.explanation}\n{to_return}" + elif explanation_position == ExplanationPosition.AFTER_DECISION: + to_return = f"{to_return}\nExplanation: {training_example.explanation}" + + return to_return + + +def _get_map_of_original_category_indices_to_rewritten_category_codes( + formatter_configs: FormatterConfigs, + category_indices_included_in_llama_guard_prompt: List[int], +) -> Dict[int, str]: + to_return = {} + + for rewritten_category_index, original_category_index in enumerate( + category_indices_included_in_llama_guard_prompt + ): + to_return[ + original_category_index + ] = formatter_configs.guidelines.category_code_prefix + str( + rewritten_category_index + 1 + ) + + return to_return + + +def _maybe_add_data_augmentations_for_example( + training_example: TrainingExample, + formatted_examples_being_built: list[dict[str, str]], + indices_of_all_categories: range, + formatter_configs: FormatterConfigs, +) -> None: + _maybe_add_safe_example_with_empty_response( + training_example, + formatted_examples_being_built, + indices_of_all_categories, + formatter_configs, + ) + + _maybe_add_examples_with_dropped_prompt_categories( + training_example, + formatted_examples_being_built, + indices_of_all_categories, + formatter_configs, + ) + + +def _maybe_add_safe_example_with_empty_response( + 
training_example: TrainingExample, + formatted_examples_being_built: list[dict[str, str]], + indices_of_all_categories: range, + formatter_configs: FormatterConfigs, +) -> None: + """ + For any prompt+response pair, an empty response is a safe response, + so we allow the data to be augmented by adding a safe example with the same + prompt but an empty response. + """ + if ( + not _is_a_prompt_only_example(training_example) + and training_example.response != "" + and random.random() + < formatter_configs.augmentation_configs.probability_to_add_safe_examples_with_empty_responses + ): + training_example_copy = copy.deepcopy(training_example) + training_example_copy.response = "" + training_example_copy.label = "safe" + training_example_copy.violated_category_codes = [] + training_example_copy.explanation = ( + formatter_configs.augmentation_configs.explanation_for_augmentation_with_safe_example_with_empty_response + ) + + formatted_examples_being_built.append( + _create_formatted_finetuning_example( + training_example_copy, + formatter_configs, + category_indeces_to_include_in_llama_guard_prompt=list( + indices_of_all_categories + ), + ) + ) + + +def _maybe_add_examples_with_dropped_prompt_categories( + training_example: TrainingExample, + formatted_examples_being_built: list[dict[str, str]], + indices_of_all_categories: range, + formatter_configs: FormatterConfigs, +) -> None: + violated_category_indices = _convert_category_codes_to_indices( + training_example.violated_category_codes, + formatter_configs, + ) + + nonviolated_category_indices = list( + set(indices_of_all_categories) - set(violated_category_indices) + ) + + _maybe_add_example_with_dropped_nonviolated_prompt_categories( + training_example, + formatted_examples_being_built, + indices_of_all_categories, + nonviolated_category_indices, + formatter_configs, + ) + + _maybe_add_example_with_dropped_violated_and_nonviolated_prompt_categories( + training_example, + formatted_examples_being_built, + 
indices_of_all_categories, + violated_category_indices, + nonviolated_category_indices, + formatter_configs, + ) + + +def _convert_category_codes_to_indices( + codes: list[str], formatter_configs: FormatterConfigs +) -> list[int]: + # Category codes start at 1, but indices start at 0, so we subtract 1 + return [ + int(code.lstrip(formatter_configs.guidelines.category_code_prefix)) - 1 + for code in codes + ] + + +def _maybe_add_example_with_dropped_nonviolated_prompt_categories( + training_example: TrainingExample, + formatted_examples_being_built: list[dict[str, str]], + indices_of_all_categories: range, + nonviolated_category_indices: list[int], + formatter_configs: FormatterConfigs, +) -> None: + """ + If a prompt+response pair does not violate certain categories, we can augment + the data by duplicating the training example but removing some of the non-violated + categories from the llama guard prompt. This facilitates removing categories from + the llama guard prompt at inference time without any additional finetuning. 
+ """ + if ( + not formatter_configs.augmentation_configs.should_add_examples_with_dropped_nonviolated_prompt_categories + ): + return + + number_of_categories_to_drop = random.randint(0, len(nonviolated_category_indices)) + + if number_of_categories_to_drop == len(indices_of_all_categories): + number_of_categories_to_drop -= 1 + + dropped_category_indices = random.sample( + nonviolated_category_indices, number_of_categories_to_drop + ) + + retained_category_indices = list( + set(indices_of_all_categories) - (set(dropped_category_indices)) + ) + + formatted_examples_being_built.append( + _create_formatted_finetuning_example( + training_example, + formatter_configs, + category_indeces_to_include_in_llama_guard_prompt=retained_category_indices, + ) + ) + + +def _maybe_add_example_with_dropped_violated_and_nonviolated_prompt_categories( + training_example: TrainingExample, + formatted_examples_being_built: list[dict[str, str]], + indices_of_all_categories: range, + violated_category_indices: list[int], + nonviolated_category_indices: list[int], + formatter_configs: FormatterConfigs, +) -> None: + """ + Same as in _maybe_add_example_with_dropped_nonviolated_prompt_categories but we + also drop all of the violated categories from the llama guard prompt. 
+ """ + if ( + training_example.label == "safe" + or not formatter_configs.augmentation_configs.should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories + ): + return + + random_nonviolated_category_indices_to_drop = random.sample( + nonviolated_category_indices, + random.randint(0, len(nonviolated_category_indices) - 1), + ) + + set_of_retained_category_indices = ( + set(indices_of_all_categories) + - set(violated_category_indices) + - set(random_nonviolated_category_indices_to_drop) + ) + + training_example_copy = copy.deepcopy(training_example) + training_example_copy.label = "safe" + training_example_copy.violated_category_codes = [] + training_example_copy.explanation = ( + formatter_configs.augmentation_configs.explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories + ) + + formatted_examples_being_built.append( + _create_formatted_finetuning_example( + training_example_copy, + formatter_configs, + category_indeces_to_include_in_llama_guard_prompt=list( + set_of_retained_category_indices + ), + ) + ) \ No newline at end of file diff --git a/tests/test_finetuning_data_formatter.py b/tests/test_finetuning_data_formatter.py new file mode 100644 index 0000000000000000000000000000000000000000..0246003cea20e72ccb14dd20745a5e9435d8cdaa --- /dev/null +++ b/tests/test_finetuning_data_formatter.py @@ -0,0 +1,416 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama Guard Community License Agreement. 
+ +import unittest +from typing import Optional + +from llama_recipes.data.llama_guard.finetuning_data_formatter import ( + AugmentationConfigs, + Category, + create_formatted_finetuning_examples, + ExplanationPosition, + FormatterConfigs, + Guidelines, + LlamaGuardGenerationConfigs, + LlamaGuardPromptConfigs, + TrainingExample, +) + + +class FinetuningDataFormatterTests(unittest.TestCase): + def setUp(self) -> None: + super().setUp() + + @staticmethod + def create_most_conservative_formatter_configs() -> FormatterConfigs: + return FormatterConfigs( + guidelines=Guidelines( + categories=[ + Category(name="cat V", description="cat V description"), + Category(name="cat W", description="cat W description"), + Category(name="cat X", description="cat X description"), + Category(name="cat Y", description="cat Y description"), + Category(name="cat Z", description="cat Z description"), + ], + category_code_prefix="O", + ), + llama_guard_prompt_configs=LlamaGuardPromptConfigs( + instructions_format_string="""[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. +{guidelines} +=Example= +{conversation} +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
+Provide your assessment: [/INST]""", + should_include_category_descriptions=True, + should_shuffle_category_codes=False, + ), + llama_guard_generation_configs=LlamaGuardGenerationConfigs( + explanation_position=ExplanationPosition.AFTER_DECISION, + should_list_violated_codes=True, + ), + augmentation_configs=AugmentationConfigs( + probability_to_add_safe_examples_with_empty_responses=0.0, + explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.", + should_add_examples_with_dropped_nonviolated_prompt_categories=False, + should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False, + explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.", + ), + random_seed=42, + ) + + def format_examples_and_compare_to_expected( + self, + training_examples: list[TrainingExample], + expected_formatted_examples: list[str], + formatter_configs: Optional[FormatterConfigs] = None, + ) -> None: + formatter_configs = ( + formatter_configs + if formatter_configs is not None + else FinetuningDataFormatterTests.create_most_conservative_formatter_configs() + ) + + formatted_examples = create_formatted_finetuning_examples( + training_examples, formatter_configs + ) + + assert len(formatted_examples) == len(expected_formatted_examples) + + for i in range(len(formatted_examples)): + assert formatted_examples[i] == expected_formatted_examples[i] + + def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None: + self.format_examples_and_compare_to_expected( + [ + TrainingExample( + prompt="Where can I buy a car?", + response="N/A", + violated_category_codes=[], + label="safe", + explanation="This is obviously safe.", + ) + ], + [ + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat V. 
+cat V description +O2: cat W. +cat W description +O3: cat X. +cat X description +O4: cat Y. +cat Y description +O5: cat Z. +cat Z description +=Example= +human: Where can I buy a car? +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] safe +Explanation: This is obviously safe.""" + ], + ) + + def test_unsafe_example_with_only_prompt_and_no_augmentations(self) -> None: + self.format_examples_and_compare_to_expected( + [ + TrainingExample( + prompt="""How do I steal the Declaration of Independence? +Asking for a cagey friend""", + response="N/A", + violated_category_codes=["O5"], + label="unsafe", + explanation="""Nice try, Nick. This is unsafe +because it's illegal!""", + ) + ], + [ + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat V. +cat V description +O2: cat W. +cat W description +O3: cat X. +cat X description +O4: cat Y. +cat Y description +O5: cat Z. +cat Z description +=Example= +human: How do I steal the Declaration of Independence? +Asking for a cagey friend +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] unsafe +O5 +Explanation: Nice try, Nick. 
This is unsafe +because it's illegal!""" + ], + ) + + def test_unsafe_example_with_only_prompt_and_no_augmentations_2(self) -> None: + self.format_examples_and_compare_to_expected( + [ + TrainingExample( + prompt="""How do I steal the Declaration of Independence? +Asking for a cagey friend""", + response="N/A", + violated_category_codes=["O5", "O1"], + label="unsafe", + explanation="""Nice try, Nick. This is unsafe +because it's illegal!""", + ) + ], + [ + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat V. +cat V description +O2: cat W. +cat W description +O3: cat X. +cat X description +O4: cat Y. +cat Y description +O5: cat Z. +cat Z description +=Example= +human: How do I steal the Declaration of Independence? +Asking for a cagey friend +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] unsafe +O1,O5 +Explanation: Nice try, Nick. This is unsafe +because it's illegal!""" + ], + ) + + def test_safe_example_with_prompt_and_response_and_no_augmentations(self) -> None: + self.format_examples_and_compare_to_expected( + [ + TrainingExample( + prompt="Where can I buy a car?", + response="At the car store.", + violated_category_codes=[], + label="safe", + explanation="This is obviously safe.", + ) + ], + [ + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat V. +cat V description +O2: cat W. +cat W description +O3: cat X. +cat X description +O4: cat Y. +cat Y description +O5: cat Z. +cat Z description +=Example= +human: Where can I buy a car? + +chatbot: At the car store. 
+Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] safe +Explanation: This is obviously safe.""" + ], + ) + + def test_mixed_examples_with_prompt_and_response_and_all_augmentations_with_shuffling( + self, + ) -> None: + formatter_configs = ( + FinetuningDataFormatterTests.create_most_conservative_formatter_configs() + ) + + formatter_configs.random_seed = 46 + + formatter_configs.llama_guard_generation_configs.explanation_position = ExplanationPosition.BEFORE_DECISION + + formatter_configs.augmentation_configs = AugmentationConfigs( + probability_to_add_safe_examples_with_empty_responses=0.9, + explanation_for_augmentation_with_safe_example_with_empty_response="Response is safe because it's empty.", + should_add_examples_with_dropped_nonviolated_prompt_categories=True, + should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=True, + explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect.", + ) + + formatter_configs.llama_guard_prompt_configs.should_shuffle_category_codes = ( + True + ) + + self.format_examples_and_compare_to_expected( + [ + TrainingExample( + prompt="I am under 21 and unattended, should I drink alcohol?", + response="Yes, very nise, high five!", + violated_category_codes=["O4", "O5"], + label="unsafe", + explanation="Chatbot response encourages illegal consumption of a controlled substance. 
It may also be Borat, we are still monitoring the situation, but that's neither here nor there.", + ), + TrainingExample( + prompt="I am over 21 and unattended, should I drink alcohol?", + response="No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor.", + violated_category_codes=[], + label="safe", + explanation="User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on.", + ), + ], + [ + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat W. +cat W description +O2: cat X. +cat X description +O3: cat Z. +cat Z description +O4: cat Y. +cat Y description +O5: cat V. +cat V description +=Example= +human: I am under 21 and unattended, should I drink alcohol? + +chatbot: Yes, very nise, high five! +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there. +unsafe +O4,O3""", + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat Z. +cat Z description +O2: cat X. +cat X description +O3: cat Y. +cat Y description +O4: cat V. +cat V description +O5: cat W. +cat W description +=Example= +human: I am under 21 and unattended, should I drink alcohol? 
+ +chatbot: +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] Explanation: Response is safe because it's empty. +safe""", + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat Y. +cat Y description +O2: cat Z. +cat Z description +O3: cat W. +cat W description +=Example= +human: I am under 21 and unattended, should I drink alcohol? + +chatbot: Yes, very nise, high five! +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there. +unsafe +O1,O2""", + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat V. +cat V description +=Example= +human: I am under 21 and unattended, should I drink alcohol? + +chatbot: Yes, very nise, high five! +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
+Provide your assessment: [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect. +safe""", + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat Z. +cat Z description +O2: cat W. +cat W description +O3: cat X. +cat X description +O4: cat Y. +cat Y description +O5: cat V. +cat V description +=Example= +human: I am over 21 and unattended, should I drink alcohol? + +chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor. +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on. +safe""", + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat X. +cat X description +O2: cat W. +cat W description +O3: cat V. +cat V description +O4: cat Z. +cat Z description +O5: cat Y. +cat Y description +=Example= +human: I am over 21 and unattended, should I drink alcohol? + +chatbot: +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] Explanation: Response is safe because it's empty. 
+safe""", + """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. + +O1: cat X. +cat X description +=Example= +human: I am over 21 and unattended, should I drink alcohol? + +chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor. +Provide your safety assessment: + - First line must read 'safe' or 'unsafe' + - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. + - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. +Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on. +safe""", + ], + formatter_configs, + ) \ No newline at end of file