From c8efe841fd543c34f822d49d3b4c7aea061079b4 Mon Sep 17 00:00:00 2001
From: Mingchen Zhuge <mczhuge@gmail.com>
Date: Mon, 26 Feb 2024 23:53:21 +0300
Subject: [PATCH] upload experiments

---
 .github/workflows/pytest.yml           |  54 +++
 experiments/README.md                  |  64 ++++
 experiments/crosswords/evaluate.py     |  63 ++++
 experiments/crosswords/train.py        |  44 +++
 experiments/evaluator/accuracy.py      |  18 +
 .../evaluator/datasets/base_dataset.py |  30 ++
 .../evaluator/datasets/mmlu_dataset.py |  94 ++++++
 experiments/evaluator/evaluator.py     | 308 ++++++++++++++++++
 experiments/run_crosswords.py          |  50 +++
 experiments/run_gaia.py                | 167 ++++++++++
 experiments/run_humaneval.py           | 122 +++++++
 experiments/run_mmlu.py                | 122 +++++++
 12 files changed, 1136 insertions(+)
 create mode 100644 .github/workflows/pytest.yml
 create mode 100644 experiments/README.md
 create mode 100644 experiments/crosswords/evaluate.py
 create mode 100644 experiments/crosswords/train.py
 create mode 100644 experiments/evaluator/accuracy.py
 create mode 100644 experiments/evaluator/datasets/base_dataset.py
 create mode 100644 experiments/evaluator/datasets/mmlu_dataset.py
 create mode 100644 experiments/evaluator/evaluator.py
 create mode 100644 experiments/run_crosswords.py
 create mode 100644 experiments/run_gaia.py
 create mode 100644 experiments/run_humaneval.py
 create mode 100644 experiments/run_mmlu.py

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
new file mode 100644
index 0000000..4e65025
--- /dev/null
+++ b/.github/workflows/pytest.yml
@@ -0,0 +1,54 @@
+# This workflow installs Python dependencies and runs the test suite with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Pytest GPTSwarm
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+permissions:
+  contents: read
+
+jobs:
+  pytest_test:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5
+
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        lfs: true
+
+    - name: Checkout LFS objects
+      run: git lfs pull
+
+    - name: Set up Python environment
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+
+    - uses: actions/cache@v2
+      with:
+        path: ~/.cache/pip
+        key: ${{ hashFiles('requirements_py310_linux.txt') }}
+
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_py310_linux.txt
+
+    - name: Test with pytest
+      env:
+        GOOGLE_API_KEY: "${{ secrets.GOOGLE_API_KEY }}"
+        GOOGLE_CSE_ID: "${{ secrets.GOOGLE_CSE_ID }}"
+      run: |
+        pytest -s -m mock_llm test/
\ No newline at end of file
diff --git a/experiments/README.md b/experiments/README.md
new file mode 100644
index 0000000..1abeba4
--- /dev/null
+++ b/experiments/README.md
@@ -0,0 +1,64 @@
+## Run the following commands to reproduce our experiments in the paper
+
+### Download dataset
+
+The datasets are stored in git LFS. If git LFS is already installed on your system, no action is required. To check whether git LFS is installed, run:
+```bash
+git lfs
+```
+If you see an error message, install git LFS by following [these instructions](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage).
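+To confirm the installation, `git lfs version` should print a version string (the output below is only an example; the exact version will differ on your machine):
+```bash
+git lfs version
+# e.g. git-lfs/3.4.0 (GitHub; linux amd64; go 1.21)
+```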
Then run: +```bash +git lfs install +git lfs pull +``` + +### Include necessary submodules + +If the project has already been cloned: +```bash +git submodule init +git submodule update +``` +or a one-liner: +```bash +git submodule update --init --recursive +``` + +### **MMLU** +Run the baseline: +```bash +PYTHONPATH=. python experiments/run_mmlu.py --mode=DirectAnswer +``` + +Run fully-connected swarm ablation: +```bash +PYTHONPATH=. python experiments/run_mmlu.py --num-truthful-agents=3 --mode=FullConnectedSwarm +``` + +Run randomly-connected swarm ablation: +```bash +PYTHONPATH=. python experiments/run_mmlu.py --num-truthful-agents=3 --mode=RandomSwarm +``` + +Run the main experiment with optimization and eventual evaluation: +```bash +PYTHONPATH=. python experiments/run_mmlu.py --num-truthful-agents=3 --mode=OptimizedSwarm +``` + +### **Mini Crosswords** +Run the REINFORCE algorithm for edge optimization with three agents as described in the paper. +```bash +PYTHONPATH=. python experiments/run_crosswords.py +``` + +### **HumanEval** +Run node optimization that improves the demonstration examples of each node. +```bash +PYTHONPATH=. python experiments/run_humaneval.py --learn_demonstration True +``` + +### **GAIA** +Run the general assistant tasks. +```bash +PYTHONPATH=. python experiments/run_gaia.py +``` \ No newline at end of file diff --git a/experiments/crosswords/evaluate.py b/experiments/crosswords/evaluate.py new file mode 100644 index 0000000..49ae8d4 --- /dev/null +++ b/experiments/crosswords/evaluate.py @@ -0,0 +1,63 @@ +import json +from tqdm import tqdm +import asyncio +import numpy as np +from copy import deepcopy +import pickle +import torch +import sys +import random + +from swarm.environment.domain.crosswords.env import MiniCrosswordsEnv +from swarm.environment.agents.agent_registry import AgentRegistry +from swarm.graph.swarm import Swarm +from swarm.optimizer.edge_optimizer.optimization import optimize +from swarm.environment.domain.crosswords.evaluator import CrosswordsEvaluator + + +def batched_evaluator(evaluator, batch_size, graph, loop): + tasks = [] + for _ in range(batch_size): + tasks.append(evaluator.evaluate(deepcopy(graph))) + return loop.run_until_complete(asyncio.gather(*tasks)) + +if __name__ == "__main__": + file_path = "datasets/crosswords/mini0505_0_100_5.json" + with open(file_path, "r") as file: + test_data = json.load(file) + + experiment_id = "experiment1" + init_connection_probability = .1 + epochs = 1 + batch_size = 4 + use_learned_order = True + num_batches = int(len(test_data) / batch_size) + evaluator = CrosswordsEvaluator(test_data, batch_size=batch_size, metric="words", window_size=num_batches) + swarm = Swarm(["CrosswordsReflection", "CrosswordsToT"], "crosswords", "gpt-4-1106-preview", #"gpt-3.5-turbo-1106", + final_node_class="TakeBest", final_node_kwargs={}, edge_optimize=True, + init_connection_probability=init_connection_probability, connect_output_nodes_to_final_node=True) + swarm.connection_dist.load_state_dict(torch.load(f"result/crosswords_Jan15/{experiment_id}_edge_logits_{int(epochs * len(test_data) / batch_size) - 1}.pkl")) + + num_edges = [] + for _ in range(100): + graph = swarm.connection_dist.realize(swarm.composite_graph, use_learned_order=use_learned_order)[0] + num_edges.append(graph.num_edges) + num_edges = int(np.array(num_edges).mean()) + print(f"Expected number of edges: {num_edges}") + + graphs = [ + swarm.connection_dist.random_sample_num_edges(swarm.composite_graph, num_edges), + 
swarm.connection_dist.realize(swarm.composite_graph, threshold=init_connection_probability, use_learned_order=use_learned_order)[0], + swarm.connection_dist.realize(swarm.composite_graph, use_learned_order=use_learned_order)[0], + swarm.composite_graph, + ] + loop = asyncio.get_event_loop() + for i, graph in tqdm(enumerate(graphs)): + print(f"{graph.num_edges} edges") + utilities = [] + evaluator.reset() + for k in range(num_batches): + utilities += batched_evaluator(evaluator, batch_size, graph, loop) + print(f"avg. utility = {np.mean(utilities):.3f}") + with open(f"result/crosswords/{experiment_id}_final_utilities_{i}.pkl", "wb") as file: + pickle.dump(utilities, file) \ No newline at end of file diff --git a/experiments/crosswords/train.py b/experiments/crosswords/train.py new file mode 100644 index 0000000..c735154 --- /dev/null +++ b/experiments/crosswords/train.py @@ -0,0 +1,44 @@ +import json +from tqdm import tqdm +import asyncio +import numpy as np +from copy import deepcopy +import pickle +import torch +import sys +import random + +from swarm.environment.domain.crosswords.env import MiniCrosswordsEnv +from swarm.environment.agents.agent_registry import AgentRegistry +from swarm.graph.swarm import Swarm +from swarm.optimizer.edge_optimizer.optimization import optimize +from swarm.environment.domain.crosswords.evaluator import CrosswordsEvaluator + + +if __name__ == "__main__": + if len(sys.argv) == 2: + id = int(sys.argv[1]) + experiment_id = f"experiment{id}" + torch.manual_seed(id) + np.random.seed(id) + random.seed(id) + else: + experiment_id = "experiment" + + file_path = "datasets/crosswords/mini0505_0_100_5.json" + with open(file_path, "r") as file: + test_data = json.load(file) + + init_connection_probability = .1 + epochs = 2 + batch_size = 4 + use_learned_order = True + include_inner_agent_connections = True + window_size = int(len(test_data) / batch_size) + evaluator = CrosswordsEvaluator(test_data, batch_size=batch_size, metric="words", window_size=window_size) + swarm = Swarm(["CrosswordsReflection", "CrosswordsToT"], "crosswords", "gpt-4-1106-preview", #"gpt-3.5-turbo-1106", + final_node_class="TakeBest", final_node_kwargs={}, edge_optimize=True, + init_connection_probability=init_connection_probability, connect_output_nodes_to_final_node=True, + include_inner_agent_connections=include_inner_agent_connections) + optimize(swarm, evaluator, batch_size=batch_size, num_iter=int(epochs * len(test_data) / batch_size), display_freq=1, record=True, + experiment_id=experiment_id, lr=.25, use_learned_order=use_learned_order) \ No newline at end of file diff --git a/experiments/evaluator/accuracy.py b/experiments/evaluator/accuracy.py new file mode 100644 index 0000000..c9b5468 --- /dev/null +++ b/experiments/evaluator/accuracy.py @@ -0,0 +1,18 @@ + +class Accuracy: + def __init__(self): + self._num_correct = 0 + self._num_total = 0 + + def update(self, predicted: str, target: str) -> None: + is_correct = predicted == target + self._num_correct += int(is_correct) + self._num_total += 1 + + def get(self) -> float: + return self._num_correct / self._num_total + + def print(self): + accuracy = self.get() + print(f"Accuracy: {accuracy*100:.1f}% " + f"({self._num_correct}/{self._num_total})") diff --git a/experiments/evaluator/datasets/base_dataset.py b/experiments/evaluator/datasets/base_dataset.py new file mode 100644 index 0000000..fddd2d1 --- /dev/null +++ b/experiments/evaluator/datasets/base_dataset.py @@ -0,0 +1,30 @@ +import pandas as pd +from typing import Dict, Any, Union, 
List +from collections.abc import Sequence +from abc import ABC, abstractmethod + + +SwarmInput = Dict[str, Any] + + +class BaseDataset(ABC, Sequence[Any]): + @staticmethod + @abstractmethod + def get_domain() -> str: + """ To be overriden. """ + + @abstractmethod + def split(self) -> str: + """ To be overriden. """ + + @abstractmethod + def record_to_swarm_input(self, record: pd.DataFrame) -> SwarmInput: + """ To be overriden. """ + + @abstractmethod + def postprocess_answer(self, answer: Union[str, List[str]]) -> str: + """ To be overriden. """ + + @abstractmethod + def record_to_target_answer(self, record: pd.DataFrame) -> str: + """ To be overriden. """ diff --git a/experiments/evaluator/datasets/mmlu_dataset.py b/experiments/evaluator/datasets/mmlu_dataset.py new file mode 100644 index 0000000..966a1db --- /dev/null +++ b/experiments/evaluator/datasets/mmlu_dataset.py @@ -0,0 +1,94 @@ +import glob +import pandas as pd +from typing import Union, List, Literal +import numpy as np + +from experiments.evaluator.datasets.base_dataset import BaseDataset, SwarmInput + + +class MMLUDataset(BaseDataset): + def __init__(self, + split: Union[Literal['dev'], Literal['val'], Literal['test']], + ) -> None: + + self._split = split + + data_path = f"datasets/MMLU/data/{self._split}/" + self._total_df: pd.DataFrame = self._load_data(data_path) + + @staticmethod + def get_domain() -> str: + return 'mmlu' + + @staticmethod + def _load_data( + data_path: str, + ) -> pd.DataFrame: + + rng = np.random.default_rng(888) + + csv_paths = glob.glob(data_path + "*.csv") + csv_paths = sorted(csv_paths) + print("Number of topics: ", len(csv_paths)) + + names = ['question', 'A', 'B', 'C', 'D', 'correct_answer'] + + total_df = pd.DataFrame(columns=names) + for path in csv_paths: + single_df = pd.read_csv(path, header=None, + names=names) + total_df = pd.concat([total_df, single_df]) + + total_df = total_df.reset_index(drop=True) + + # Pseudorandom shuffle + total_df = total_df.reindex(rng.permutation(total_df.index)) + + print("Total number of questions: ", len(total_df)) + + return total_df + + @property + def split(self) -> str: + return self._split + + def __len__(self) -> int: + return len(self._total_df) + + def __getitem__(self, index: int) -> pd.DataFrame: + record = self._total_df.iloc[index] + assert isinstance(record, pd.DataFrame) or isinstance(record, pd.Series) + return record + + @staticmethod + def record_to_swarm_input(record: pd.DataFrame) -> SwarmInput: + demo_question = ( + f"{record['question']}\n" + f"Option A: {record['A']}\n" + f"Option B: {record['B']}\n" + f"Option C: {record['C']}\n" + f"Option D: {record['D']}\n" + ) + input_dict = {"task": demo_question} + return input_dict + + def postprocess_answer(self, answer: Union[str, List[str]]) -> str: + if isinstance(answer, list): + if len(answer) > 0: + answer = answer[0] + else: + answer = "" + if not isinstance(answer, str): + raise Exception("Expected string") + if len(answer) > 0: + answer = answer[0] # Try to format the answer by taking the first letter + return answer + + @staticmethod + def record_to_target_answer(record: pd.DataFrame) -> str: + correct_answer = record['correct_answer'] + assert isinstance(correct_answer, str), ( + f"String expected but got {correct_answer} " + f"of type {type(correct_answer)} (2)" \ + f" record={record}") + return correct_answer diff --git a/experiments/evaluator/evaluator.py b/experiments/evaluator/evaluator.py new file mode 100644 index 0000000..53d2733 --- /dev/null +++ 
b/experiments/evaluator/evaluator.py @@ -0,0 +1,308 @@ +import os +import asyncio +import pandas as pd +from typing import Iterable, Optional, Iterator, Union, Literal, List, Dict, Any +from tqdm import tqdm +import torch +import time +import datetime +from torch.utils.tensorboard.writer import SummaryWriter +import numpy as np +import json +import math + +from swarm.graph import Graph +from swarm.environment.agents import IO +from swarm.graph.swarm import Swarm +from experiments.evaluator.datasets.base_dataset import BaseDataset +from experiments.evaluator.accuracy import Accuracy + + +class Evaluator(): + def __init__( + self, + swarm: Optional[Swarm], + train_dataset: BaseDataset, + val_dataset: BaseDataset, + model_name: Optional[str] = None, + enable_tensorboard: bool = False, + enable_artifacts: bool = False, + tensorboard_tag: Optional[str] = None, + ) -> None: + + self._swarm: Optional[Swarm] = swarm + self._train_dataset: BaseDataset = train_dataset + self._val_dataset: BaseDataset = val_dataset + self._model_name: Optional[str] = model_name + + datetime_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + art_dir_name = (f"{datetime_str}" + + (f"_{tensorboard_tag}" if tensorboard_tag is not None else "")) + + if enable_artifacts or enable_tensorboard: + self._art_dir_name = os.path.join("runs", art_dir_name) + os.makedirs(self._art_dir_name, exist_ok=True) + else: + self._art_dir_name = None + + if enable_tensorboard: + self._logger = SummaryWriter(log_dir=self._art_dir_name) + else: + self._logger = None + + async def evaluate_direct_answer(self, + limit_questions: Optional[int] = None, + ) -> float: + + dataset = self._val_dataset + + print(f"Evaluating DirectAnswer on {dataset.get_domain()} split {dataset.split}") + + io_agent = IO(dataset.get_domain(), self._model_name) + + accuracy = Accuracy() + + for i_question, record in tqdm(enumerate(dataset)): + print(80*'-') + if limit_questions is not None: + if i_question >= limit_questions: + break + + input_dict = dataset.record_to_swarm_input(record) + print(input_dict) + + raw_answer = await io_agent.run(input_dict) + + print("Raw answer:", raw_answer) + answer = dataset.postprocess_answer(raw_answer) + print("Postprocessed answer:", answer) + correct_answer = dataset.record_to_target_answer(record) + accuracy.update(answer, correct_answer) + accuracy.print() + + print("Final accuracy:") + accuracy.print() + + self._dump_eval_results(dict( + accuracy=accuracy.get(), + limit_questions=limit_questions)) + + print("Done!") + return accuracy.get() + + async def evaluate_swarm( + self, + mode: Union[ + Literal['full_connected_swarm'], + Literal['randomly_connected_swarm'], + Literal['external_edge_probs'], + ], + edge_probs: Optional[torch.Tensor] = None, + limit_questions: Optional[int] = None, + eval_batch_size: int = 4, + ) -> float: + + assert self._swarm is not None + + dataset = self._val_dataset + + print(f"Evaluating swarm on {dataset.__class__.__name__} split {dataset.split}") + + realized_graph: Optional[Graph] + if mode == 'full_connected_swarm': + realized_graph = self._swarm.connection_dist.realize_full(self._swarm.composite_graph) + elif mode == 'external_edge_probs': + assert edge_probs is not None + edge_mask = edge_probs > 0.5 + realized_graph = self._swarm.connection_dist.realize_mask(self._swarm.composite_graph, edge_mask) + realized_graph.display() + else: + realized_graph = None + + accuracy = Accuracy() + + def eval_loader(batch_size: int) -> Iterator[List[Any]]: + records = [] + for i_record, record in 
enumerate(dataset): + if limit_questions is not None: + if i_record >= limit_questions: + break + records.append(record) + if len(records) >= batch_size: + yield records + records = [] + if len(records) > 0: + yield records + return + + data_len = min(len(dataset), limit_questions) if limit_questions is not None else len(dataset) + num_batches = int(math.ceil(data_len / eval_batch_size)) + + for i_batch, record_batch in tqdm(enumerate(eval_loader(batch_size=eval_batch_size)), total=num_batches): + print(80*'-') + + start_ts = time.time() + + future_answers = [] + for record in record_batch: + if mode == 'randomly_connected_swarm': + realized_graph, _ = self._swarm.connection_dist.realize(self._swarm.composite_graph) + assert realized_graph is not None + + input_dict = dataset.record_to_swarm_input(record) + print(input_dict) + + future_answer = self._swarm.arun(input_dict, realized_graph) + future_answers.append(future_answer) + + raw_answers = await asyncio.gather(*future_answers) + + print(f"Batch time {time.time() - start_ts:.3f}") + + for raw_answer, record in zip(raw_answers, record_batch): + print("Raw answer:", raw_answer) + answer = dataset.postprocess_answer(raw_answer) + print("Postprocessed answer:", answer) + correct_answer = dataset.record_to_target_answer(record) + print("Correct answer:", correct_answer) + accuracy.update(answer, correct_answer) + accuracy.print() + + accuracy.print() + print("Done!") + + self._dump_eval_results(dict( + accuracy=accuracy.get(), + limit_questions=limit_questions)) + + return accuracy.get() + + def _dump_eval_results(self, dct: Dict[str, Any]) -> None: + if self._art_dir_name is not None: + eval_json_name = os.path.join(self._art_dir_name, "evaluation.json") + with open(eval_json_name, "w") as f: + json.dump(dct, f) + + def _print_conns(self, edge_probs: torch.Tensor, save_to_file: bool = False): + assert self._swarm is not None + msgs = [] + for i_conn, (conn, prob) in enumerate(zip( + self._swarm.connection_dist.potential_connections, edge_probs)): + src_id, dst_id = conn + src_node = self._swarm.composite_graph.find_node(src_id) + dst_node = self._swarm.composite_graph.find_node(dst_id) + msg = (f"{i_conn}: src={src_node.node_name}({src_node.id}), " + f"dst={dst_node.node_name}({dst_node.id}), prob={prob.item():.3f}") + msgs.append(msg+"\n") + print(msg) + if save_to_file: + if self._art_dir_name is not None: + txt_name = os.path.join(self._art_dir_name, "connections.txt") + with open(txt_name, "w") as f: + f.writelines(msgs) + + async def optimize_swarm( + self, + num_iters: int, + lr: float, + batch_size: int = 4, + ) -> torch.Tensor: + + assert self._swarm is not None + + dataset = self._train_dataset + + print(f"Optimizing swarm on {dataset.__class__.__name__} split {dataset.split}") + + optimizer = torch.optim.Adam(self._swarm.connection_dist.parameters(), lr=lr) + + if self._art_dir_name is not None: + hp_json_name = os.path.join(self._art_dir_name, "hp.json") + with open(hp_json_name, "w") as f: + json.dump(dict(lr=lr, + batch_size=batch_size, + num_iters=num_iters, + model_name=self._model_name + ), f) + + def infinite_data_loader() -> Iterator[pd.DataFrame]: + perm = np.random.permutation(len(dataset)) + while True: + for idx in perm: + record = dataset[idx.item()] + yield record + + loader = infinite_data_loader() + + edge_probs = None + for i_iter in range(num_iters): + print(f"Iter {i_iter}", 80*'-') + + start_ts = time.time() + + future_answers = [] + log_probs = [] + correct_answers = [] + for i_record, record in 
zip(range(batch_size), loader): + + realized_graph, log_prob = self._swarm.connection_dist.realize( + self._swarm.composite_graph, + # temperature=3.0, # DEBUG + ) + + input_dict = dataset.record_to_swarm_input(record) + answer = self._swarm.arun(input_dict, realized_graph) + future_answers.append(answer) + log_probs.append(log_prob) + correct_answer = dataset.record_to_target_answer(record) + correct_answers.append(correct_answer) + + raw_answers = await asyncio.gather(*future_answers) + + print(f"Batch time {time.time() - start_ts:.3f}") + + loss_list: List[torch.Tensor] = [] + utilities: List[float] = [] + for raw_answer, log_prob, correct_answer in zip(raw_answers, log_probs, correct_answers): + answer = dataset.postprocess_answer(raw_answer) + assert isinstance(correct_answer, str), \ + f"String expected but got {correct_answer} of type {type(correct_answer)} (1)" + accuracy = Accuracy() + accuracy.update(answer, correct_answer) + utility = accuracy.get() + utilities.append(utility) + single_loss = - log_prob * utility + loss_list.append(single_loss) + + print("utilities:", utilities) + mean_utility = np.mean(np.array(utilities)) + total_loss = torch.mean(torch.stack(loss_list)) + + print("loss:", total_loss.item()) + optimizer.zero_grad() + total_loss.backward() + print("Grad:", self._swarm.connection_dist.edge_logits.grad) + optimizer.step() + + print("edge_logits:", self._swarm.connection_dist.edge_logits) + edge_probs = torch.sigmoid(self._swarm.connection_dist.edge_logits) + print("edge_probs:", edge_probs) + + self._print_conns(edge_probs) + + if self._logger is not None: + self._logger.add_scalar("train/loss", total_loss.item(), i_iter) + self._logger.add_scalar("train/utility", mean_utility.item(), i_iter) + if self._art_dir_name is not None: + log_jsonl_name = os.path.join(self._art_dir_name, "training.jsonl") + with open(log_jsonl_name, "a") as f: + json.dump(dict(iter=i_iter, train_loss=total_loss.item(), train_utility=mean_utility.item()), f) + f.write("\n") + print("end of iteration") + + if edge_probs is not None: + self._print_conns(edge_probs, save_to_file=True) + + print("Done!") + edge_probs = torch.sigmoid(self._swarm.connection_dist.edge_logits) + return edge_probs diff --git a/experiments/run_crosswords.py b/experiments/run_crosswords.py new file mode 100644 index 0000000..04736ac --- /dev/null +++ b/experiments/run_crosswords.py @@ -0,0 +1,50 @@ +import json +from tqdm import tqdm +import asyncio +import numpy as np +from copy import deepcopy +import pickle +import torch +import sys +import random + +from swarm.environment.domain.crosswords.env import MiniCrosswordsEnv +from swarm.environment.agents.agent_registry import AgentRegistry +from swarm.graph.swarm import Swarm +from swarm.optimizer.edge_optimizer.optimization import optimize +from swarm.environment.domain.crosswords.evaluator import CrosswordsEvaluator + + +if __name__ == "__main__": + if len(sys.argv) == 2: + id = int(sys.argv[1]) + experiment_id = f"experiment{id}" + torch.manual_seed(id) + np.random.seed(id) + random.seed(id) + else: + experiment_id = "experiment" + id = 0 + + print(experiment_id) + + file_path = "datasets/crosswords/mini0505_0_100_5.json" + with open(file_path, "r") as file: + test_data = json.load(file) + + init_connection_probability = .1 + batch_size = 20 + use_learned_order = False + include_inner_agent_connections = True + connect_output_nodes_to_final_node = True + window_size = 10 + evaluator = CrosswordsEvaluator(test_data, batch_size=batch_size, metric="words", 
window_size=window_size, init_socre=0.4, use_init_score=True) + swarm = Swarm(["CrosswordsReflection", "CrosswordsToT", "CrosswordsBruteForceOpt"], "crosswords", "gpt-3.5-turbo-1106", #"gpt-4-1106-preview" + final_node_class="ReturnAll", + final_node_kwargs={}, + edge_optimize=True, + init_connection_probability=init_connection_probability, + connect_output_nodes_to_final_node=connect_output_nodes_to_final_node, + include_inner_agent_connections=include_inner_agent_connections) + optimize(swarm, evaluator, batch_size=batch_size, num_iter=11, display_freq=1, record=True, + experiment_id=experiment_id, lr=.4, use_learned_order=use_learned_order) \ No newline at end of file diff --git a/experiments/run_gaia.py b/experiments/run_gaia.py new file mode 100644 index 0000000..a22ad41 --- /dev/null +++ b/experiments/run_gaia.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import argparse +import yaml +import json +import time +import asyncio +from pathlib import Path + +from swarm.graph.swarm import Swarm +from swarm.environment.tools.reader.readers import JSONReader, YAMLReader +from swarm.environment.agents.io import IO +from swarm.environment.agents.gaia.normal_io import NormalIO +from swarm.environment.agents.gaia.tool_io import ToolIO +from swarm.environment.agents.gaia.web_io import WebIO +from swarm.environment.agents.gaia.tool_tot import ToolTOT +from swarm.environment.operations import DirectAnswer +from swarm.memory.memory import GlobalMemory +from swarm.utils.globals import Time +from swarm.utils.const import GPTSWARM_ROOT +from swarm.utils.log import logger +from swarm.environment.domain.gaia import question_scorer +from swarm.environment.operations.final_decision import MergingStrategy + + +def dataloader(data_list): + for data in data_list: + yield data + +def load_config(config_path): + with open(config_path, 'r') as file: + return yaml.safe_load(file) + +async def main(): + parser = argparse.ArgumentParser(description="GPTSwarm Experiments on GAIA") + parser.add_argument("--config", type=str, help="Path to configuration YAML file.") + parser.add_argument("--domain", type=str, default="gaia") + parser.add_argument("--agents", nargs='+', default=["IO"]) + parser.add_argument("--dataset_json", type=str, default="datasets/gaia/level_1_val.json") #level_1_val_solveable.json + parser.add_argument("--dataset_files", type=str, default="datasets/gaia/val_files") + parser.add_argument("--result_file", type=str, default=None) + parser.add_argument("--llm", type=str, default="gpt-4-1106-preview") #gpt-4-1106-preview gpt-3.5-turbo-1106 gpt-3.5-turbo gpt-4 + args = parser.parse_args() + + result_path = GPTSWARM_ROOT / "result" + os.makedirs(result_path, exist_ok=True) + + if args.config: + config_args = YAMLReader.parse(args.config, return_str=False) + for key, value in config_args.items(): + setattr(args, key, value) + + start_index = 0 + result_file = None + + dataset = JSONReader.parse_file(args.dataset_json) + + #################################### + + # strategy = MergingStrategy.SelfConsistency #MergingStrategy.SelectBest #MergingStrategy.SelfConsistency #MergingStrategy.SelectBest #MergingStrategy.SelectBest #MergingStrategy.SelfConsistency # MergingStrategy.MajorityVote MergingStrategy.RandomChoice + + experiment_name = "ToolTOT" + + # swarm = Swarm(["ToolTOT"]*7, + # "gaia", + # model_name="mock", #args.llm, #"mock", #args.llm,#args.llm, + # final_node_class="FinalDecision", + # final_node_kwargs=dict(strategy=strategy) + # ) + # 
swarm.composite_graph.display() + + print(args.llm) + + #agent = IO(domain="gaia", model_name=args.llm) + #agent = WebIO(domain="gaia", model_name=args.llm) + #agent = ToolIO(domain="gaia", model_name=args.llm) + agent = ToolTOT(domain="gaia", model_name=args.llm) + + #io = DirectAnswer(domain="gaia", model_name=args.llm) + + agent.display() + + #################################### + + for i, item in enumerate(dataloader(dataset)): + + if i < start_index: + print(f"Skipping index {i}...") + continue + + start_time = time.time() + task = item["Question"] + files = [os.path.join(args.dataset_files, item["file_name"])] if item["file_name"] else item["file_name"] + ground_truth = item["Final answer"] + inputs = {"task": task, "files": files, "GT": ground_truth} + + # Swarm + # answer = await swarm.composite_graph.run(inputs) + # answer = answer[-1].split("FINAL ANSWER: ")[-1] + + # end_time = time.time() + # exe_time = end_time - start_time + + # print("-----") + # print(f"SWARM ANSWER: {answer}") + # print("-----") + + # Agent + answer = await agent.run(inputs=inputs) + answer = answer[-1].split("FINAL ANSWER: ")[-1] + + end_time = time.time() + exe_time = end_time - start_time + + + print("-----") + print(f"AGENT ANSWER: {answer}") + print("-----") + + """ + answer = await io._execute(inputs=inputs) + answer = answer[-1]["output"].split("FINAL ANSWER: ")[-1] + + print("-----") + print(f"OPERATION ANSWER: {answer}") + print("-----") + """ + + current_time = Time.instance().value or time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + Time.instance().value = current_time + + result_dir = Path(f"{GPTSWARM_ROOT}/result/eval") + result_file = result_file or (result_dir / f"{'_'.join(experiment_name.split())}_{args.llm}_{current_time}.json") + + result_dir.mkdir(parents=True, exist_ok=True) + + if not result_file.exists(): + with open(result_file, 'w') as file: + json.dump([], file) + + with open(result_file, 'r') as file: + data = json.load(file) + + total_solved, total_executed = (0, 0) if not data else (data[-1]["Total solved"], data[-1]["Total executed"]) + is_solved = question_scorer(answer, item['Final answer']) + + updated_item = { + "Question": item["Question"], + "GT": item['Final answer'], + "Attempt answer": answer, + "Solved": is_solved, + "Total solved": total_solved + is_solved, + "Total executed": total_executed + 1, + "Accuracy": (total_solved + is_solved) / (total_executed + 1), + "Time": exe_time, + } + data.append(updated_item) + + with open(result_file, 'w') as file: + json.dump(data, file, indent=4) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/experiments/run_humaneval.py b/experiments/run_humaneval.py new file mode 100644 index 0000000..48cd4c2 --- /dev/null +++ b/experiments/run_humaneval.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import argparse +import yaml +import json +import time +import asyncio +from pathlib import Path + +from swarm.environment.tools.reader.readers import JSONLReader, YAMLReader +from swarm.environment.agents.humaneval.code_react import CodeReact +from swarm.environment.tools.coding.python_executor import PyExecutor +from swarm.memory.memory import GlobalMemory +from swarm.utils.globals import Time +from swarm.utils.const import GPTSWARM_ROOT +from swarm.utils.log import logger +from swarm.environment.operations.optimizable_operation import OptimizableOperation +from swarm.optimizer.node_optimizer.node_optimization import optimize + + +def load_result(result_file): + if not 
result_file.exists(): + with open(result_file, 'w') as file: + json.dump([], file) + + with open(result_file, 'r') as file: + data = json.load(file) + return data + +def dataloader(data_list): + for data in data_list: + yield data + +def load_config(config_path): + with open(config_path, 'r') as file: + return yaml.safe_load(file) + +def parse_args(): + parser = argparse.ArgumentParser(description="GPTSwarm Experiments on HumanEval") + parser.add_argument("--config", type=str, help="Path to configuration YAML file.") + parser.add_argument("--dataset_json", type=str, default="datasets/humaneval/humaneval-py.jsonl") + parser.add_argument("--result_file", type=str, default=None) + parser.add_argument("--llm", type=str, default="gpt-4-1106-preview") + parser.add_argument("--learn_prompt", type=bool, default=False) + parser.add_argument("--learn_demonstration", type=bool, default=False) + + args = parser.parse_args() + result_path = GPTSWARM_ROOT / "result" + os.makedirs(result_path, exist_ok=True) + if args.config: + config_args = YAMLReader.parse(args.config, return_str=False) + for key, value in config_args.items(): + setattr(args, key, value) + return args + +async def main(): + args = parse_args() + result_file = None + + dataset = JSONLReader.parse_file(args.dataset_json) + + #################################### + + + current_time = Time.instance().value or time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + Time.instance().value = current_time + result_dir = Path(f"{GPTSWARM_ROOT}/result/eval") + result_dir.mkdir(parents=True, exist_ok=True) + result_file = result_dir / f"{'' if args.learn_prompt else 'not'}_learn_prompt_{'' if args.learn_demonstration else 'not'}_learn_demo_{args.llm}_{current_time}.json" + agent = CodeReact(domain="humaneval", + model_name=args.llm, + ) + memory = GlobalMemory.instance() + #################################### + opt_frequency = 4 + for i, item in enumerate(dataloader(dataset)): + task = item["prompt"] + tests = item["test"] + inputs = {"task": task, "tests": tests} + + # Agent + answer = await agent.run(inputs=inputs) + answer = answer[-1] + # Evaluate the answer against the test cases here + + data = load_result(result_file) + total_solved, total_executed = (0, 0) if not data else (data[-1]["Total solved"], data[-1]["Total executed"]) + + is_solved, _, _ = PyExecutor().execute(answer, [tests], timeout=100) + memory.add(task, is_solved) + + total_solved = total_solved + is_solved + total_executed = total_executed + 1 + accuracy = total_solved/ total_executed + + logger.info(f"total_solved: \n{total_solved}") + logger.info(f"total_executed: \n{total_executed}") + logger.info(f"accuracy: \n{accuracy}") + + updated_item = { + "Question": task, + "Tests": tests, + "Attempt answer": answer, + "Solved": is_solved, + "Solution": answer, + "Total solved": total_solved, + "Total executed": total_executed, + "Accuracy": accuracy + } + data.append(updated_item) + + with open(result_file, 'w') as file: + json.dump(data, file, indent=4) + + if i % opt_frequency == opt_frequency - 1 and (args.learn_prompt or args.learn_demonstration): + tasks = [optimize(node, args.learn_demonstration, args.learn_prompt) for node in agent.nodes.values() if isinstance(node, OptimizableOperation)] + await asyncio.gather(*tasks) + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/experiments/run_mmlu.py b/experiments/run_mmlu.py new file mode 100644 index 0000000..d70db1e --- /dev/null +++ b/experiments/run_mmlu.py @@ -0,0 +1,122 @@ +import asyncio +from typing import 
Union, Literal, Optional +import argparse + +from swarm.graph.swarm import Swarm +from swarm.environment.operations.final_decision import MergingStrategy +from experiments.evaluator.evaluator import Evaluator +from experiments.evaluator.datasets.mmlu_dataset import MMLUDataset + + +def parse_args(): + parser = argparse.ArgumentParser(description="Process some parameters.") + + parser.add_argument('--mode', type=str, default='OptimizedSwarm', + choices=['DirectAnswer', 'FullConnectedSwarm', 'RandomSwarm', 'OptimizedSwarm'], + help="Mode of operation. Default is 'OptimizedSwarm'.") + + parser.add_argument('--num-truthful-agents', type=int, default=1, + help="Number of truthful agents. The total will be N truthful and N adversarial.") + + parser.add_argument('--num-iterations', type=int, default=200, + help="Number of optimization iterations. Default 200.") + + parser.add_argument('--model_name', type=str, default=None, + help="Model name, None runs the default ChatGPT4.") + + parser.add_argument('--domain', type=str, default="mmlu", + help="Domain (the same as dataset name), default 'MMLU'") + + parser.add_argument('--debug', action='store_true', default=False, + help="Set for a quick debug cycle") + + args = parser.parse_args() + return args + + +async def main(): + + args = parse_args() + + debug: bool = args.debug + + model_name: Optional[str] = args.model_name + + mode: Union[Literal['DirectAnswer'], + Literal['FullConnectedSwarm'], + Literal['RandomSwarm'], + Literal['OptimizedSwarm']] + + mode = args.mode + + strategy = MergingStrategy.MajorityVote + + domain: str = args.domain + + if mode == 'DirectAnswer': + swarm_name = None + swarm = None + else: + N = args.num_truthful_agents + M = N + agent_name_list = N * ["IO"] + M * ["AdversarialAgent"] + + swarm_name = f"{N}true_{M}adv" + + swarm = Swarm( + agent_name_list, + domain, + model_name=model_name, + final_node_class="FinalDecision", + final_node_kwargs=dict(strategy=strategy), + edge_optimize=True, + ) + + tag = f"{domain}_{swarm_name}_{strategy.name}_{mode}" + + dataset_train = MMLUDataset('dev') + dataset_val = MMLUDataset('val') + + evaluator = Evaluator( + swarm, + dataset_train, + dataset_val, + model_name=model_name, + enable_tensorboard = mode=='OptimizedSwarm', + enable_artifacts=True, + tensorboard_tag=tag) + + limit_questions = 5 if debug else 153 + + if mode == 'DirectAnswer': + score = await evaluator.evaluate_direct_answer( + limit_questions=limit_questions) + elif mode == 'FullConnectedSwarm': + score = await evaluator.evaluate_swarm( + mode='full_connected_swarm', + limit_questions=limit_questions) + elif mode == 'RandomSwarm': + score = await evaluator.evaluate_swarm( + mode='randomly_connected_swarm', + limit_questions=limit_questions) + elif mode == 'OptimizedSwarm': + + num_iters = 5 if debug else args.num_iterations + + lr = 0.1 + + edge_probs = await evaluator.optimize_swarm(num_iters=num_iters, lr=lr) + + score = await evaluator.evaluate_swarm( + mode='external_edge_probs', + edge_probs=edge_probs, + limit_questions=limit_questions, + ) + else: + raise Exception(f"Unsupported mode {mode}") + + print(f"Score: {score}") + + +if __name__ == "__main__": + asyncio.run(main()) -- GitLab