    # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    import json
    import os

    import datasets
    
    _DESCRIPTION = """\
    LongBench is a comprehensive multilingual, multi-task benchmark designed to fully measure and evaluate the ability of pre-trained language models to understand long text. The full benchmark consists of twenty different tasks, covering key long-text application scenarios such as multi-document QA, single-document QA, summarization, few-shot learning, synthetic tasks, and code completion.
    """
    
    _HOMEPAGE = "https://github.com/THUDM/LongBench"
    
    
    # _URL = r"https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip"
    _URLS = {
        "2wikimqa": "./data/2wikimqa.jsonl", 
        "dureader": "./data/dureader.jsonl", 
        "qasper": "./data/qasper.jsonl", 
        "hotpotqa": "./data/hotpotqa.jsonl", 
        "narrativeqa": "./data/narrativeqa.jsonl", 
        "musique": "./data/musique.jsonl", 
        "multifieldqa_zh":"./data/multifieldqa_zh.jsonl",
        "multifieldqa_en":"./data/multifieldqa_en.jsonl",
    }
    
    task_list = [
        "multifieldqa_en",
        "qasper",
        "2wikimqa",
        "dureader",
        "hotpotqa",
        "narrativeqa",
        "musique",
        "multifieldqa_zh"
    ]
    
    
    class LongBenchConfig(datasets.BuilderConfig):
        def __init__(self, **kwargs):
            super().__init__(version=datasets.Version("1.0.0"), **kwargs)
    
    
    class LongBench(datasets.GeneratorBasedBuilder):
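        # One BuilderConfig per task; the config name passed to load_dataset()
        # selects which JSONL file is downloaded and parsed.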
        BUILDER_CONFIGS = [
            LongBenchConfig(
                name=task_name,
            )
            for task_name in task_list
        ]
    
        def _info(self):
            features = datasets.Features(
                {
                    "input": datasets.Value("string"), 
                    "context": datasets.Value("string"), 
                    "answers": [datasets.Value("string")], 
                    "length": datasets.Value("int32"), 
                    "dataset": datasets.Value("string"), 
                    "language": datasets.Value("string"), 
                    "all_classes": [datasets.Value("string")],
                    "retrieved": [datasets.Value("string")],
                    "_id": datasets.Value("string"), 
                }
            )
            return datasets.DatasetInfo(
                description=_DESCRIPTION,
                features=features,
                homepage=_HOMEPAGE,
            )
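
        # For reference, each JSONL line is expected to decode to a record of
        # the shape below (a hypothetical, illustrative example; the field
        # values are made up, not taken from the real data):
        #
        # {
        #     "input": "Which company acquired ExampleCorp?",
        #     "context": "<long source document>",
        #     "answers": ["MegaCorp"],
        #     "length": 9342,
        #     "dataset": "hotpotqa",
        #     "language": "en",
        #     "all_classes": [],
        #     "retrieved": ["<retrieved passage 1>", "<retrieved passage 2>"],
        #     "_id": "example-id-0001"
        # }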
    
        def _split_generators(self, dl_manager):
            task_name = self.config.name
            # Each config maps to a single JSONL file, so download() returns
            # the local path of that file directly.
            data_file = dl_manager.download(_URLS[task_name])
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={"filepath": data_file},
                )
            ]
    
        def _generate_examples(self, filepath):
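            # Stream the JSONL file line by line; the line index makes each
            # example key unique within a config.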
            with open(filepath, encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    key = f"{self.config.name}-{idx}"
                    item = json.loads(line)
                    yield key, {
                        "input": item["input"],
                        "context": item["context"],
                        "answers": item["answers"],
                        "length": item["length"],
                        "dataset": item["dataset"],
                        "language": item["language"],
                        "retrieved": item["retrieved"],
                        "_id": item["_id"],
                        "all_classes": item["all_classes"],
                    }
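

    # Minimal usage sketch (not part of the original script): it assumes this
    # file sits next to a ./data directory containing the per-task JSONL files,
    # and that the installed `datasets` version still supports script-based
    # loading (recent releases may also require trust_remote_code=True).
    if __name__ == "__main__":
        ds = datasets.load_dataset(os.path.abspath(__file__), name="hotpotqa")
        example = ds["test"][0]
        print(example["input"], example["answers"])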