# load_data_sql.py (971 B)
# Last commit: "cr" by Jerry Liu
import json

from modal import Retries
from .common import (
    stub,
    VOL_MOUNT_PATH,
    output_vol,
    # blame: commit "cr" by Jerry Liu
    get_data_path
    # blame: commit "cr" by Jerry Liu
)

@stub.function(
    retries=Retries(
        max_retries=3,
        initial_delay=5.0,
        backoff_coefficient=2.0,
    ),
    timeout=60 * 60 * 2,
    network_file_systems={VOL_MOUNT_PATH.as_posix(): output_vol},
    cloud="gcp",
)
def load_data_sql():
    """Export the ``b-mc2/sql-create-context`` dataset as a JSON Lines file.

    Loads the dataset with ``datasets.load_dataset``, takes its ``train``
    split, and writes one JSON object per line to the path returned by
    ``get_data_path()``. Each object has three keys:

    * ``input``   -- the record's ``question`` field
    * ``context`` -- the record's ``context`` field
    * ``output``  -- the record's ``answer`` field

    The parent directory of the output path is created if it is missing,
    and any existing file at that path is overwritten.
    """
    # Imported inside the function so that `datasets` is resolved only when
    # the function body actually runs.
    from datasets import load_dataset

    dataset = load_dataset("b-mc2/sql-create-context")
    dataset_splits = {"train": dataset["train"]}

    out_path = get_data_path()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Open the output once for all splits: reopening it with mode "w" for
    # every split would truncate whatever an earlier split had written.
    # json.dumps escapes non-ASCII by default, so pinning UTF-8 does not
    # change the bytes written; it only removes the locale dependence.
    with open(out_path, "w", encoding="utf-8") as f:
        for ds in dataset_splits.values():
            for item in ds:
                record = {
                    "input": item["question"],
                    "context": item["context"],
                    "output": item["answer"],
                }
                f.write(json.dumps(record) + "\n")