Commit b8d327f7 authored by Anshika, committed by Matthias Reso

Update grammar_dataset_process.ipynb

%% Cell type:markdown id: tags:
Copyright (c) Meta Platforms, Inc. and affiliates.
This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
Use this notebook to pull in datasets and apply pre-processing. Most grammar datasets unfortunately require preprocessing before they are usable for training (for example, JFLEG provides four correction targets per input, so we re-match them into 1:1 pairings).
%% Cell type:code id: tags:
``` python
import csv
from datasets import load_metric, load_dataset
from pathlib import Path
```
%% Cell type:code id: tags:
``` python
list_replacements = [
    (" .", "."),
    (" ,", ","),
    (" '", "'"),
    (" ?", "?"),
    (" !", "!"),
    (" :", ":"),
    (" ;", ";"),
    (" n't", "n't"),
    (" v", "v"),
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    ('" ballast water', '"ballast water')
]
```
%% Cell type:code id: tags:
``` python
def correct_spacing(item):
    """Apply every replacement pair in list_replacements to a single dataset item."""
    for fix in list_replacements:
        item = item.replace(fix[0], fix[1])
    return item
```
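%% Cell type:markdown id: tags:
As a quick sanity check (a sketch, not part of the original pipeline), the cell below runs correct_spacing on a hand-made, detokenized sentence; the sample string and expected output are illustrative assumptions only.
%% Cell type:code id: tags:
``` python
# Hypothetical sample sentence showing typical detokenization artifacts.
sample = "He did n't arrive until 2 0 0 6 , did he ?"
print(correct_spacing(sample))
# Expected output: He didn't arrive until 2006, did he?
```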
%% Cell type:code id: tags:
``` python
def generate_csv(csv_path, dataset):
    """Apply spacing corrections and write matched (input, target) pairs out to a CSV file."""
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            # clean up spacing in the source sentence
            input_text = case["sentence"]
            input_text = correct_spacing(input_text)
            # write one row per correction so each input/target pair is 1:1
            for correction in case["corrections"]:
                correction = correct_spacing(correction)
                # a few of the cases contain blank strings.
                if input_text and correction:
                    writer.writerow([input_text, correction])
```
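%% Cell type:markdown id: tags:
To illustrate the 1:1 re-matching (a sketch added here, not part of the original notebook), the cell below feeds generate_csv a tiny hand-made dataset; the file name and sentences are hypothetical.
%% Cell type:code id: tags:
``` python
# Hypothetical mini-dataset with one sentence and two corrections.
demo_cases = [{"sentence": "She go to school .",
               "corrections": ["She goes to school. ", "She went to school. "]}]
demo_file = Path.cwd()/'demo_pairs.csv'  # illustrative path
generate_csv(demo_file, demo_cases)

# Read the file back: expect a header row plus two rows, one per correction.
with open(demo_file, newline='') as f:
    for row in csv.reader(f):
        print(row)
```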
%% Cell type:markdown id: tags:
In JFLEG, the validation split will be used as 'train' and the test split as 'validation'.
%% Cell type:code id: tags:
``` python
train_dataset = load_dataset("jfleg", split='validation[:]')
eval_dataset = load_dataset("jfleg", split='test[:]')
```
%% Output
Found cached dataset jfleg (/data/home/mreso/.cache/huggingface/datasets/jfleg/default/1.0.0/ed4ab2367351fe31949f48849ae6732b164f0d5ea6bb5d4357ff4293ac89511b)
Found cached dataset jfleg (/data/home/mreso/.cache/huggingface/datasets/jfleg/default/1.0.0/ed4ab2367351fe31949f48849ae6732b164f0d5ea6bb5d4357ff4293ac89511b)
%% Cell type:code id: tags:
``` python
print(train_dataset)
print(eval_dataset)
```
%% Output
Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 755
})
Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 748
})
%% Cell type:code id: tags:
``` python
print(train_dataset['sentence'][22])
print(train_dataset['corrections'][22])
```
%% Output
Students can focus on only a few subjects they are intwerested in and they will become an experts in those areas .
['Students can focus on only a few subjects they are interested in and they will become experts in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become experts in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become an expert in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become an expert in those areas . ']
%% Cell type:code id: tags:
``` python
clean22 = correct_spacing(train_dataset['sentence'][22])
clean22
```
%% Output
'Students can focus on only a few subjects they are intwerested in and they will become an experts in those areas. '
%% Cell type:code id: tags:
``` python
jfleg_dir = Path.cwd()/'jfleg_dataset'  # if the folder were named just 'jfleg', HF datasets would try to resolve it as the hub dataset and complain
jfleg_dir.mkdir(parents=True,exist_ok=True)
c4_dir = Path.cwd()/'c4_dataset'
c4_dir.mkdir(parents=True,exist_ok=True)
```
%% Cell type:markdown id: tags:
Process JFLEG data
%% Cell type:code id: tags:
``` python
j_train_file = jfleg_dir/'jtrain.csv'
j_eval_file = jfleg_dir/'jeval.csv'
```
%% Cell type:code id: tags:
``` python
generate_csv(j_train_file, train_dataset)
```
%% Cell type:code id: tags:
``` python
generate_csv(j_eval_file, eval_dataset)
```
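%% Cell type:markdown id: tags:
Optionally, as a quick check that is not part of the original notebook, you can count the rows written for each split; since JFLEG has four corrections per sentence, expect roughly four times the number of source sentences, minus any blank entries.
%% Cell type:code id: tags:
``` python
# Count data rows (excluding the header) in the generated CSVs.
with open(j_train_file, newline='') as f:
    print(sum(1 for _ in csv.reader(f)) - 1)  # roughly 4 * 755
with open(j_eval_file, newline='') as f:
    print(sum(1 for _ in csv.reader(f)) - 1)  # roughly 4 * 748
```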
%% Cell type:markdown id: tags:
Process C4_200M (!) - we'll pull 10K to start
%% Cell type:code id: tags:
``` python
c4_dataset = load_dataset("liweili/c4_200m", streaming = True)
```
%% Cell type:code id: tags:
``` python
iterator = iter(c4_dataset['train'])
```
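%% Cell type:markdown id: tags:
If you want to see why c4_generate_csv below reads the "input" and "output" fields, you can peek at one streamed record first (a sketch, not part of the original notebook). Calling next() on the shared iterator above would consume an example, so this uses a fresh iterator instead.
%% Cell type:code id: tags:
``` python
# Use a separate iterator so the one used for CSV generation is left untouched.
peek = next(iter(c4_dataset['train']))
print(peek.keys())    # expected: dict_keys(['input', 'output'])
print(peek["input"])
print(peek["output"])
```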
%% Cell type:code id: tags:
``` python
def c4_generate_csv(csv_path, iterator, num_examples):
    """Stream num_examples examples from the iterator and write cleaned (input, target) pairs to a CSV file."""
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for i in range(0, num_examples):
            data = next(iterator)
            input_text = data["input"]
            input_text = correct_spacing(input_text)
            correction = correct_spacing(data["output"])
            if input_text and correction:
                writer.writerow([input_text, correction])
```
%% Cell type:code id: tags:
``` python
c4_dir = Path.cwd()/'c4_dataset'
c4_dir.mkdir(parents=True,exist_ok=True)
```
%% Cell type:markdown id: tags:
You can modify the following to create the CSV file with the desired number of instances; here we use 10k for a quick test.
%% Cell type:code id: tags:
``` python
c4_filename = c4_dir/'c4train_10k.csv'
```
%% Cell type:code id: tags:
``` python
c4_generate_csv(c4_filename, iterator, num_examples=10000)
```
%% Cell type:markdown id: tags:
Create a single training file by combining jtrain and c4train
%% Cell type:code id: tags:
``` python
merge_list = [j_train_file, c4_filename]
```
%% Cell type:code id: tags:
``` python
import pandas as pd
```
%% Cell type:code id: tags:
``` python
combined_csv = pd.concat([pd.read_csv(fn) for fn in merge_list])
```
%% Cell type:code id: tags:
``` python
merged_name = "gtrain_10k.csv"
```
%% Cell type:code id: tags:
``` python
combined_csv.to_csv(merged_name, index=False, encoding='utf-8-sig')
```
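%% Cell type:markdown id: tags:
As a quick check on the merged file (a sketch, not part of the original notebook), the cell below prints the combined row count, which should be the JFLEG pair count plus up to 10,000 C4 rows, and previews the first few pairs.
%% Cell type:code id: tags:
``` python
print(len(combined_csv))
combined_csv.head()
```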
%% Cell type:code id: tags:
``` python
eval_name = "grammar_validation.csv"
```
%% Cell type:code id: tags:
``` python
eval_csv = pd.read_csv(j_eval_file)
eval_csv.to_csv(eval_name, index=False, encoding='utf-8-sig')
```
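%% Cell type:markdown id: tags:
Finally, as an optional sanity check (a sketch, not part of the original notebook), you can load the generated CSVs back with the Hugging Face csv loader to confirm they are ready to be used for fine-tuning; the file names match those written above.
%% Cell type:code id: tags:
``` python
grammar_data = load_dataset("csv", data_files={"train": "gtrain_10k.csv", "validation": "grammar_validation.csv"})
print(grammar_data)
```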