
Commit

added multiple seeds when splitting symbolic dataset
jasonxyliu committed Jan 7, 2023
1 parent 6ac5507 commit 8b121c2
Showing 3 changed files with 72 additions and 55 deletions.
11 changes: 6 additions & 5 deletions dataset.py
@@ -185,13 +185,14 @@ def load_split_dataset(split_fpath):
     # generate_tar_file()
     # create_osm_dataset()
 
-    # Construct train, test split for 3 types of holdout
+    # Construct train, test split for 3 types of holdout for symbolic translation
     create_symbolic_dataset('data/aggregated_responses_batch1.csv', False)
     create_symbolic_dataset('data/aggregated_responses_batch1.csv', True)
 
     data_fpath = "data/symbolic_no_perm_batch1.csv"
     filter_types = ["fair_visit"]
-    seed = 42
-    construct_split_dataset(data_fpath, holdout_type="ltl_type", filter_types=filter_types, test_size=2, seed=seed)
-    construct_split_dataset(data_fpath, holdout_type="ltl_instance", filter_types=filter_types, test_size=0.2, seed=seed)
-    construct_split_dataset(data_fpath, holdout_type="utt", filter_types=filter_types, test_size=0.2, seed=seed)
+    seeds = [0, 1, 2, 42, 111]
+    for seed in seeds:
+        construct_split_dataset(data_fpath, holdout_type="ltl_type", filter_types=filter_types, test_size=2, seed=seed)
+        construct_split_dataset(data_fpath, holdout_type="ltl_instance", filter_types=filter_types, test_size=0.2, seed=seed)
+        construct_split_dataset(data_fpath, holdout_type="utt", filter_types=filter_types, test_size=0.2, seed=seed)
67 changes: 37 additions & 30 deletions s2s_pt_transformer.py
@@ -3,6 +3,7 @@
 https://pytorch.org/tutorials/beginner/translation_transformer.html
 """
 import argparse
+import os
 from pathlib import Path
 import math
 from timeit import default_timer as timer
@@ -239,35 +240,41 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--split_dataset_fpath', type=str, default='data/split_symbolic_no_perm_batch1_utt_0.2_42.pkl',
-                        help='file path to train and test data for supervised seq2seq')
+                        help='complete file path or prefix of file paths to train and test data for supervised seq2seq')
     args = parser.parse_args()
 
-    # Load train, test data
-    train_iter, train_meta, valid_iter, valid_meta = load_split_dataset(args.split_dataset_fpath)
-    vocab_transform, text_transform, SRC_VOCAB_SIZE, TAR_VOCAB_SIZE = construct_dataset_meta(train_iter)
-
-    # Train and save model
-    transformer = Seq2SeqTransformer(SRC_VOCAB_SIZE, TAR_VOCAB_SIZE,
-                                     NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMBED_SIZE, NHEAD,
-                                     DIM_FFN_HID)
-    for param in transformer.parameters():
-        if param.dim() > 1:
-            nn.init.xavier_uniform_(param)
-    transformer = transformer.to(DEVICE)
-
-    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
-    optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
-    writer = SummaryWriter() # writer will output to ./runs/ directory by default; activate: tensorboard --logdir=runs
-
-    for epoch in range(1, NUM_EPOCHS+1):
-        start_time = timer()
-        train_loss = train_epoch(transformer, optimizer, train_iter)
-        end_time = timer()
-        valid_loss = evaluate(transformer, valid_iter)
-        print(f'Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}\n'
-              f'Epoch time: {(end_time-start_time):.3f}s')
-        writer.add_scalars("Train Loss", {"train_loss": train_loss, "valid_loss": valid_loss}, epoch)
-    model_fpath = f'model/s2s_pt_transformer_{Path(args.split_dataset_fpath).stem}.pth'
-    torch.save(transformer.state_dict(), model_fpath)
-    writer.flush()
-    writer.close()
+    if "pkl" in args.split_dataset_fpath: # complete file path, e.g. data/split_symbolic_no_perm_batch1_utt_0.2_42.pkl
+        split_dataset_fpaths = [args.split_dataset_fpath]
+    else: # prefix of file paths, e.g. split_symbolic_no_perm_batch1_utt
+        split_dataset_fpaths = [os.path.join("data", fpath) for fpath in os.listdir("data") if args.split_dataset_fpath in fpath]
+
+    for split_dataset_fpath in split_dataset_fpaths:
+        # Load train, test data
+        train_iter, train_meta, valid_iter, valid_meta = load_split_dataset(split_dataset_fpath)
+        vocab_transform, text_transform, SRC_VOCAB_SIZE, TAR_VOCAB_SIZE = construct_dataset_meta(train_iter)
+
+        # Train and save model
+        transformer = Seq2SeqTransformer(SRC_VOCAB_SIZE, TAR_VOCAB_SIZE,
+                                         NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMBED_SIZE, NHEAD,
+                                         DIM_FFN_HID)
+        for param in transformer.parameters():
+            if param.dim() > 1:
+                nn.init.xavier_uniform_(param)
+        transformer = transformer.to(DEVICE)
+
+        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
+        optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
+        writer = SummaryWriter() # writer will output to ./runs/ directory by default; activate: tensorboard --logdir=runs
+
+        for epoch in range(1, NUM_EPOCHS+1):
+            start_time = timer()
+            train_loss = train_epoch(transformer, optimizer, train_iter)
+            end_time = timer()
+            valid_loss = evaluate(transformer, valid_iter)
+            print(f'Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}\n'
+                  f'Epoch time: {(end_time-start_time):.3f}s')
+            writer.add_scalars("Train Loss", {"train_loss": train_loss, "valid_loss": valid_loss}, epoch)
+        model_fpath = f'model/s2s_pt_transformer_{Path(split_dataset_fpath).stem}.pth'
+        torch.save(transformer.state_dict(), model_fpath)
+        writer.flush()
+        writer.close()
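The --split_dataset_fpath argument now accepts either a complete .pkl path or a bare prefix matched against filenames under data/; both checks are plain substring tests, so a prefix matches anywhere in the filename. A standalone sketch of that resolution step (mirroring the branch above; the directory and example prefix are only illustrative):

# Sketch only: resolve --split_dataset_fpath to one or more split files,
# mirroring the "pkl"/prefix branch above (substring matches in both cases).
import os

def resolve_split_fpaths(split_dataset_fpath, data_dir="data"):
    if "pkl" in split_dataset_fpath:  # complete file path to a single split
        return [split_dataset_fpath]
    # prefix: pick up every file in data/ containing the given string,
    # e.g. "split_symbolic_no_perm_batch1_utt" matches one split file per seed
    return [os.path.join(data_dir, fpath)
            for fpath in os.listdir(data_dir)
            if split_dataset_fpath in fpath]

With the prefix form, the training loop above then trains and saves one checkpoint per matched split, named model/s2s_pt_transformer_<split stem>.pth.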
49 changes: 29 additions & 20 deletions s2s_sup.py
@@ -2,6 +2,7 @@
 Infer trained model.
 """
 import argparse
+import os
 from pathlib import Path
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration
@@ -58,29 +59,37 @@ def parameters(self):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--split_dataset_fpath', type=str, default='data/split_symbolic_no_perm_batch1_utt_0.2_42.pkl',
-                        help='file path to train test split dataset')
+                        help='complete file path or prefix of file paths to train test split dataset')
     parser.add_argument('--model', type=str, default="t5-base", choices=["t5-base", "t5-small", "pt_transformer"],
                         help='name of supervised seq2seq model')
     args = parser.parse_args()
 
-    # Load train, test data
-    train_iter, train_meta, valid_iter, valid_meta = load_split_dataset(args.split_dataset_fpath)
+    if "pkl" in args.split_dataset_fpath: # complete file path, e.g. data/split_symbolic_no_perm_batch1_utt_0.2_42.pkl
+        split_dataset_fpaths = [args.split_dataset_fpath]
+    else: # prefix of file paths, e.g. split_symbolic_no_perm_batch1_utt
+        split_dataset_fpaths = [os.path.join("data", fpath) for fpath in os.listdir("data") if args.split_dataset_fpath in fpath]
 
-    if args.model in T5_MODELS: # pretrained T5 from Hugging Face
-        s2s = Seq2Seq(args.model)
-    elif args.model == "pt_transformer": # pretrained seq2seq transformer implemented in PyTorch
-        vocab_transform, text_transform, src_vocab_size, tar_vocab_size = pt_transformer_construct_dataset_meta(train_iter)
-        model_params = f"model/s2s_{args.model}_{Path(args.split_dataset_fpath).stem}.pth"
-        s2s = Seq2Seq(args.model,
-                      vocab_transform=vocab_transform, text_transform=text_transform,
-                      src_vocab_sz=src_vocab_size, tar_vocab_sz=tar_vocab_size, fpath_load=model_params)
-    else:
-        raise TypeError(f"ERROR: unrecognized model, {args.model}")
-    print(f"Number of trainable parameters in {args.model}: {count_params(s2s)}")
-    print(f"Number of training samples: {len(train_iter)}")
-    print(f"Number of validation samples: {len(valid_iter)}")
+    for split_dataset_fpath in split_dataset_fpaths:
+        # Load train, test data
+        train_iter, train_meta, valid_iter, valid_meta = load_split_dataset(split_dataset_fpath)
 
-    result_log_fpath = f"results/s2s_{args.model}_{Path(args.split_dataset_fpath).stem}_log.csv"
-    analysis_fpath = "data/analysis_batch1.csv"
-    acc_fpath = f"results/s2s_{args.model}_{Path(args.split_dataset_fpath).stem}_acc.csv"
-    evaluate_lang_from_file(s2s, args.split_dataset_fpath, analysis_fpath, result_log_fpath, acc_fpath)
+        # Load trained model
+        if args.model in T5_MODELS: # pretrained T5 from Hugging Face
+            s2s = Seq2Seq(args.model)
+        elif args.model == "pt_transformer": # pretrained seq2seq transformer implemented in PyTorch
+            vocab_transform, text_transform, src_vocab_size, tar_vocab_size = pt_transformer_construct_dataset_meta(train_iter)
+            model_params = f"model/s2s_{args.model}_{Path(split_dataset_fpath).stem}.pth"
+            s2s = Seq2Seq(args.model,
+                          vocab_transform=vocab_transform, text_transform=text_transform,
+                          src_vocab_sz=src_vocab_size, tar_vocab_sz=tar_vocab_size, fpath_load=model_params)
+        else:
+            raise TypeError(f"ERROR: unrecognized model, {args.model}")
+        print(f"Number of trainable parameters in {args.model}: {count_params(s2s)}")
+        print(f"Number of training samples: {len(train_iter)}")
+        print(f"Number of validation samples: {len(valid_iter)}")
+
+        # Evaluation
+        result_log_fpath = f"results/s2s_{args.model}_{Path(split_dataset_fpath).stem}_log.csv"
+        analysis_fpath = "data/analysis_batch1.csv"
+        acc_fpath = f"results/s2s_{args.model}_{Path(split_dataset_fpath).stem}_acc.csv"
+        evaluate_lang_from_file(s2s, split_dataset_fpath, analysis_fpath, result_log_fpath, acc_fpath)
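Evaluation now writes one _log.csv and one _acc.csv per split file, i.e. per seed. A hypothetical follow-up step, not part of this commit, that averages accuracy across seeds; the "accuracy" column name is an assumption, since the real schema is defined by evaluate_lang_from_file:

# Hypothetical aggregation across seeds (not in this commit).
# Assumes each *_acc.csv has a numeric "accuracy" column; adjust to the
# actual schema written by evaluate_lang_from_file.
import csv
from glob import glob
from statistics import mean, stdev

per_seed_accs = []
for acc_fpath in sorted(glob("results/s2s_t5-base_split_symbolic_no_perm_batch1_utt_0.2_*_acc.csv")):
    with open(acc_fpath) as f:
        rows = list(csv.DictReader(f))
    per_seed_accs.append(mean(float(row["accuracy"]) for row in rows))

print(f"utt holdout over {len(per_seed_accs)} seeds: "
      f"mean acc {mean(per_seed_accs):.3f}, std {stdev(per_seed_accs):.3f}")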
