Merge pull request #12 from sign-language-processing/pose_to_signwriting_static_pretraining

Merging the synthetic SignWriting data into the learning process
Showing 15 changed files with 480 additions and 63 deletions.
signwriting_transcription/pose_to_signwriting/data/prepare_pretrain.py
129 changes: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# coding: utf-8
"""
Prepare poses
expected dir structure:
vectorized_data_set/
└── poses/
    ├── fbank534/
    │   ├── test1.npy
    │   ├── test2.npy
    │   ├── test3.npy
    ├── fbank534.zip
    ├── joey_train_asr.tsv
    ├── joey_dev_asr.tsv
    └── joey_test_asr.tsv
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd

from joeynmt.helpers import write_list_to_file
from signwriting_transcription.pose_to_signwriting.data.pose_data_utils import (
    create_zip,
    get_zip_manifest,
    save_tsv,
    build_pose_vocab
)
from signwriting_transcription.pose_to_signwriting.data.datasets_pose import extract_to_matrix
# pylint: disable=duplicate-code

COLUMNS = ["id", "src", "n_frames", "trg"]

SEED = 123
N_MEL_FILTERS = 534
N_WORKERS = 4  # cpu_count()
SP_MODEL_TYPE = "bpe"  # one of ["bpe", "unigram", "char"]
VOCAB_SIZE = 1182  # joint vocab
EXPANDED_DATASET = 1000  # the minimum number of samples in the dataset


def get_split_data(dataset, feature_root):
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(feature_root.with_suffix(".zip"))

    # Generate TSV manifest
    print("Generating manifest...")
    all_data = []

    for instance in dataset:
        utt_id = instance[0]
        n_frames = np.load(feature_root / f'{utt_id}.npy').shape[0]
        all_data.append({
            "id": utt_id,
            "src": zip_manifest[str(utt_id)],
            "n_frames": n_frames,
            "trg": instance[2],
            "split": instance[3]
        })
    return all_data


def process(args):
    # pylint: disable=too-many-locals
    data_root, name, size = (
        args.data_root, args.dataset_name, int(args.dataset_size))
    cur_root = Path(data_root).absolute()
    cur_root = cur_root / name

    # dir for filterbank (shared across splits)
    feature_root = cur_root / f"fbank{N_MEL_FILTERS}"
    feature_root.mkdir(parents=True, exist_ok=True)
    const_np_array = np.zeros((10, 534))
    const_np_array[0][0] = -9999

    dataset = []
    for index in range(size):
        instance = [str(index), const_np_array.copy(), "SYNTHETIC", "train"]
        dataset.append(tuple(instance))
    for index in range(30):
        test_instance = [f"test{index}", const_np_array.copy(), "SYNTHETIC", "test"]
        dev_instance = [f"dev{index}", const_np_array.copy(), "SYNTHETIC", "dev"]
        dataset.append(tuple(test_instance))
        dataset.append(tuple(dev_instance))
    print("the length of dataset: ", len(dataset))

    print("Extracting pose features ...")
    for instance in dataset:
        utt_id = instance[0]
        extract_to_matrix(instance[1], feature_root / f'{utt_id}.npy', overwrite=False)

    # Pack features into ZIP
    print("ZIPing features...")
    create_zip(feature_root, feature_root.with_suffix(".zip"))

    all_data = get_split_data(dataset, feature_root)

    all_df = pd.DataFrame.from_records(all_data)
    save_tsv(all_df, cur_root / "poses_all_data.tsv")

    for split in ['train', 'dev', 'test']:
        split_df = all_df[all_df['split'] == split]
        # save tsv
        save_tsv(split_df, cur_root / f"{split}.tsv")
        # save plain txt
        write_list_to_file(cur_root / f"{split}.txt", split_df['trg'].to_list())
        print(split, len(split_df))

    # Generate joint vocab
    print("Building joint vocab...")
    build_pose_vocab(cur_root / f"spm_bpe{VOCAB_SIZE}.vocab")
    print("Done!")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-root", "-d", required=True, type=str)
    parser.add_argument("--dataset-name", required=True, type=str)
parser.add_argument("--dataset-size", required=False, type=str, default=True) | ||
args = parser.parse_args() | ||
# alert if the size is smaller then the expected size | ||
assert int(args.dataset_size) >= EXPANDED_DATASET | ||
    process(args)


if __name__ == "__main__":
    main()
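For context, a minimal way to drive this preparation step might look like the sketch below. The data root and dataset name are illustrative only, and it assumes the repository (and joeynmt) are installed so the imports resolve:

# Roughly equivalent to:
#   python -m signwriting_transcription.pose_to_signwriting.data.prepare_pretrain \
#       --data-root vectorized_data_set --dataset-name poses --dataset-size 1000
from argparse import Namespace

from signwriting_transcription.pose_to_signwriting.data.prepare_pretrain import process

# dataset_size is converted with int() inside process(), so it is passed as a string
process(Namespace(data_root="vectorized_data_set",
                  dataset_name="poses",
                  dataset_size="1000"))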
signwriting_transcription/pose_to_signwriting/data/pretrain_config.py
154 changes: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
import argparse
from pathlib import Path


# pylint: disable=duplicate-code
def create_config(data_path="/output/poses", experiment_dir='/model/poses', test_eval_matrices='False',
                  model=None):
    data_path = Path(data_path)
    experiment_dir = Path(experiment_dir)

    if model is not None:
        load_model_line = f'load_model: {model}/best.ckpt'
    else:
        load_model_line = '# load_model: "{pretrain_model}/best.ckpt"'
    config = """
name: "poses"
joeynmt_version: 2.0.0
data:
    task: "S2T" # "S2T" for speech-to-text, "MT" for (text) translation
    train: "{data_dir}/train"
    dev: "{data_dir}/dev"
    test: "{data_dir}/test"
    dataset_type: "speech" # SpeechDataset takes tsv as input
    src:
        lang: "en_ng"
        num_freq: 534 # number of frequencies of audio inputs
        max_length: 3000 # much longer than text sequence!
        min_length: 1 # has to be specified so that 1d-conv works!
level: "frame" # Here we specify we're working on BPEs. | ||
tokenizer_type: "pose" | ||
augment: True | ||
aug_param: 0.2 | ||
noise: True | ||
noise_param: 0.1 | ||
tokenizer_cfg: | ||
specaugment: | ||
freq_mask_n: 1 | ||
freq_mask_f: 5 | ||
time_mask_n: 1 | ||
time_mask_t: 10 | ||
time_mask_p: 1.0 | ||
cmvn: | ||
norm_means: True | ||
norm_vars: True | ||
before: True | ||
trg: | ||
lang: "en_ng" | ||
max_length: 100 | ||
lowercase: False | ||
level: "vpf" # Here we specify we're working on BPEs. | ||
        voc_file: "{data_dir}/spm_bpe1182.vocab"
        tokenizer_type: "pose-vpf"
        tokenizer_cfg:
            model_file: "{data_dir}/spm_bpe1182.model"
            pretokenize: "none"
testing:
    eval_all_metrics: {test_eval_matrices}
    n_best: 1
    beam_size: 5
    beam_alpha: 1.0
    batch_size: 4
    batch_type: "sentence"
    max_output_length: 100 # Don't generate translations longer than this.
    # eval_metrics: ["wer"] # Use "wer" for ASR task, "bleu" for ST task
    sacrebleu_cfg: # sacrebleu options
        tokenize: "intl" # `tokenize` option in sacrebleu.corpus_bleu() function (options include: "none" (use for already tokenized test data), "13a" (default minimal tokenizer), "intl" which mostly does punctuation and unicode, etc)
training:
    {load_model_line}
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.98]
    scheduling: "plateau"
    patience: 10
    learning_rate: 0.00015
    learning_rate_min: 0.000000015
    weight_decay: 0.0
    label_smoothing: 0.1
    loss: "crossentropy-ctc" # use CrossEntropyLoss + CTCLoss
    ctc_weight: 0.3 # ctc weight in interpolation
    batch_size: 4 # much bigger than text! your "tokens" are "frames" now.
    batch_type: "sentence"
    batch_multiplier: 1
    early_stopping_metric: chrf # by default, early stopping uses "fsw_eval" metric
    epochs: 300 # decrease while experimenting to quickly check that everything works
    validation_freq: 1000 # Set to at least once per epoch.
    logging_freq: 100
    model_dir: "{experiment_dir}"
    overwrite: True
    shuffle: True
    use_cuda: True
    print_valid_sents: [0, 1, 2, 3]
    keep_best_ckpts: 2
model:
    initializer: "xavier_uniform"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier_uniform"
    embed_init_gain: 1.0
    tied_embeddings: False # Disable embedding sharing between encoder (pose features) and decoder (text)
    tied_softmax: False
    encoder:
        type: "transformer"
        num_layers: 12 # Common to use an encoder roughly twice as deep as the decoder in S2T.
        num_heads: 4
        embeddings:
            embedding_dim: 534 # Must match the number of filterbank features (frequencies)!
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.1
        layer_norm: "pre"
        # new for S2T:
        subsample: True # enable 1d conv module
        conv_kernel_sizes: [5, 5] # convolution kernel sizes (window width)
        conv_channels: 512 # convolution channels
        in_channels: 534 # Must be same as the embedding_dim
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 256
            scale: True
            dropout: 0.0
        # typically ff_size = 4 x hidden_size
        hidden_size: 256
        ff_size: 1024
        dropout: 0.1
        layer_norm: "pre"
""".format(data_dir=data_path.as_posix(),
           experiment_dir=experiment_dir.as_posix(),
           test_eval_matrices=test_eval_matrices,
           load_model_line=load_model_line)

    (data_path / 'config.yaml').write_text(config)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", "-d", required=True, type=str)
    parser.add_argument("--experiment-dir", "-e", required=True, type=str)
    parser.add_argument("--test-eval-matrices", required=False, default='False')
    parser.add_argument("--model", required=False, default=None)
    args = parser.parse_args()
    create_config(args.data_path, args.experiment_dir, args.test_eval_matrices, args.model)


if __name__ == '__main__':
    main()
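Similarly, a hypothetical driver for the config generation step; the paths are placeholders (the experiment directory simply becomes model_dir in the generated YAML):

# Roughly equivalent to:
#   python -m signwriting_transcription.pose_to_signwriting.data.pretrain_config \
#       --data-path vectorized_data_set/poses --experiment-dir experiments/poses_pretrain
from signwriting_transcription.pose_to_signwriting.data.pretrain_config import create_config

# Writes vectorized_data_set/poses/config.yaml; passing model="<checkpoint_dir>" would
# instead set load_model to <checkpoint_dir>/best.ckpt for warm-starting.
create_config(data_path="vectorized_data_set/poses",
              experiment_dir="experiments/poses_pretrain")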