
Commit

Merge pull request #12 from sign-language-processing/pose_to_signwriting_static_pretraining

Merging the synthetic SignWriting data into the training process.
AmitMY authored Jul 10, 2024
2 parents 97d885c + 400e656 commit 0ba47eb
Showing 15 changed files with 480 additions and 63 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -30,6 +30,8 @@ pose_to_signwriting = [
"google-auth-oauthlib",
"google-api-python-client",
"ruamel.yaml",
"synthetic-signwriting @ git+https://github.com/sign-language-processing/synthetic-signwriting.git",
"pose-anonymization @ git+https://github.com/sign-language-processing/pose-anonymization.git",
]
pose_to_vq_to_signwriting = [
"sign-vq @ git+https://github.com/sign-language-processing/sign-vq.git" # Used for getting codes from poses
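
The two new dependencies are pulled in through the `pose_to_signwriting` extra shown above. A minimal install sketch, assuming you are in the repository root and pip can reach GitHub:

```bash
# Install the package with the pose_to_signwriting extra, which now also pulls
# synthetic-signwriting and pose-anonymization straight from their GitHub repositories.
pip install ".[pose_to_signwriting]"
```
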
20 changes: 20 additions & 0 deletions signwriting_transcription/pose_to_signwriting/README.md
@@ -104,3 +104,23 @@ Launch TensorBoard to visualize training progress:
```bash
tensorboard --logdir /content/models/poses/tensorboard
```

## Using bash scripts for fast training and updating the model

To run the pre-training stage, use the following bash script:

```bash
bash signwriting_transcription/pose_to_signwriting/pretrain.sh
```

To run the fine-tuning process, use the following bash script:

```bash
bash signwriting_transcription/pose_to_signwriting/fine_tuning.sh pretrain
```

To upload a model to the cloud, use the following bash script:

```bash
# For the pre-trained model:
bash signwriting_transcription/pose_to_signwriting/upload_model.sh pretrain
# For the fine-tuned model (no argument):
bash signwriting_transcription/pose_to_signwriting/upload_model.sh
```
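
Putting the three scripts together, a possible end-to-end flow (a sketch assembled from the commands above) is:

```bash
# Pre-train on the synthetic data, fine-tune from the pre-trained checkpoint,
# then upload the resulting fine-tuned model.
bash signwriting_transcription/pose_to_signwriting/pretrain.sh
bash signwriting_transcription/pose_to_signwriting/fine_tuning.sh pretrain
bash signwriting_transcription/pose_to_signwriting/upload_model.sh
```
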
20 changes: 14 additions & 6 deletions signwriting_transcription/pose_to_signwriting/data/config.py
@@ -3,10 +3,16 @@
from pathlib import Path


def create_config(data_path="/output/poses", experiment_dir='/model/poses', test_eval_matrices='False'):
# pylint: disable=duplicate-code
def create_config(data_path="/output/poses", experiment_dir='/model/poses', test_eval_matrices='False',
model=None):
data_path = Path(data_path)
experiment_dir = Path(experiment_dir)

if model is not None:
load_model_line = f'load_model: {model}/best.ckpt'
else:
load_model_line = '# load_model: "{pretrain_model}/best.ckpt"'
config = """
name: "poses"
joeynmt_version: 2.0.0
@@ -63,7 +69,7 @@ def create_config(data_path="/output/poses", experiment_dir='/model/poses', test
tokenize: "intl" # `tokenize` option in sacrebleu.corpus_bleu() function (options include: "none" (use for already tokenized test data), "13a" (default minimal tokenizer), "intl" which mostly does punctuation and unicode, etc)
training:
#load_model: "{experiment_dir}/1.ckpt" # if uncommented, load a pre-trained model from this checkpoint
{load_model_line}
random_seed: 42
optimizer: "adam"
normalization: "tokens"
@@ -80,7 +86,7 @@ def create_config(data_path="/output/poses", experiment_dir='/model/poses', test
batch_type: "sentence"
batch_multiplier: 1
early_stopping_metric: chrf # by default, early stopping uses "fsw_eval" metric
epochs: 100 # Decrease when playing around and checking that things work.
epochs: 1 # Decrease when playing around and checking that things work.
validation_freq: 1000 # Set to at least once per epoch.
logging_freq: 100
model_dir: "{experiment_dir}"
@@ -129,7 +135,8 @@ def create_config(data_path="/output/poses", experiment_dir='/model/poses', test
layer_norm: "pre"
""".format(data_dir=data_path.as_posix(),
experiment_dir=experiment_dir.as_posix(),
test_eval_matrices=test_eval_matrices)
test_eval_matrices=test_eval_matrices,
load_model_line=load_model_line)

(data_path / 'config.yaml').write_text(config)

@@ -201,7 +208,7 @@ def create_test_config(data_path="/output/poses", experiment_dir='/model/poses')
adam_betas: [0.9, 0.98]
scheduling: "plateau"
patience: 10
learning_rate: 0.0002
learning_rate: 0.00005
learning_rate_min: 0.00000001
weight_decay: 0.0
label_smoothing: 0.1
@@ -269,8 +276,9 @@ def main():
parser.add_argument("--data-path", "-d", required=True, type=str)
parser.add_argument("--experiment-dir", "-e", required=True, type=str)
parser.add_argument("--test-eval-matrices", required=False, default='False')
parser.add_argument("--model", required=False, default=None)
args = parser.parse_args()
create_config(args.data_path, args.experiment_dir, args.test_eval_matrices)
create_config(args.data_path, args.experiment_dir, args.test_eval_matrices, args.model)


if __name__ == '__main__':
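
With the new `--model` flag, `config.py` can emit a config that warm-starts training from an existing checkpoint. A usage sketch, with placeholder paths and assuming the script is run from the repository root:

```bash
# Writes /output/poses/config.yaml containing
#   load_model: /models/pretrain/best.ckpt
# Omitting --model leaves the load_model line commented out.
python signwriting_transcription/pose_to_signwriting/data/config.py \
    --data-path /output/poses \
    --experiment-dir /models/poses \
    --model /models/pretrain
```
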
@@ -33,6 +33,7 @@
from signwriting_transcription.pose_to_signwriting.data.datasets_pose import (
load_dataset, extract_to_matrix, frame2ms, pose_ndarray_to_matrix
)
# pylint: disable=duplicate-code

COLUMNS = ["id", "src", "n_frames", "trg"]

@@ -43,7 +44,6 @@
VOCAB_SIZE = 1182 # joint vocab
EXPANDED_DATASET = 1000 # the minimum number of samples in the dataset


def get_split_data(dataset, feature_root, pumping):
print("Fetching ZIP manifest...")
zip_manifest = get_zip_manifest(feature_root.with_suffix(".zip"))
129 changes: 129 additions & 0 deletions signwriting_transcription/pose_to_signwriting/data/prepare_pretrain.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# coding: utf-8
"""
Prepare poses
expected dir structure:
vectorized_data_set/
└── poses/
├── fbank534/
│ ├── test1.npy
│ ├── test2.npy
│ ├── test3.npy
├── fbank534.zip
├── joey_train_asr.tsv
├── joey_dev_asr.tsv
└── joey_test_asr.tsv
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd

from joeynmt.helpers import write_list_to_file
from signwriting_transcription.pose_to_signwriting.data.pose_data_utils import (
create_zip,
get_zip_manifest,
save_tsv,
build_pose_vocab
)
from signwriting_transcription.pose_to_signwriting.data.datasets_pose import extract_to_matrix
# pylint: disable=duplicate-code

COLUMNS = ["id", "src", "n_frames", "trg"]

SEED = 123
N_MEL_FILTERS = 534
N_WORKERS = 4 # cpu_count()
SP_MODEL_TYPE = "bpe" # one of ["bpe", "unigram", "char"]
VOCAB_SIZE = 1182 # joint vocab
EXPANDED_DATASET = 1000 # the minimum number of samples in the dataset

def get_split_data(dataset, feature_root):
print("Fetching ZIP manifest...")
zip_manifest = get_zip_manifest(feature_root.with_suffix(".zip"))

# Generate TSV manifest
print("Generating manifest...")
all_data = []

for instance in dataset:
utt_id = instance[0]
n_frames = np.load(feature_root / f'{utt_id}.npy').shape[0]
all_data.append({
"id": utt_id,
"src": zip_manifest[str(utt_id)],
"n_frames": n_frames,
"trg": instance[2],
"split": instance[3]
})
return all_data


def process(args):
# pylint: disable=too-many-locals
data_root, name, size = (
args.data_root, args.dataset_name, int(args.dataset_size))
cur_root = Path(data_root).absolute()
cur_root = cur_root / name

# dir for filterbank (shared across splits)
feature_root = cur_root / f"fbank{N_MEL_FILTERS}"
feature_root.mkdir(parents=True, exist_ok=True)
const_np_array = np.zeros((10, 534))
const_np_array[0][0] = -9999

dataset = []
for index in range(size):
instance = [str(index), const_np_array.copy(), "SYNTHETIC", "train"]
dataset.append(tuple(instance))
for index in range(30):
test_instance = [f"test{index}", const_np_array.copy(), "SYNTHETIC", "test"]
dev_instance = [f"dev{index}", const_np_array.copy(), "SYNTHETIC", "dev"]
dataset.append(tuple(test_instance))
dataset.append(tuple(dev_instance))
print("the length of dataset: ", len(dataset))

print("Extracting pose features ...")
for instance in dataset:
utt_id = instance[0]
extract_to_matrix(instance[1], feature_root / f'{utt_id}.npy', overwrite=False)

# Pack features into ZIP
print("ZIPing features...")
create_zip(feature_root, feature_root.with_suffix(".zip"))

all_data = get_split_data(dataset, feature_root)

all_df = pd.DataFrame.from_records(all_data)
save_tsv(all_df, cur_root / "poses_all_data.tsv")

for split in ['train', 'dev', 'test']:
split_df = all_df[all_df['split'] == split]
# save tsv
save_tsv(split_df, cur_root / f"{split}.tsv")
# save plain txt
write_list_to_file(cur_root / f"{split}.txt", split_df['trg'].to_list())
print(split, len(split_df))

# Generate joint vocab
print("Building joint vocab...")
build_pose_vocab(cur_root / f"spm_bpe{VOCAB_SIZE}.vocab")
print("Done!")


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--data-root", "-d", required=True, type=str)
parser.add_argument("--dataset-name", required=True, type=str)
parser.add_argument("--dataset-size", required=False, type=str, default=True)
args = parser.parse_args()
# alert if the size is smaller then the expected size
assert int(args.dataset_size) >= EXPANDED_DATASET
process(args)


if __name__ == "__main__":
main()
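
The synthetic pre-training corpus can then be built with this script. A usage sketch, with directory names taken from the docstring above and assuming the package and its dependencies are installed; `--dataset-size` must be at least 1000 (the `EXPANDED_DATASET` minimum):

```bash
# Builds 1000 constant synthetic training samples plus 30 dev and 30 test instances,
# packs the features into a ZIP, and writes the train/dev/test TSVs and the vocab.
python signwriting_transcription/pose_to_signwriting/data/prepare_pretrain.py \
    --data-root vectorized_data_set \
    --dataset-name poses \
    --dataset-size 1000
```
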
154 changes: 154 additions & 0 deletions signwriting_transcription/pose_to_signwriting/data/pretrain_config.py
@@ -0,0 +1,154 @@
import argparse
from pathlib import Path


# pylint: disable=duplicate-code
def create_config(data_path="/output/poses", experiment_dir='/model/poses', test_eval_matrices='False',
model=None):
data_path = Path(data_path)
experiment_dir = Path(experiment_dir)

if model is not None:
load_model_line = f'load_model: {model}/best.ckpt'
else:
load_model_line = '# load_model: "{pretrain_model}/best.ckpt"'
config = """
name: "poses"
joeynmt_version: 2.0.0
data:
task: "S2T" # "S2T" for speech-to-text, "MT" for (text) translation
train: "{data_dir}/train"
dev: "{data_dir}/dev"
test: "{data_dir}/test"
dataset_type: "speech" # SpeechDataset takes tsv as input
src:
lang: "en_ng"
num_freq: 534 # number of frequencies of audio inputs
max_length: 3000 # much longer than text sequence!
min_length: 1 # have to be specified so that 1d-conv works!
level: "frame" # Here we specify we're working on BPEs.
tokenizer_type: "pose"
augment: True
aug_param: 0.2
noise: True
noise_param: 0.1
tokenizer_cfg:
specaugment:
freq_mask_n: 1
freq_mask_f: 5
time_mask_n: 1
time_mask_t: 10
time_mask_p: 1.0
cmvn:
norm_means: True
norm_vars: True
before: True
trg:
lang: "en_ng"
max_length: 100
lowercase: False
level: "vpf" # Here we specify we're working on BPEs.
voc_file: "{data_dir}/spm_bpe1182.vocab"
tokenizer_type: "pose-vpf"
tokenizer_cfg:
model_file: "{data_dir}/spm_bpe1182.model"
pretokenize: "none"
testing:
eval_all_metrics: {test_eval_matrices}
n_best: 1
beam_size: 5
beam_alpha: 1.0
batch_size: 4
batch_type: "sentence"
max_output_length: 100 # Don't generate translations longer than this.
# eval_metrics: ["wer"] # Use "wer" for ASR task, "bleu" for ST task
sacrebleu_cfg: # sacrebleu options
tokenize: "intl" # `tokenize` option in sacrebleu.corpus_bleu() function (options include: "none" (use for already tokenized test data), "13a" (default minimal tokenizer), "intl" which mostly does punctuation and unicode, etc)
training:
{load_model_line}
random_seed: 42
optimizer: "adam"
normalization: "tokens"
adam_betas: [0.9, 0.98]
scheduling: "plateau"
patience: 10
learning_rate: 0.00015
learning_rate_min: 0.000000015
weight_decay: 0.0
label_smoothing: 0.1
loss: "crossentropy-ctc" # use CrossEntropyLoss + CTCLoss
ctc_weight: 0.3 # ctc weight in interpolation
batch_size: 4 # much bigger than text! your "tokens" are "frames" now.
batch_type: "sentence"
batch_multiplier: 1
early_stopping_metric: chrf # by default, early stopping uses "fsw_eval" metric
epochs: 300 # Decrease when playing around and checking that things work.
validation_freq: 1000 # Set to at least once per epoch.
logging_freq: 100
model_dir: "{experiment_dir}"
overwrite: True
shuffle: True
use_cuda: True
print_valid_sents: [0, 1, 2, 3]
keep_best_ckpts: 2
model:
initializer: "xavier_uniform"
bias_initializer: "zeros"
init_gain: 1.0
embed_initializer: "xavier_uniform"
embed_init_gain: 1.0
tied_embeddings: False # Disable embedding sharing between enc (audio) and dec (text)
tied_softmax: False
encoder:
type: "transformer"
num_layers: 12 # Common to use doubly bigger encoder than decoder in S2T.
num_heads: 4
embeddings:
embedding_dim: 534 # Must be same as the frequency of the filterbank features!
# typically ff_size = 4 x hidden_size
hidden_size: 256
ff_size: 1024
dropout: 0.1
layer_norm: "pre"
# new for S2T:
subsample: True # enable 1d conv module
conv_kernel_sizes: [5, 5] # convolution kernel sizes (window width)
conv_channels: 512 # convolution channels
in_channels: 534 # Must be same as the embedding_dim
decoder:
type: "transformer"
num_layers: 6
num_heads: 4
embeddings:
embedding_dim: 256
scale: True
dropout: 0.0
# typically ff_size = 4 x hidden_size
hidden_size: 256
ff_size: 1024
dropout: 0.1
layer_norm: "pre"
""".format(data_dir=data_path.as_posix(),
experiment_dir=experiment_dir.as_posix(),
test_eval_matrices=test_eval_matrices,
load_model_line=load_model_line)

(data_path / 'config.yaml').write_text(config)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", "-d", required=True, type=str)
parser.add_argument("--experiment-dir", "-e", required=True, type=str)
parser.add_argument("--test-eval-matrices", required=False, default='False')
parser.add_argument("--model", required=False, default=None)
args = parser.parse_args()
create_config(args.data_path, args.experiment_dir, args.test_eval_matrices, args.model)


if __name__ == '__main__':
main()
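
`pretrain_config.py` generates the JoeyNMT config for the pre-training run and writes it into the data directory. A generation sketch with placeholder paths:

```bash
# Writes vectorized_data_set/poses/config.yaml for pre-training
# (augmentation and noise enabled, 300 epochs, model_dir set to --experiment-dir).
python signwriting_transcription/pose_to_signwriting/data/pretrain_config.py \
    --data-path vectorized_data_set/poses \
    --experiment-dir /model/poses
```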