basic adaptation model

sign-language-processing · Dec 28, 2023 · 2b1aa92 · 2b1aa92
1 parent 483fbf9
commit 2b1aa92
Show file tree

Hide file tree

Showing 33 changed files with 9,960 additions and 0 deletions.
diff --git a/config.py b/config.py
@@ -0,0 +1,125 @@
+"""Generate the JoeyNMT YAML configuration for the "posses" experiment.
+
+Running (or importing) this module renders the template below with the data
+and experiment directories and writes it to ``<data_path>/config.yaml``.
+"""
+from pathlib import Path
+
+# Directory referenced by the config for train/dev/test data and the
+# SentencePiece files; the rendered config.yaml is written here as well.
+data_path = Path('output/posses')
+# Directory handed to JoeyNMT as ``model_dir``.
+experiment_dir = Path('models/posses')
+
+# YAML template; ``{data_dir}`` and ``{experiment_dir}`` are substituted by
+# ``str.format`` at the end of the literal.
+# NOTE(review): the inline comment on ``src.level`` talks about BPEs, but the
+# value there is "frame"; only ``trg.level`` is "bpe".
+config = """
+name: "posses"
+joeynmt_version: 2.0.0
+
+data:
+    task: "S2T"                     # "S2T" for speech-to-text, "MT" for (text) translation
+    train: "{data_dir}/train"
+    dev:   "{data_dir}/dev"
+    test:  "{data_dir}/test"
+    dataset_type: "speech"          # SpeechDataset takes tsv as input
+    src:
+        lang: "en_ng"
+        num_freq: 534                # number of frequencies of audio inputs
+        max_length: 3000            # much longer than text sequence!
+        min_length: 10              # have to be specified so that 1d-conv works!
+        level: "frame"              # Here we specify we're working on BPEs.
+        tokenizer_type: "speech"
+        tokenizer_cfg: 
+            specaugment:
+                freq_mask_n: 1
+                freq_mask_f: 5
+                time_mask_n: 1
+                time_mask_t: 10
+                time_mask_p: 1.0
+            cmvn:
+                norm_means: True
+                norm_vars: True
+                before: True
+    trg:
+        lang: "en_ng"
+        max_length: 100
+        lowercase: False
+        level: "bpe"                # Here we specify we're working on BPEs.
+        voc_file: "{data_dir}/spm_bpe40.vocab"
+        tokenizer_type: "sentencepiece"
+        tokenizer_cfg: 
+            model_file: "{data_dir}/spm_bpe40.model"
+            pretokenize: "none"
+
+testing:
+    n_best: 1
+    beam_size: 5
+    beam_alpha: 1.0
+    batch_size: 4
+    batch_type: "sentence"
+    max_output_length: 100          # Don't generate translations longer than this.
+    eval_metrics: ["wer"]           # Use "wer" for ASR task, "bleu" for ST task
+    sacrebleu_cfg:                  # sacrebleu options
+        tokenize: "intl"            # `tokenize` option in sacrebleu.corpus_bleu() function (options include: "none" (use for already tokenized test data), "13a" (default minimal tokenizer), "intl" which mostly does punctuation and unicode, etc) 
+
+training:
+    #load_model: "{experiment_dir}/1.ckpt" # if uncommented, load a pre-trained model from this checkpoint
+    random_seed: 42
+    optimizer: "adam"
+    normalization: "tokens"
+    adam_betas: [0.9, 0.98] 
+    scheduling: "plateau"
+    patience: 5
+    learning_rate: 0.0002
+    learning_rate_min: 0.00000001
+    weight_decay: 0.0
+    label_smoothing: 0.1
+    loss: "crossentropy-ctc"       # use CrossEntropyLoss + CTCLoss
+    ctc_weight: 0.3                # ctc weight in interpolation
+    batch_size: 4                  # much bigger than text! your "tokens" are "frames" now.
+    batch_type: "sentence"
+    batch_multiplier: 1
+    early_stopping_metric: "wer"
+    epochs: 10                     # Decrease for when playing around and checking of working.
+    validation_freq: 1000          # Set to at least once per epoch.
+    logging_freq: 100
+    model_dir: "{experiment_dir}"
+    overwrite: True
+    shuffle: True
+    use_cuda: True
+    print_valid_sents: [0, 1, 2, 3]
+    keep_best_ckpts: 2
+
+model:
+    initializer: "xavier_uniform"
+    bias_initializer: "zeros"
+    init_gain: 1.0
+    embed_initializer: "xavier_uniform"
+    embed_init_gain: 1.0
+    tied_embeddings: False       # DIsable embeddings sharing between enc(audio) and dec(text)
+    tied_softmax: False
+    encoder:
+        type: "transformer"
+        num_layers: 12           # Common to use doubly bigger encoder than decoder in S2T.
+        num_heads: 4
+        embeddings:
+            embedding_dim: 534    # Must be same as the frequency of the filterbank features!
+        # typically ff_size = 4 x hidden_size
+        hidden_size: 256
+        ff_size: 1024
+        dropout: 0.1
+        layer_norm: "pre"
+        # new for S2T:
+        subsample: True           # enable 1d conv module
+        conv_kernel_sizes: [5, 5] # convolution kernel sizes (window width)
+        conv_channels: 512        # convolution channels
+        in_channels: 534           # Must be same as the embedding_dim
+    decoder:
+        type: "transformer"
+        num_layers: 6
+        num_heads: 4
+        embeddings:
+            embedding_dim: 256
+            scale: True
+            dropout: 0.0
+        # typically ff_size = 4 x hidden_size
+        hidden_size: 256
+        ff_size: 1024
+        dropout: 0.1
+        layer_norm: "pre"
+""".format(data_dir=data_path.as_posix(),
+           experiment_dir=experiment_dir.as_posix())
+
+# Path.write_text does not create missing parent directories, so make sure
+# the output directory exists before writing the rendered config.
+data_path.mkdir(parents=True, exist_ok=True)
+(data_path / 'config.yaml').write_text(config)
diff --git a/data_preprocessing.py b/data_preprocessing.py
@@ -0,0 +1,32 @@
+import argparse
+from pathlib import Path
+
+from pose_format import Pose
+from pose_format.utils.generic import pose_normalization_info, correct_wrists, reduce_holistic
+
+
+def preprocess(srcDir, trgDir):
+    """Process every ``.pose`` file in ``srcDir`` and store the result in ``trgDir``.
+
+    Each file is read, passed through ``reduce_holistic`` and
+    ``correct_wrists``, normalized with the info derived from its header, and
+    written to ``trgDir`` under the same file name. ``trgDir`` is created when
+    missing; entries that are not regular ``.pose`` files are skipped.
+
+    :param srcDir: directory containing the input ``.pose`` files
+    :param trgDir: directory receiving the processed files
+    """
+    source_root = Path(srcDir)
+    target_root = Path(trgDir)
+    target_root.mkdir(parents=True, exist_ok=True)
+
+    for entry in source_root.iterdir():
+        if not entry.is_file() or entry.suffix != ".pose":
+            continue
+
+        with open(source_root / entry.name, 'rb') as reader:
+            raw = reader.read()
+        pose = Pose.read(raw)
+
+        reduced = reduce_holistic(pose)
+        # The return value is deliberately not used here; presumably
+        # correct_wrists adjusts the pose in place -- TODO confirm.
+        correct_wrists(reduced)
+        info = pose_normalization_info(reduced.header)
+        normalized = reduced.normalize(info)
+
+        with open(target_root / entry.name, 'w+b') as writer:
+            normalized.write(writer)
+
+
+def main():
+    """Parse ``--srcDir`` / ``--trgDir`` from the command line and run ``preprocess``."""
+    cli = argparse.ArgumentParser()
+    # Both options are mandatory string paths.
+    for flag in ("--srcDir", "--trgDir"):
+        cli.add_argument(flag, required=True, type=str)
+    options = cli.parse_args()
+    preprocess(options.srcDir, options.trgDir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/datasets_pose.py b/datasets_pose.py
@@ -0,0 +1,47 @@
+import numpy as np
+from pose_format import Pose
+import pandas as pd
+from swu_representation import swu2data
+
+# Default frame rate (frames per second) used to turn timestamps into frame indices.
+FrameRate = 29.97003
+
+
+def ms2frame(ms, fps: float = FrameRate) -> int:
+    """Convert a millisecond timestamp into a frame index.
+
+    :param ms: timestamp in milliseconds
+    :param fps: frames per second; defaults to the module-level ``FrameRate``
+    :return: frame index, with the fractional part truncated by ``int``
+    """
+    return int(ms / 1000 * fps)
+
+
+def pose_to_matrix(file_path, start_ms, end_ms):
+    """Load a pose file and return its frames between two timestamps as a 2-D matrix.
+
+    The body data is reshaped to ``(shape[0], shape[2] * shape[3])`` and then
+    sliced along the first axis using the frame indices computed by
+    ``ms2frame``.
+
+    :param file_path: path of the pose file to read
+    :param start_ms: start of the segment in milliseconds (inclusive frame)
+    :param end_ms: end of the segment in milliseconds (exclusive frame)
+    :return: the selected frames, one flattened row per frame
+    """
+    with open(file_path, "rb") as stream:
+        raw = stream.read()
+    data = Pose.read(raw).body.data
+
+    # NOTE(review): assumes a 4-D array whose second axis has size 1
+    # (otherwise the reshape cannot succeed) -- TODO confirm.
+    n_frames = data.shape[0]
+    n_features = data.shape[2] * data.shape[3]
+    flat = data.reshape(n_frames, n_features)
+
+    first = ms2frame(start_ms)
+    last = ms2frame(end_ms)
+    return flat[first:last]
+
+
+def load_dataset(folder_name):
+    """Build the list of samples described by ``<folder_name>/target.csv``.
+
+    For every CSV row, column 0 is the pose file name (relative to
+    ``folder_name``), columns 2 and 3 are the start and end timestamps handed
+    to ``pose_to_matrix``, and column 4 is converted with ``swu2data``.
+
+    :param folder_name: directory containing ``target.csv`` and the pose files
+    :return: list of ``(utt_id, pose_matrix, target)`` tuples, where ``utt_id``
+        is ``"<file name up to the first dot>(<start>)"``
+    """
+    table = pd.read_csv(f'{folder_name}/target.csv')
+    samples = []
+    for row in table.values:
+        file_name, start_ms, end_ms = row[0], row[2], row[3]
+        frames = pose_to_matrix(f'{folder_name}/{file_name}', start_ms, end_ms)
+        # Replace masked entries with zeros to obtain a plain array.
+        frames = frames.filled(fill_value=0)
+        stem = file_name.split('.')[0]
+        samples.append((f'{stem}({start_ms})', frames, swu2data(row[4])))
+    return samples
+
+
+def extract_to_fbank(pose_data, output_path, overwrite: bool = False):
+    """Return ``pose_data``, caching it on disk at ``output_path`` when one is given.
+
+    :param pose_data: array to store
+    :param output_path: path-like object providing ``is_file`` / ``as_posix``,
+        or ``None`` to skip caching entirely
+    :param overwrite: when False and ``output_path`` already exists, the
+        stored array is loaded and returned instead of ``pose_data``
+    :return: the array loaded from disk on a cache hit, otherwise ``pose_data``
+    """
+    if output_path is None:
+        return pose_data
+
+    if output_path.is_file() and not overwrite:
+        # Cache hit: reuse what was saved earlier.
+        return np.load(output_path.as_posix())
+
+    # NOTE: numpy.save appends ".npy" when the name lacks that suffix, so the
+    # check below only holds for paths that already end in ".npy".
+    np.save(output_path.as_posix(), pose_data)
+    assert output_path.is_file(), output_path
+    return pose_data
+
+
+if __name__ == "__main__":
+    # Manual smoke test: load the samples from the local "Dataset" folder and print them.
+    print(load_dataset("Dataset"))
diff --git a/joeynmt/__init__.py b/joeynmt/__init__.py
diff --git a/joeynmt/__main__.py b/joeynmt/__main__.py
@@ -0,0 +1,64 @@
+import argparse
+
+from joeynmt.prediction import test, translate
+from joeynmt.training import train
+
+
+def main():
+    """Command-line entry point: parse the arguments and run the selected mode.
+
+    ``train`` calls ``train``; ``test`` and ``translate`` call the matching
+    prediction function with the config path, checkpoint and output path
+    (``test`` additionally receives the attention/score flags).
+
+    :raises ValueError: if the mode is none of the three known ones
+    """
+    parser = argparse.ArgumentParser("Joey NMT")
+
+    # Positional arguments.
+    parser.add_argument("mode",
+                        choices=["train", "test", "translate"],
+                        help="train a model or test or translate")
+    parser.add_argument("config_path", type=str, help="path to YAML config file")
+
+    # Options used by the prediction modes.
+    parser.add_argument("-c", "--ckpt", type=str, help="checkpoint for prediction")
+    parser.add_argument("-o", "--output_path", type=str,
+                        help="path for saving translation output")
+    parser.add_argument("-a", "--save_attention", action="store_true",
+                        help="save attention visualizations")
+    parser.add_argument("-s", "--save_scores", action="store_true",
+                        help="save scores")
+
+    # Option used by the training mode.
+    parser.add_argument("-t", "--skip_test", action="store_true",
+                        help="Skip test after training")
+
+    opts = parser.parse_args()
+
+    if opts.mode == "train":
+        train(cfg_file=opts.config_path, skip_test=opts.skip_test)
+        return
+
+    # Keyword arguments shared by both prediction modes.
+    shared = dict(
+        cfg_file=opts.config_path,
+        ckpt=opts.ckpt,
+        output_path=opts.output_path,
+    )
+    if opts.mode == "test":
+        test(
+            **shared,
+            save_attention=opts.save_attention,
+            save_scores=opts.save_scores,
+        )
+    elif opts.mode == "translate":
+        translate(**shared)
+    else:
+        raise ValueError("Unknown mode")
+
+
+if __name__ == "__main__":
+    main()