Merge pull request #11 from sign-language-processing/pose_to_signwrit…

…ing_static_pretraining online running update bash files
sign-language-processing · Jun 22, 2024 · 97d885c · 97d885c
2 parents 2c18ffa + ffc2548
commit 97d885c
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 10 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ pose_to_signwriting = [
      # Uploads results to Google Sheets
     "google-auth-oauthlib",
     "google-api-python-client",
+    "ruamel.yaml",
 ]
 pose_to_vq_to_signwriting = [
     "sign-vq @ git+https://github.com/sign-language-processing/sign-vq.git" # Used for getting codes from poses

diff --git a/signwriting_transcription/pose_to_signwriting/bin.py b/signwriting_transcription/pose_to_signwriting/bin.py
@@ -73,8 +73,9 @@ def preprocessing_signs(preprocessed_pose: Pose, sign_annotations: list, strateg
         else:  # tight strategy - add padding(PADDING_PACTOR) to the tight segment
             # add padding to the segment by the distance between the segments
             np_pose, frame_rate = pose_to_matrix(preprocessed_pose)
-            pose_ndarray_to_matrix(np_pose, sign_start - (sign_start - start_point) * PADDING_PACTOR, frame_rate,
-                                   sign_end + (end_point - sign_end) * PADDING_PACTOR).filled(fill_value=0)
+            np_pose = (pose_ndarray_to_matrix(np_pose, sign_start - (sign_start - start_point) * PADDING_PACTOR,
+                                              frame_rate, sign_end + (end_point - sign_end) * PADDING_PACTOR)
+                       .filled(fill_value=0))
             start_point = sign_end
         pose_path = temp_path / f'{index}.npy'
         np.save(pose_path, np_pose)

diff --git a/signwriting_transcription/pose_to_signwriting/data/config.py b/signwriting_transcription/pose_to_signwriting/data/config.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 
-def create_config(data_path="/output/poses", experiment_dir='/model/poses'):
+def create_config(data_path="/output/poses", experiment_dir='/model/poses', test_eval_matrices='False'):
     data_path = Path(data_path)
     experiment_dir = Path(experiment_dir)
 
@@ -51,7 +51,7 @@ def create_config(data_path="/output/poses", experiment_dir='/model/poses'):
                 pretokenize: "none"
 
     testing:
-        eval_all_metrics: False
+        eval_all_metrics: {test_eval_matrices}
         n_best: 1
         beam_size: 5
         beam_alpha: 1.0
@@ -70,17 +70,17 @@ def create_config(data_path="/output/poses", experiment_dir='/model/poses'):
         adam_betas: [0.9, 0.98] 
         scheduling: "plateau"
         patience: 10
-        learning_rate: 0.0002
-        learning_rate_min: 0.00000001
+        learning_rate: 0.00015
+        learning_rate_min: 0.000000015
         weight_decay: 0.0
         label_smoothing: 0.1
         loss: "crossentropy-ctc"       # use CrossEntropyLoss + CTCLoss
         ctc_weight: 0.3                # ctc weight in interpolation
         batch_size: 4                  # much bigger than text! your "tokens" are "frames" now.
         batch_type: "sentence"
         batch_multiplier: 1
-        # early_stopping_metric:       # by default, early stopping uses "fsw_eval" metric
-        epochs: 15                     # Decrease for when playing around and checking of working.
+        early_stopping_metric: chrf        # by default, early stopping uses "fsw_eval" metric
+        epochs: 100                     # Decrease for when playing around and checking of working.
         validation_freq: 1000          # Set to at least once per epoch.
         logging_freq: 100
         model_dir: "{experiment_dir}"
@@ -128,7 +128,8 @@ def create_config(data_path="/output/poses", experiment_dir='/model/poses'):
             dropout: 0.1
             layer_norm: "pre"
     """.format(data_dir=data_path.as_posix(),
-               experiment_dir=experiment_dir.as_posix())
+               experiment_dir=experiment_dir.as_posix(),
+               test_eval_matrices=test_eval_matrices)
 
     (data_path / 'config.yaml').write_text(config)
 
@@ -267,8 +268,9 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--data-path", "-d", required=True, type=str)
     parser.add_argument("--experiment-dir", "-e", required=True, type=str)
+    parser.add_argument("--test-eval-matrices", required=False, default='False')
     args = parser.parse_args()
-    create_config(args.data_path, args.experiment_dir)
+    create_config(args.data_path, args.experiment_dir, args.test_eval_matrices)
 
 
 if __name__ == '__main__':

diff --git a/signwriting_transcription/pose_to_signwriting/pipeline.sh b/signwriting_transcription/pose_to_signwriting/pipeline.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Clone the repository
+git clone https://github.com/sign-language-processing/signwriting-transcription.git
+cd signwriting-transcription
+
+# Install the required packages
+pip install .[dev,pose_to_signwriting]
+
+# Download and unzip the transcription data set
+wget -O transcription.zip "https://firebasestorage.googleapis.com/v0/b/sign-language-datasets/o/poses%2Fholistic%2Ftranscription.zip?alt=media"
+unzip transcription.zip -d transcription_data_set
+
+# Run preprocessing script
+python signwriting_transcription/pose_to_signwriting/data/preprocessing.py --src-dir transcription_data_set --trg-dir normalized_data_set --normalization True
+
+# Prepare segmentation data set
+mkdir -p segment_data_set
+cp data/data_segmentation.csv segment_data_set/target.csv
+cp data/data.csv normalized_data_set/target.csv
+
+# Run prepare_poses script
+python signwriting_transcription/pose_to_signwriting/data/prepare_poses.py \
+  --dataset-root normalized_data_set \
+  --data-root vectorized_data_set \
+  --dataset-name poses \
+  --tokenizer-type pose-vpf \
+  --data-segment segment_data_set
+
+# Run config script
+python signwriting_transcription/pose_to_signwriting/data/config.py --data-path vectorized_data_set/poses --experiment-dir experiment
+
+# Prepare experiment directory
+mkdir -p experiment
+cp vectorized_data_set/poses/config.yaml experiment/config.yaml
+
+# Run training script
+python signwriting_transcription/pose_to_signwriting/joeynmt_pose/training.py vectorized_data_set/poses/config.yaml
+
+# Download token.json
+wget 'https://drive.google.com/uc?export=download&id=1EwgVIAxa_VcPWMtaFXru19ZBqc8NPq8K' -O signwriting_transcription/pose_to_signwriting/joeynmt_pose/token.json
+
+# Modify the config.yaml file to set eval_all_metrics to True
+python signwriting_transcription/pose_to_signwriting/data/config.py --data-path experiment --experiment-dir experiment --test-eval-matrices True
+
+# Run prediction script
+python signwriting_transcription/pose_to_signwriting/joeynmt_pose/prediction.py experiment/config.yaml test none
diff --git a/signwriting_transcription/pose_to_signwriting/start.sh b/signwriting_transcription/pose_to_signwriting/start.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# File to store the output
+OUTPUT_FILE="output.log"
+
+# Run the run_bash.sh script, capture both stdout and stderr
+# Display the output on the screen and write to the file
+./pipeline.sh | tee -a $OUTPUT_FILE