# bert_classifier.py

import argparse
import pathlib
from itertools import product

import numpy as np
import sklearn.metrics as metrics

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # imported for its side effect: registers the custom ops the TF Hub preprocessing models depend on

from adamw_metric import create_optimizer
from epoch_model_checkpoint import save_graph
from f1_score import F1Score
from preprocessing import load_ds, load_env_vars


# Encoder name -> TF Hub handle of the text-preprocessing model to pair with it.
# Every encoder except the cased, multilingual and ALBERT ones is mapped to the
# English uncased BERT preprocessor, so that handle is shared.
_BERT_EN_UNCASED_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12': _BERT_EN_UNCASED_PREPROCESS,
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    # Small-BERT grid: 6 depths x 4 hidden sizes, with one attention head per
    # 64 hidden units. L-12/H-768 is the full-size model already listed above.
    **{
        f'bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}': _BERT_EN_UNCASED_PREPROCESS
        for layers in (2, 4, 6, 8, 10, 12)
        for hidden in (128, 256, 512, 768)
        if (layers, hidden) != (12, 768)
    },
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    **dict.fromkeys(
        ('electra_small', 'electra_base', 'experts_pubmed', 'experts_wiki_books', 'talking-heads_base'),
        _BERT_EN_UNCASED_PREPROCESS,
    ),
}
# Encoder name -> TF Hub handle of the encoder model itself.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    # Small-BERT grid: the handle is the model name under the small_bert/
    # namespace. L-12/H-768 is skipped because the full-size model listed
    # above already owns that key (with a different handle).
    **{
        small_bert: f'https://tfhub.dev/tensorflow/small_bert/{small_bert}/1'
        for small_bert in (
            f'bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}'
            for layers in (2, 4, 6, 8, 10, 12)
            for hidden in (128, 256, 512, 768)
            if (layers, hidden) != (12, 768)
        )
    },
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}


def train_bert(ds_list: list[tf.data.Dataset], logs_dir: pathlib.Path, hparams: dict, **params):
    """Fine-tune a TF Hub encoder as a binary text classifier and log test results.

    The model is: string input -> Hub preprocessing layer -> trainable Hub
    encoder -> dropout on ``pooled_output`` -> single sigmoid unit. After
    training, the test split is evaluated and a training graph plus a text
    report are written under ``logs_dir / 'graphs' / 'bert'``.

    Args:
        ds_list: Datasets indexed as ``[train, validation, test]``. The number
            of training steps is derived from the cardinality of the training
            dataset, so it is presumably already batched -- TODO confirm
            against ``load_ds``.
        logs_dir: Root directory for the output files.
        hparams: One hyperparameter combination. Reads ``PREPROCESS_MODEL``,
            ``EMBED_MODEL``, ``DROPOUT``, ``INITIAL_LEARNING_RATE`` and,
            optionally, ``WARMUP_PERCENTAGE`` (fraction of all training steps
            spent warming up; defaults to 0.1 when absent).
        **params: Must contain ``EPOCHS``.

    Returns:
        Tuple ``(model, history)`` with the trained Keras model and the object
        returned by ``model.fit``.

    Raises:
        ValueError: If the cardinality of the training dataset is unknown or
            infinite, because the learning-rate schedule needs a step count.
    """
    print(hparams)

    train_ds, val_ds, test_ds = ds_list[0], ds_list[1], ds_list[2]
    epochs = params['EPOCHS']

    # cardinality() reports unknown/infinite sizes as negative sentinels; a
    # negative step count would silently corrupt the learning-rate schedule.
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    if steps_per_epoch < 0:
        raise ValueError(
            'Cannot derive the number of training steps: the training dataset has '
            f'unknown or infinite cardinality ({steps_per_epoch}).'
        )
    num_train_steps = steps_per_epoch * epochs

    # Honour the swept warm-up hyperparameter instead of a hard-coded 0.1;
    # the fallback keeps callers that do not pass it working as before.
    warmup_fraction = hparams.get('WARMUP_PERCENTAGE', 0.1)

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

        preprocessing_layer = hub.KerasLayer(hparams['PREPROCESS_MODEL'], name='preprocessing')
        encoder_inputs = preprocessing_layer(text_input)
        encoder = hub.KerasLayer(hparams['EMBED_MODEL'], trainable=True, name='encoder')
        outputs = encoder(encoder_inputs)

        net = outputs['pooled_output']
        net = tf.keras.layers.Dropout(hparams['DROPOUT'])(net)
        net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)

        model = tf.keras.Model(text_input, net, name='bert')

        optimizer = create_optimizer(init_lr=hparams['INITIAL_LEARNING_RATE'],
                                     num_train_steps=num_train_steps,
                                     num_warmup_steps=int(warmup_fraction * num_train_steps))

        model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                      optimizer=optimizer,
                      metrics=[
                          'accuracy',
                          tf.keras.metrics.Precision(name='precision'),
                          tf.keras.metrics.Recall(name='recall'),
                          F1Score(name='f1'),
                      ])

    history = model.fit(x=train_ds,
                        validation_data=val_ds,
                        epochs=epochs,
                        verbose=0)

    # evaluate() returns the loss followed by the compiled metrics in order:
    # [loss, accuracy, precision, recall, f1].
    test_results = model.evaluate(test_ds)
    precision, recall = test_results[2], test_results[3]

    # Product test avoids 0/0 when precision and recall are both zero.
    f1 = 0 if precision * recall == 0 else (2 * precision * recall) / (precision + recall)

    out_dir = logs_dir / 'graphs' / 'bert'
    # Create the directory up front so neither writer fails on a fresh logs dir.
    out_dir.mkdir(parents=True, exist_ok=True)

    stem = f'{f1}-' + '-'.join(map(str, [hparams[k] for k in ['DROPOUT', 'INITIAL_LEARNING_RATE']]))
    save_graph(out_dir / f'{stem}.png', history)
    _save_data(out_dir / f'{stem}.txt', model, test_ds)

    return model, history


def _save_data(save_path: pathlib.Path, model: tf.keras.Model, test_ds: tf.data.Dataset):
    """Write the confusion matrix, precision and recall for ``test_ds`` to a text file.

    Args:
        save_path: Destination file; missing parent directories are created.
        model: Model whose predictions are thresholded at 0.5 into 0/1 labels.
        test_ds: Dataset whose elements are ``(features, label)`` pairs.

    The file holds three newline-terminated entries: the confusion matrix,
    the precision score and the recall score, in that order.
    """
    # NOTE(review): the dataset is traversed twice (labels, then predictions);
    # this assumes it yields elements in the same order on every pass, i.e.
    # the test split is not reshuffled per iteration -- confirm in load_ds.
    labels = np.concatenate(list(test_ds.map(lambda _features, label: label).as_numpy_iterator()))
    predictions = np.where(model.predict(test_ds) > 0.5, 1, 0)

    # Compute everything before touching the file so a metric error cannot
    # leave a truncated report behind.
    report = [
        str(metrics.confusion_matrix(labels, predictions)),
        str(metrics.precision_score(labels, predictions)),
        str(metrics.recall_score(labels, predictions)),
    ]

    pathlib.Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report) + '\n')


def optimize_hyperparameters(ds_list: list[tf.data.Dataset], logs_dir: pathlib.Path, hparams: dict, **params):
    """Train one model for every combination in the hyperparameter grid.

    Args:
        ds_list: Datasets forwarded unchanged to ``train_bert``.
        logs_dir: Log directory forwarded unchanged to ``train_bert``.
        hparams: Maps each hyperparameter name to the list of candidate
            values; the Cartesian product of those lists is explored.
        **params: Extra settings forwarded unchanged to ``train_bert``.

    Returns:
        None. ``train_bert`` writes its own graph and report files.
    """
    names = list(hparams)
    for values in product(*hparams.values()):
        # The return value is deliberately not bound: keeping the model (or the
        # History object, which references it) in a local would hold the
        # previous run's weights in memory while the next model is built.
        train_bert(ds_list, logs_dir, dict(zip(names, values)), **params)


def get_hyperparameters(**settings):
    """Read the hyperparameter grid from the command line.

    Each option accepts zero or more values. The result maps the upper-cased
    option name (``PREPROCESS_MODEL``, ``EMBED_MODEL``, ``DROPOUT``,
    ``INITIAL_LEARNING_RATE``, ``WARMUP_PERCENTAGE``) to its list of values.

    Model entries are resolved before returning: a name ending in ``.tar.gz``
    becomes the POSIX path ``BASE_DIR / MODEL_DIR / <name without suffix>``
    (presumably the directory the archive was extracted to -- confirm); any
    other name is looked up in ``map_model_to_preprocess`` or
    ``map_name_to_handle`` respectively.

    Args:
        **settings: Must contain ``BASE_DIR`` (path-like supporting ``/``) and
            ``MODEL_DIR``.

    Returns:
        Dict of hyperparameter name to list of candidate values.

    Raises:
        KeyError: If a model name is neither an archive nor a known key.
    """
    cli = argparse.ArgumentParser(description='Test hyperparameter combinations.')
    cli.add_argument('--preprocess-model', nargs='*', type=str, default=['bert_en_uncased_L-4_H-512_A-8'])
    cli.add_argument('--embed-model', nargs='*', type=str, default=['bert_en_uncased_L-4_H-512_A-8'])
    cli.add_argument('--dropout', nargs='*', type=float, default=[0.1, 0.2, 0.3, 0.4, 0.5])
    cli.add_argument('-init-lr', '--initial-learning-rate', nargs='*', type=float,
                     default=[1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7])
    cli.add_argument('-wp', '--warmup-percentage', nargs='*', type=float, default=[0.1])

    hparams = {}
    for option, values in vars(cli.parse_args()).items():
        hparams[option.upper()] = values

    model_dir = settings['BASE_DIR'] / settings['MODEL_DIR']
    archive_suffix = '.tar.gz'

    preprocess_handles = []
    for name in hparams['PREPROCESS_MODEL']:
        if name.endswith(archive_suffix):
            preprocess_handles.append((model_dir / name[:-len(archive_suffix)]).as_posix())
        else:
            preprocess_handles.append(map_model_to_preprocess[name])
    hparams['PREPROCESS_MODEL'] = preprocess_handles

    encoder_handles = []
    for name in hparams['EMBED_MODEL']:
        if name.endswith(archive_suffix):
            encoder_handles.append((model_dir / name[:-len(archive_suffix)]).as_posix())
        else:
            encoder_handles.append(map_name_to_handle[name])
    hparams['EMBED_MODEL'] = encoder_handles

    return hparams


def _main():
    """Script entry point: load settings and data, then run the grid search."""
    settings, params = load_env_vars()
    hparams = get_hyperparameters(**settings)

    print(hparams)

    base_dir = settings['BASE_DIR']
    raw_datasets = load_ds(base_dir / settings['DATA_DIR'], is_xlsx=False, prefix='sen_', **params)

    # Shard by data (not by file) when the datasets are distributed.
    shard_options = tf.data.Options()
    shard_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

    ds_list = []
    for ds in raw_datasets:
        ds_list.append(ds.with_options(shard_options).cache().prefetch(tf.data.AUTOTUNE))

    optimize_hyperparameters(ds_list, base_dir / settings['LOGS_DIR'], hparams, **params)


if __name__ == '__main__':
    _main()