From 8ba420603466037f5f1e2d2af4083e275981e25d Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 15:20:58 +0300
Subject: [PATCH 01/19] Optimization API config files & model attribute builder

---
 hls4ml/optimization/__init__.py             |   0
 hls4ml/optimization/attributes.py           | 204 ++++++++++++++++++++
 hls4ml/optimization/config.py               |  43 +++++
 hls4ml/optimization/keras/__init__.py       |   0
 hls4ml/optimization/keras/config.py         |  25 +++
 test/pytest/optimization/test_attributes.py |  83 ++++++++
 6 files changed, 355 insertions(+)
 create mode 100644 hls4ml/optimization/__init__.py
 create mode 100644 hls4ml/optimization/attributes.py
 create mode 100644 hls4ml/optimization/config.py
 create mode 100644 hls4ml/optimization/keras/__init__.py
 create mode 100644 hls4ml/optimization/keras/config.py
 create mode 100644 test/pytest/optimization/test_attributes.py

diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/attributes.py
new file mode 100644
index 0000000000..d19572163b
--- /dev/null
+++ b/hls4ml/optimization/attributes.py
@@ -0,0 +1,204 @@
+import hls4ml
+import numpy as np
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS
+from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType
+
+class hls4mlAttributes:
+    '''
+    A class for storing hls4ml information of a single layer
+
+    Args:
+    - n_in (int): Number of inputs (rows) for Dense matrix multiplication
+    - n_out (int): Number of outputs (cols) for Dense matrix multiplication
+    - io_type (string): io_parallel or io_stream
+    - strategy (string): Resource or Latency
+    - weight_precision (FixedPrecisionType): Layer weight precision
+    - output_precision (FixedPrecisionType): Layer output precision
+    - reuse_factor (int): Layer reuse factor
+    - parallelization_factor (int): Layer parallelization factor - [applicable to io_parallel Conv2D]
+    '''
+    def __init__(self, n_in, n_out, io_type, strategy, weight_precision, output_precision, reuse_factor, parallelization_factor=1):    
+        if not isinstance(weight_precision, (FixedPrecisionType, IntegerPrecisionType)):
+            raise Exception('Layer weight precision is not in valid format')
+    
+        if not isinstance(output_precision, (FixedPrecisionType, IntegerPrecisionType)):
+            raise Exception('Layer weight precision is not in valid format')
+
+        if not strategy in ('Latency', 'latency', 'Resource', 'resource'):
+            raise Exception('Unknown layer strategy')
+        
+        if not io_type in ('io_parallel', 'io_stream'):
+            raise Exception('Unknown IO type')
+        
+        self.n_in = n_in
+        self.n_out = n_out
+        self.io_type = io_type
+        self.strategy = strategy
+        self.weight_precision = weight_precision
+        self.output_precision = output_precision
+        self.reuse_factor = reuse_factor
+        self.parallelization_factor = parallelization_factor
+
+class OptimizationAttributes:
+    '''
+    A class for storing layer optimization attributes
+    
+    Args:
+        - structure_type (enum): Targeted structure - unstructured, structured, pattern, block
+        - pruning (boolean): Should pruning be applied to the layer
+        - weight_sharing (boolean): Should weight sharing be applied to the layer
+        - block_shape (tuple): Block shape if structure_type == block
+        - pattern_offset (int): Length of each pattern if structure_type == pattern 
+        - consecutive_patterns (int): How many consecutive patterns are grouped together if structure_type == pattern 
+    
+    Notes:
+        - In the case of hls4ml, pattern_offset is equivalent to the number of weights processed in parallel
+        - The pattern_offset is n_in * n_out / reuse_factor; default case (=1) is equivalent to no unrolling
+    '''
+    def __init__(self, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=False, weight_sharing=False, block_shape=(1, 1), pattern_offset=1, consecutive_patterns=1):
+        if not isinstance(structure_type, SUPPORTED_STRUCTURES):
+            raise Exception(f'{self.__class__.__name__} unknown structure type')
+    
+        self.structure_type = structure_type
+        self.pruning = pruning
+        self.weight_sharing = weight_sharing
+        self.block_shape = block_shape
+        self.pattern_offset = pattern_offset
+        self.consecutive_patterns = consecutive_patterns
+
+class LayerAttributes:
+    '''
+    A class for storing layer information
+    
+    Args:
+        - name (string): Layer name
+        - layer_type (keras.Layer): Layer type (e.g. Dense, Conv2D etc.)
+        - inbound_layers (list): List of parent nodes, identified by name
+        - weight_shape (tuple): Layer weight shape
+        - input_shape (tuple): Layer input shape
+        - output_shape (tuple): Layer output shape
+        - optimizable (bool): Should optimizations (pruning, weight sharing) be applied to this layer
+        - optimization_attributes (OptimizationAttributes): Type of optimization, optimization vs weight sharing, block shape and pattern offset
+        - args (dict): Additional information, e.g. hls4mlAttributes; dictionary so it can be generic enough for different platforms
+    '''
+    def __init__(self, name, layer_type, inbound_layers, weight_shape, input_shape,output_shape,optimizable, optimization_attributes, args):
+        self.name = name
+        self.layer_type = layer_type
+        self.inbound_layers = inbound_layers
+        self.weight_shape = weight_shape
+        self.input_shape = input_shape
+        self.output_shape = output_shape
+        self.optimizable = optimizable
+        self.optimization_attributes = optimization_attributes 
+        self.args = args
+
+    def update_args(self, updates):
+        self.args.update(updates)
+
+    def __str__(self):
+        return  f'name: {self.name}, ' \
+                f'layer_type: {self.layer_type}, ' \
+                f'inbound_layers: {self.inbound_layers}, ' \
+                f'weight_shape: {self.weight_shape}, ' \
+                f'input_shape: {self.input_shape}, ' \
+                f'output_shape: {self.output_shape}, ' \
+                f'optimizable: {self.optimizable}, ' \
+                f'optimization_attributes: {self.optimization_attributes}, ' \
+                f'args: {self.args}, ' \
+
+def get_attributes_from_keras_model(model):
+    '''
+    Given a Keras model, builds a dictionary of class attributes
+    Additional arguments (e.g. reuse factor), depend on the target hardware platform and are inserted later
+    Per-layer pruning sype (structured, pattern etc.), depend on the pruning objective and are inserted later
+    
+    Args:
+        - model (keras.model): Model to extract attributes from
+
+    Return:
+        - model_attributes (dict): Each key corresponds to a layer name, values are instances of LayerAttribute 
+    '''
+    is_sequential =  model.__class__.__name__ == 'Sequential'
+    model_attributes = {}
+
+    for i, layer in enumerate(model.layers):
+        inbound_layers = []
+        if is_sequential and i > 0:
+            inbound_layers.append(model.layers[i-1])
+        elif not is_sequential:
+            nodes = model.get_config()['layers'][i]['inbound_nodes']
+            if len(nodes) > 0:
+                inbound_layers.append(node[0] for node in nodes[0])
+        
+        layer_weights = layer.get_weights()
+        weight_shape = layer_weights[0].shape if len(layer_weights) > 0 else ()
+
+        model_attributes[layer.name] = LayerAttributes(
+            layer.name, layer.__class__, inbound_layers,
+            weight_shape, layer.input_shape[1:], layer.output_shape[1:],
+            False, OptimizationAttributes(), {}
+        )
+    
+    return model_attributes
+
+def get_attributes_from_keras_model_and_hls4ml_config(model, config):
+    '''
+    Given a Keras model and hls4ml configuration, builds a dictionary of class attributes
+    Per-layer pruning sype (structured, pruning etc.), depend on the pruning objective and are inserted later
+    
+    Args:
+        - model (keras.model): Model to extract attributes from
+        - config (dict): hls4ml dictionary 
+    
+    Return:
+        - model_attributes (dict): Each key corresponds to a layer name, values are LayerAttribute instances 
+    '''
+
+    # Extract Keras attributes
+    model_attributes = get_attributes_from_keras_model(model)
+
+    # Extract hls4ml attributes
+    io_type = config['IOType']
+    default_reuse_factor = config['Model']['ReuseFactor']
+    default_strategy = config['Model']['Strategy']
+    default_precision = config['Model']['Precision']
+    
+    # Build dictionary
+    for layer in model_attributes:
+        if model_attributes[layer].layer_type in SUPPORTED_LAYERS:
+            n_in, n_out = __get_layer_mult_size(model_attributes[layer])
+            layer_config = config['LayerName'][layer] if layer in config['LayerName'] else {}
+            reuse_factor = layer_config['ReuseFactor'] if 'ReuseFactor' in layer_config else default_reuse_factor 
+            parallelization_factor =  layer_config['ParallelizationFactor'] if 'ParallelizationFactor' in layer_config else 1
+            strategy = layer_config['Strategy'] if 'Strategy' in layer_config else default_strategy
+            weight_precision = layer_config['Precision']['weight'] if 'weight' in layer_config['Precision'] else default_precision
+            weight_precision = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(weight_precision)
+            output_precision = layer_config['Precision']['result'] if 'result' in layer_config['Precision'] else default_precision
+            output_precision = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(output_precision)
+        
+            hls4ml_attributes = hls4mlAttributes(
+                n_in, n_out,
+                io_type, strategy,
+                weight_precision, output_precision,
+                reuse_factor, parallelization_factor
+            )
+            model_attributes[layer].update_args({'hls4ml_attributes': hls4ml_attributes})
+    
+    return model_attributes
+
+def __get_layer_mult_size(attributes):
+    '''
+    Helper function to calculate layer multiplication size
+    '''
+    if 'Dense' in attributes.layer_type.__name__:
+        n_in = np.prod(attributes.input_shape)
+        n_out = np.prod(attributes.output_shape)
+        return n_in, n_out
+
+    if 'Conv' in attributes.layer_type.__name__:
+        n_in = np.prod(attributes.weight_shape[0:-2])
+        n_out = attributes.weight_shape[-1]
+        return n_in, n_out
+
+    raise Exception(f'Cannot get mult size for layer {attributes.name}')
diff --git a/hls4ml/optimization/config.py b/hls4ml/optimization/config.py
new file mode 100644
index 0000000000..fb4e9c4de0
--- /dev/null
+++ b/hls4ml/optimization/config.py
@@ -0,0 +1,43 @@
+from enum import Enum
+
+'''
+A list of currently supported structures when optimizing (pruning, weight sharing)
+For more information, see attributes.py
+
+1. Unstructured:
+    - Pruning: Y
+    - Weight sharing: N
+    - Description: Removes (zeroes out) individual weights
+    - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+2. Structured:
+    - Pruning: Y
+    - Weight sharing: Y
+    - Description: Zeroes out or quantizes all the weights in a structure:
+        - Dense: Neurons, determined by their outgoing connections (columns in Keras weight tensors)
+        - Conv2D: Filters (structures of size filt_width x filt_height x n_chan)
+        - Notes: 
+            - For Dense, it was also possible optimize by incoming connections (rows); however, removing zero neurons becomes harder
+            - For Conv2D, significant literature explored pruning channels; currently not supported
+    - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+3. Pattern:
+    - Pruning: Y
+    - Weight sharing: Y
+    - Description: Zeroes out or quantizes all the weights in a group.
+       Groups are determined by a variable, n, and every n-th weight in the flattened, transposed (Resource) weight tensor is collected and stored in the same group
+       Equivalent to pruning/quantizing weight processed by the same DSP in hls4ml
+    - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+4. Block:
+    - Pruning: Y
+    - Weight sharing: Y
+    - Description: Zeroes out or quantizes all the weights in a block of size (w, h)
+    - Supports: All rank-2 (e.g. Dense, but not Conv2D) layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+'''
+class SUPPORTED_STRUCTURES(Enum):
+    UNSTRUCTURED = 'unstructured'
+    STRUCTURED   = 'structured'
+    PATTERN      = 'pattern'
+    BLOCK        = 'block' 
diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/keras/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/optimization/keras/config.py b/hls4ml/optimization/keras/config.py
new file mode 100644
index 0000000000..1d651cd480
--- /dev/null
+++ b/hls4ml/optimization/keras/config.py
@@ -0,0 +1,25 @@
+from qkeras import QDense, QConv2D
+from tensorflow.keras.layers import Dense, Conv2D
+
+'''
+Optimizable layers in Keras / QKeras
+Any new layers need to be registered here first
+Additional logic in the source files may need to be written (e.g. recurrent layers should also optimize recurrent kernels)
+'''
+SUPPORTED_LAYERS = (Dense, Conv2D, QDense, QConv2D)
+
+
+'''
+Supported ranking metrics, for classifying redundant (groups of) weights
+
+1. l1 - groups of weights are ranked by their l1 norm
+2. l2 - groups of weights are ranked by their l2 norm
+3. oracle - abs(dL / dw * w), introduced by Molchanov et al. (2016) Pruning Convolutional Neural Networks for Resource Efficient Inference
+4. saliency - (d^2L / dw^2 * w)^2, introduced by Lecun et al. (1989) Optimal Brain Damage
+'''
+SUPPORTED_METRICS = ('l1', 'l2', 'oracle', 'saliency')
+
+'''
+Temporary directory for storing best models, tuning results etc.
+'''
+TMP_DIRECTORY = 'hls4ml-optimization-keras'
diff --git a/test/pytest/optimization/test_attributes.py b/test/pytest/optimization/test_attributes.py
new file mode 100644
index 0000000000..c6991be7e4
--- /dev/null
+++ b/test/pytest/optimization/test_attributes.py
@@ -0,0 +1,83 @@
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D, Flatten, ReLU
+from hls4ml.utils.config import config_from_keras_model
+from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
+
+def test_attributes():
+    dense_units = 16
+    conv_filters = 6
+    conv_channels = 3
+    conv_shape = (3, 3)
+    input_shape = (8, 8)
+    io_type = 'io_parallel'
+    strategy = 'Resource'
+    
+    model = Sequential()
+    model.add(Conv2D(conv_filters, input_shape=(*input_shape, conv_channels), kernel_size=conv_shape, name='conv2d', padding='same', kernel_initializer='ones'))
+    model.add(Flatten(name='flatten'))
+    model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
+    model.add(ReLU(name='relu'))
+
+    default_reuse_factor = 2 
+    default_precision = 'ap_fixed<8, 0>'
+    cfg = config_from_keras_model(model, granularity='name', default_precision=default_precision, default_reuse_factor=default_reuse_factor)    
+    cfg['IOType'] = io_type
+    cfg['Model']['Strategy'] = strategy
+    cfg['LayerName']['dense']['ReuseFactor'] = 1
+
+    # Verify correct information for every layer
+    model_attributes = get_attributes_from_keras_model_and_hls4ml_config(model, cfg)
+    assert(len(model_attributes) == 4)
+
+    # conv2d
+    assert(model_attributes['conv2d'].name == 'conv2d')
+    assert(model_attributes['conv2d'].layer_type.__name__ == 'Conv2D')
+    assert(model_attributes['conv2d'].inbound_layers == [])
+    assert(model_attributes['conv2d'].weight_shape == (3, 3, 3, 6))
+    assert(model_attributes['conv2d'].input_shape == (8, 8, 3))
+    assert(model_attributes['conv2d'].output_shape == (8, 8, 6))
+    assert(model_attributes['conv2d'].optimizable == False)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].n_in == 9)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].n_out == 6)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].io_type == io_type)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].strategy == strategy)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].reuse_factor == default_reuse_factor)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].weight_precision.width == 8)
+    assert(model_attributes['conv2d'].args['hls4ml_attributes'].parallelization_factor == 1)
+
+    # flatten
+    assert(model_attributes['flatten'].name == 'flatten')
+    assert(model_attributes['flatten'].layer_type.__name__ == 'Flatten')
+    assert(model_attributes['flatten'].weight_shape == ())
+    assert(model_attributes['flatten'].input_shape == (8, 8, 6))
+    assert(model_attributes['flatten'].output_shape == (384, ))
+    assert(model_attributes['flatten'].optimizable == False)
+    
+    # Flatten is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
+    assert('hls4ml_attributes' not in model_attributes['flatten'].args) 
+
+    # dense
+    assert(model_attributes['dense'].name == 'dense')
+    assert(model_attributes['dense'].layer_type.__name__ == 'Dense')
+    assert(model_attributes['dense'].weight_shape == (384, 16))
+    assert(model_attributes['dense'].input_shape == (384, ))
+    assert(model_attributes['dense'].output_shape == (16, ))
+    assert(model_attributes['dense'].optimizable == False)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].n_in == 384)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].n_out == 16)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].io_type == io_type)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].strategy == strategy)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].reuse_factor == 1)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].output_precision.width == 8)
+    assert(model_attributes['dense'].args['hls4ml_attributes'].parallelization_factor == 1)
+
+    # relu
+    assert(model_attributes['relu'].name == 'relu')
+    assert(model_attributes['relu'].layer_type.__name__ == 'ReLU')
+    assert(model_attributes['relu'].weight_shape == ())
+    assert(model_attributes['relu'].input_shape == (16, ))
+    assert(model_attributes['relu'].output_shape == (16, ))
+    assert(model_attributes['relu'].optimizable == False)
+
+    # ReLU is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
+    assert('hls4ml_attributes' not in model_attributes['relu'].args) 

From 12fba05da78572c6a654045c2e2199167298da06 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 15:26:15 +0300
Subject: [PATCH 02/19] Optimization sparsity schedulers

---
 hls4ml/optimization/scheduler.py           | 128 +++++++++++++++++++++
 test/pytest/optimization/test_scheduler.py |  74 ++++++++++++
 2 files changed, 202 insertions(+)
 create mode 100644 hls4ml/optimization/scheduler.py
 create mode 100644 test/pytest/optimization/test_scheduler.py

diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/scheduler.py
new file mode 100644
index 0000000000..fe02688fd7
--- /dev/null
+++ b/hls4ml/optimization/scheduler.py
@@ -0,0 +1,128 @@
+from abc import ABC, abstractmethod
+
+class OptimizationScheduler(ABC):
+    '''
+    Baseline class handling logic regarding target sparsity and its updates at every step
+    '''
+    
+    def __init__(self, initial_sparsity=0, final_sparsity=1):
+        '''
+        intial_sparsity and final_sparsity are between 0.0 and 1.0, NOT 0% and 100%
+        '''
+        if (initial_sparsity < 0 or initial_sparsity > 1):
+            raise Exception('intial_sparsity must be between 0.0 and 1.0')
+        if (final_sparsity < 0 or final_sparsity > 1):
+            raise Exception('final_sparsity must be between 0.0 and 1.0')
+        self.sparsity = initial_sparsity
+        self.lower_bound = initial_sparsity
+        self.upper_bound = final_sparsity
+    
+    @abstractmethod
+    def update_step(self):
+        '''
+        Increments the current sparsity, according to the rule, examples:
+            - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55
+            - BinaryScheduler, sparsity = 0.5, target = 1.0 -> sparsity = 0.75
+        
+        Return:
+            - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
+            - sparsity (float) - Updated sparsity
+        '''
+        pass
+
+    @abstractmethod
+    def repair_step(self):
+        '''
+        Method used when the neural architecture does not meet satisfy performance requirement for a given sparsity
+        Then, the target sparsity is decreased according to the rule, examples:
+            - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55 [see ConstantScheduler for explanation]
+            - BinaryScheduler, sparsity = 0.75, target = 1.0, previous = 0.5 -> sparsity = (0.5 + 0.75) / 2 = 0.625
+        
+        Return:
+            - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
+            - sparsity (float) - Updated sparsity
+        '''
+        pass
+
+    def get_sparsity(self):
+        return self.sparsity
+
+class ConstantScheduler(OptimizationScheduler):
+    '''
+    Sparsity updated by a constant term, until (i) sparsity target reached (ii) optimization algorithm stops requesting state updates
+    '''
+    def __init__(self, initial_sparsity=0, final_sparsity=1.0, update_step=0.05):
+        self.increment = update_step
+        super(ConstantScheduler, self).__init__(initial_sparsity, final_sparsity)
+    
+    def update_step(self):
+        if self.sparsity + self.increment <= self.upper_bound:
+            self.sparsity += self.increment
+            return True, self.sparsity
+        else:
+            return False, self.sparsity
+
+    '''
+    In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
+    In this case, constant sparsity (since it increments by a small amount every time), will simply jump to the next sparsity level
+    The model's performance over several sparsity levels optimization is stoped after high loss over several trials (see top level pruning/optimization function)
+    
+    '''
+    def repair_step(self):
+        return self.update_step()
+    
+class BinaryScheduler(OptimizationScheduler):
+    '''
+    Sparsity updated by binary halving the search space; constantly updates lower and upper bounds
+    In the update step, sparsity is incremented, as the midpoint between previous sparsity and target sparsity (upper bound)
+    In the repair step, sparsity is decrement, as the midpoint between between the lower bound and previous sparsity
+    '''
+    def __init__(self, initial_sparsity=0, final_sparsity=1.0, threshold=0.01):
+        self.threshold = threshold
+        super(BinaryScheduler, self).__init__(initial_sparsity, final_sparsity)
+    
+    def update_step(self):
+        if self.upper_bound - self.sparsity >= self.threshold:
+            self.lower_bound = self.sparsity
+            self.sparsity = 0.5 * (self.lower_bound + self.upper_bound)
+            return True, self.sparsity
+        else:
+            self.lower_bound = self.sparsity
+            return False, self.sparsity
+
+    def repair_step(self):
+        if self.sparsity - self.lower_bound >= self.threshold:
+            self.upper_bound = self.sparsity
+            self.sparsity = 0.5 * (self.lower_bound + self.upper_bound)
+            return True, self.sparsity
+        else:
+            self.upper_bound = self.sparsity
+            return False, self.sparsity
+
+class PolynomialScheduler(OptimizationScheduler):
+    '''
+    Sparsity updated by at a polynomial decay, until (i) sparsity target reached (ii) optimization algorithm stops requesting state updates
+    For more information, see Zhu & Gupta (2016) - 'To prune, or not to prune: exploring the efficacy of pruning for model compression' and TensorFlow Model Optimization library
+    Note, the implementation is slightly different, as TensorFlow Prune API depends on the total number of epochs and update frequency
+    '''
+    def __init__(self, maximum_steps, initial_sparsity=0, final_sparsity=1.0, decay_power=3):
+        self.decay_power = decay_power
+        self.current_step = 0
+        self.maximum_steps = maximum_steps
+        super(PolynomialScheduler, self).__init__(initial_sparsity, final_sparsity)
+    
+    def update_step(self):
+        if self.current_step < self.maximum_steps:
+            self.current_step += 1
+            self.sparsity = self.upper_bound + (self.lower_bound - self.upper_bound) * ((1 - self.current_step / self.maximum_steps) ** self.decay_power)
+            return True, self.sparsity
+        else:
+            return False, self.sparsity
+        
+    '''
+    In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
+    In this case, polynomial sparsity, will simply jump to the next sparsity level
+    The model's performance over several sparsity levels optimization is stoped after high loss over several trials (see top level pruning/optimization function)
+    '''
+    def repair_step(self):
+        return self.update_step()
diff --git a/test/pytest/optimization/test_scheduler.py b/test/pytest/optimization/test_scheduler.py
new file mode 100644
index 0000000000..d16d15da73
--- /dev/null
+++ b/test/pytest/optimization/test_scheduler.py
@@ -0,0 +1,74 @@
+import numpy as np  # Use np.testing.assert_allclose due to floating point rounding errors
+from hls4ml.optimization.scheduler import ConstantScheduler, BinaryScheduler, PolynomialScheduler
+
+def test_constant_scheduler():
+    initial_sparsity =  0.25
+    update_step = 0.10
+    target_sparsity = initial_sparsity + 2.5 * update_step
+
+    # Assert initial sparsity correct
+    scheduler = ConstantScheduler(initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, update_step=update_step)
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
+
+    # Assert update step is correct
+    np.testing.assert_allclose(scheduler.update_step(), (True, initial_sparsity + update_step))
+
+    # Assert repair step is correct
+    np.testing.assert_allclose(scheduler.repair_step(), (True, initial_sparsity + 2 * update_step))
+
+    # Assert cannot update again, since it would go over target sparsity
+    np.testing.assert_allclose(scheduler.update_step(), (False, initial_sparsity + 2 * update_step))
+
+    # Assert final (achievable) sparsity is correct
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity + 2 * update_step)
+
+def test_binary_scheduler():
+    initial_sparsity =  0.25
+    target_sparsity = 0.5
+    threshold = 0.05
+
+    # Assert initial sparsity correct
+    scheduler = BinaryScheduler(initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, threshold=threshold)
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
+
+    # Assert 1st update step is correct
+    s1 = 0.5 * (initial_sparsity + target_sparsity)
+    np.testing.assert_allclose(scheduler.update_step(), (True, s1))
+
+    # Assert 1st repair step is correct
+    s2 = 0.5 * (initial_sparsity + s1)
+    np.testing.assert_allclose(scheduler.repair_step(), (True, s2))
+
+    # Assert 2nd update step is correct
+    s3 = 0.5 * (s2 + s1)
+    np.testing.assert_allclose(scheduler.update_step(), (True, s3))
+
+    # Assert 2nd repair step doest not take place, difference < threshold
+    np.testing.assert_allclose(scheduler.repair_step(), (False, s3))
+
+    # Assert final (achievable) sparsity is correct
+    np.testing.assert_allclose(scheduler.get_sparsity(), s3)
+
+def test_polynomial_scheduler():
+    decay_power = 2
+    maximum_steps = 2
+    initial_sparsity =  0.25
+    target_sparsity = 0.5
+  
+    # Assert initial sparsity correct
+    scheduler = PolynomialScheduler(maximum_steps, initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, decay_power=decay_power)
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
+
+    # Assert 1st update step is correct
+    s1 =  target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 1 / maximum_steps) ** decay_power)
+    np.testing.assert_allclose(scheduler.update_step(), (True, s1))
+
+    # Assert 1st repair step is correct
+    s2 =  target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 2 / maximum_steps) ** decay_power)
+    np.testing.assert_allclose(scheduler.repair_step(), (True, s2))
+
+    # Assert 2nd update step does not occur, since current_step = maximum_steps
+    np.testing.assert_allclose(scheduler.update_step(), (False, s2))
+
+    # Assert final (achievable) sparsity is correct
+    np.testing.assert_allclose(scheduler.get_sparsity(), target_sparsity)

From d95c956217694db284d9a3774da069090250a292 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 15:32:22 +0300
Subject: [PATCH 03/19] Utils & regularizers for Keras optimization

---
 hls4ml/optimization/keras/regularizers.py     | 227 ++++++++++++++++++
 hls4ml/optimization/keras/utils.py            | 101 ++++++++
 .../optimization/keras/test_regularizers.py   | 151 ++++++++++++
 3 files changed, 479 insertions(+)
 create mode 100644 hls4ml/optimization/keras/regularizers.py
 create mode 100644 hls4ml/optimization/keras/utils.py
 create mode 100644 test/pytest/optimization/keras/test_regularizers.py

diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/keras/regularizers.py
new file mode 100644
index 0000000000..18c72a1a9c
--- /dev/null
+++ b/hls4ml/optimization/keras/regularizers.py
@@ -0,0 +1,227 @@
+import numpy as np
+import tensorflow as tf
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+
+@tf.keras.utils.register_keras_serializable(name='DenseRegularizer')
+class DenseRegularizer(tf.keras.regularizers.Regularizer):
+    '''
+    A flexible regularizer for Dense layers, simultaneously penalizing high values and variance
+
+    Args:
+        - alpha (float): Sparse penalty; a higher value pushes more weights towards zero
+        - beta (float): Variance penalty; a higer value reduces variance between a group of weights
+        - norm (int): Norm type (l1 or l2)
+        - structure_type (string): Type of regularisation - unstructured, structured, pattern, block
+        - block_shape (tuple): Block shape if structure_type == block
+        - pattern_offset (int): Length of each pattern if structure_type == pattern
+        - consecutive_patterns (int): How many consecutive patterns should be considered
+        - weights (tf.Variable): Two-dimensional layer weight tensor, dimensionality (M x N)
+
+    Return:
+        - Regularizer penalty (tf.Variable): Penalty associated with layer weights
+
+    Examples:
+        - structure_type = unstructured: unstructured weight regularisation
+        - structure_type = structured: neuron regularization (group weights by row)
+        - structure_type = pattern: regularization on groups of every n-th weight (e.g. grouping by reuse factor in hls4ml)
+        - structure_type = block: regularisation on blocks within weight matrix (e.g. 4x4, 8x1 for certain SIMD processors)
+
+        - consecutive_patterns is commonly encountered with optimization of BRAM utilization - e.g. while it is true that each
+          DSP pattern consumes one DSP, they likely use less than one BRAM block (e.g. if the BRAM width is 36 bit and weight width is 16)
+          In that case, we need to group several patterns together, so the entire block of patterns can be removed, thus saving DSP and BRAM
+    '''
+
+    def __init__(self, alpha, beta=0, norm=1, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, block_shape=(1, 1),
+                 pattern_offset=1, consecutive_patterns=1):
+        if norm != 1 and norm != 2:
+            raise Exception(f'{self.__class__.__name__} currently supports l1- and l2-based regularization')
+
+        if isinstance(structure_type, str):
+            structure_type = SUPPORTED_STRUCTURES(structure_type)
+
+        if not isinstance(structure_type, SUPPORTED_STRUCTURES):
+            raise Exception(f'{self.__class__.__name__} unknown regularization type')
+
+        self.alpha = alpha
+        self.beta = beta
+        self.norm = norm
+        self.structure_type = structure_type
+        self.block_shape = block_shape
+        self.pattern_offset = pattern_offset
+        self.consecutive_patterns = consecutive_patterns
+
+    @tf.function
+    def __call__(self, weights):
+        if self.structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(weights, ord=self.norm)
+            variance_penalty = self.beta * tf.math.reduce_variance(weights)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(tf.norm(weights, axis=0, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(weights, axis=0), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.PATTERN:
+            # This is equivalent to penalising all the weights processed by the same DSP block in hls4ml.
+            # The matrix is transposed, according to Resource strategy and reshaped into (pattern_offset, pattern_number)
+            # Pattern offset corresponds to the number of patterns is equivalent to RF
+            if (np.prod(weights.shape)) % self.pattern_offset != 0:
+                print(np.prod(weights.shape), self.pattern_offset)
+                raise Exception(f'{self.__class__.__name__}: pattern offset needs to be a factor of matrix size')
+
+            if self.pattern_offset % self.consecutive_patterns != 0:
+                raise Exception(
+                    f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
+
+            # Reshape weight matrix into [number_of_patterns, pattern_offset]
+            number_of_patterns = np.prod(weights.shape) // self.pattern_offset
+            target_shape = (self.pattern_offset, number_of_patterns)
+            reshaped = tf.reshape(tf.transpose(weights), target_shape)
+            # Group consecutive patterns (columns) into blocks and reshape
+            # Docs for the functions to extract blocks are below [block regularization]
+            total_blocks = self.pattern_offset // self.consecutive_patterns
+            blocks = tf.reshape(
+                tf.image.extract_patches(
+                    tf.expand_dims(tf.expand_dims(reshaped, 2), 0),
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, 1, 1, 1], 'SAME'
+                ),
+                (total_blocks, -1)
+            )
+
+            # Calculate penalty
+            sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(blocks, axis=1), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.BLOCK:
+            if (weights.shape[0] % self.block_shape[0]) != 0 or (weights.shape[1] % self.block_shape[1] != 0):
+                raise Exception(f'{self.__class__.__name__}: block sizes need to be fators of weight matrix dimensions')
+
+            # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
+            # This method is commonly used to perform im2col, docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
+            total_blocks = (weights.shape[0] * weights.shape[1]) // (self.block_shape[0] * self.block_shape[1])
+            blocks = tf.reshape(
+                tf.image.extract_patches(
+                    tf.expand_dims(tf.expand_dims(weights, 2), 0),
+                    [1, self.block_shape[0], self.block_shape[1], 1],
+                    [1, self.block_shape[0], self.block_shape[1], 1],
+                    [1, 1, 1, 1], 'SAME'
+                ),
+                (total_blocks, self.block_shape[0] * self.block_shape[1])
+            )
+
+            sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(blocks, axis=1), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+    def get_config(self):
+        return {
+            'alpha': self.alpha,
+            'beta': self.beta,
+            'norm': self.norm,
+            'structure_type': self.structure_type,
+            'block_shape': self.block_shape,
+            'pattern_offset': self.pattern_offset,
+            'consecutive_patterns': self.consecutive_patterns
+        }
+
+
+@tf.keras.utils.register_keras_serializable(name='Conv2DRegularizer')
+class Conv2DRegularizer(tf.keras.regularizers.Regularizer):
+    '''
+    A flexible regulariser for Conv2D layers, simultaneously performing pruning and clustering
+
+    Args:
+        - alpha (float): Sparse penalty; a higher value pushes more weights towards zero
+        - beta (float): Variance penalty; a higer value reduces variance between a group of weights
+        - norm (int): Norm type (l1 or l2)
+        - structure_type (string): Type of regularisation - unstructured, structured, pattern
+        - pattern_offset (int): Length of each pattern if structure_type == pattern
+        - weights (tf.Variable): Four-dimensional layer weight tensor, dimensionality (filter_width x filter_height x n_chan x n_filt)
+
+    Return:
+        - Regularizer penalty (tf.Variable): Penalty associated with layer weights
+
+    Example use cases:
+        - structure_type = unstructured: unstructured weight regularisation
+        - structure_type = structured: filter regularization (group weights of dimensionality filt_width x filt_height x n_chan)
+        - structure_type = pattern: regularization on groups of every n-th weight in flattened array (e.g. grouping by reuse factor in hls4ml)
+    '''
+
+    def __init__(self, alpha, beta=0, norm=1, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, pattern_offset=1,
+                 consecutive_patterns=1):
+        if norm != 1 and norm != 2:
+            raise Exception(f'{self.__class__.__name__} currently supports l1- and l2-based regularization')
+
+        if isinstance(structure_type, str):
+            structure_type = SUPPORTED_STRUCTURES(structure_type)
+
+        if not isinstance(structure_type, SUPPORTED_STRUCTURES):
+            raise Exception(f'{self.__class__.__name__} unknown regularization type')
+
+        # Block pruning is only supported for Dense and QDense layers
+        if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+            raise Exception('Block pruning is supported for 2-dimensional weight matrices')
+
+        self.alpha = alpha
+        self.beta = beta
+        self.norm = norm
+        self.structure_type = structure_type
+        self.pattern_offset = pattern_offset
+        self.consecutive_patterns = consecutive_patterns
+
+    @tf.function
+    def __call__(self, weights):
+        if len(weights.shape) != 4:
+            raise Exception(f'{self.__class__.__name__} regularizes Conv2D layers; weight matrix is not 4-dimensional')
+
+        if self.structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(weights, ord=self.norm)
+            variance_penalty = self.beta * tf.math.reduce_variance(weights)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(tf.reduce_sum(tf.norm(weights, axis=(0, 1), ord='fro'), axis=0),
+                                                  ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(weights, axis=(0, 1, 2)), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.PATTERN:
+            if (np.prod(weights.shape)) % self.pattern_offset != 0:
+                raise Exception(f'{self.__class__.__name__}: pattern offset needs to be a factor of matrix size')
+
+            if self.pattern_offset % self.consecutive_patterns != 0:
+                raise Exception(
+                    f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
+
+            number_of_patterns = np.prod(weights.shape) // self.pattern_offset
+            target_shape = (self.pattern_offset, number_of_patterns)
+            reshaped = tf.reshape(tf.transpose(weights, (3, 0, 1, 2)), target_shape)
+
+            total_blocks = self.pattern_offset // self.consecutive_patterns
+            blocks = tf.reshape(
+                tf.image.extract_patches(
+                    tf.expand_dims(tf.expand_dims(reshaped, 2), 0),
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, 1, 1, 1], 'SAME'
+                ),
+                (total_blocks, -1)
+            )
+
+            sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(blocks, axis=1), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+    def get_config(self):
+        return {
+            'alpha': self.alpha,
+            'beta': self.beta,
+            'norm': self.norm,
+            'structure_type': self.structure_type,
+            'pattern_offset': self.pattern_offset,
+            'consecutive_patterns': self.consecutive_patterns
+        }
\ No newline at end of file
diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/keras/utils.py
new file mode 100644
index 0000000000..4d9111cc96
--- /dev/null
+++ b/hls4ml/optimization/keras/utils.py
@@ -0,0 +1,101 @@
+import numpy as np
+import tensorflow as tf
+
+@tf.function
+def get_model_gradients(model, loss_fn, X, y):
+    '''
+    Calculate model gradients with respect to weights
+
+    Args:
+        - model (keras.model): Input model
+        - loss_fn (keras.losses.Loss): Model loss function
+        - X (np.array): Input data
+        - y (np.array): Output data
+    
+    Return:
+        - grads (dict): Per-layer gradients of loss with respect to weights
+    '''
+    grads = {}
+    # While persistent GradientTape slows down execution, it is faster than performing forward pass and non-persisten GradientTape for every layer
+    with tf.GradientTape(persistent=True) as tape:
+        output = model(X, training=True)
+        loss_value = loss_fn(y, output)
+        
+        for layer in model.layers:
+            if len(layer.trainable_weights) > 0:
+                grads[layer.name] = tape.gradient(loss_value, layer.kernel)
+
+    return grads
+
+@tf.function
+def get_model_hessians(model, loss_fn, X, y):
+    '''
+    Calculate the second derivatives of the loss with repsect to model weights
+    Note, only diagonal elements of the Hessian are computed
+    
+    Args:
+        - model (keras.model): Input model
+        - loss_fn (keras.losses.Loss): Model loss function
+        - X (np.array): Input data
+        - y (np.array): Output data
+    
+    Return:
+        - grads (dict): Per-layer second derivatives of loss with respect to weights
+    '''
+    grads = {}
+    with tf.GradientTape(persistent=True) as tape:
+        output = model(X, training=False)
+        loss_value = loss_fn(y, output)
+        
+        for layer in model.layers:
+            if hasattr(layer, 'kernel'):
+                grads[layer.name] = tape.gradient(tape.gradient(loss_value, layer.kernel), layer.kernel)
+
+    return grads
+
+def get_model_sparsity(model):
+    '''
+    Calculate total and per-layer model sparsity
+
+    Args:
+        - model (keras.model): Model to be evaluated
+    
+    Return:
+        - sparsity (float): Model sparsity, as a percentage of zero weights w.r.t to total number of model weights 
+        - layers (dict): Key-value dictionary; each key is a layer name and the associated value is the layer's sparsity 
+    
+    TODO - Extend support for recurrent layers (reccurent_kernel)
+    '''
+    
+    total_weights = 0
+    zero_weights = 0
+    layer_sparsity = {}
+    
+    for layer in model.layers:
+        if hasattr(layer, 'kernel'):
+          weights = layer.get_weights()[0].flatten()
+          total_weights = total_weights + len(weights)
+          zero_weights = zero_weights + len(weights) - np.count_nonzero(weights)
+          layer_sparsity[layer.name] = 1.0 - np.count_nonzero(weights) / len(weights)
+    
+    try:
+        return zero_weights / total_weights, layer_sparsity
+    except:
+        return 0.0, layer_sparsity
+
+# TODO - Does this work for non-linear models (e.g. skip connections) ?
+def get_last_layer_with_weights(model):
+    '''
+    Finds the last layer with weights
+    The last layer with weights determined the output shape, so, pruning is sometimes not applicable to it
+    As an example, consider a network with 16 - 32 - 5 neurons - the last layer's neuron (5) cannot be removed, as they map to the data labels
+    Args:
+        - model (keras.model): Input model
+    
+    Return:
+        - idx (int): Index location of last layer with params
+    '''
+    for idx, layer in reversed(list(enumerate(model.layers))):
+        if hasattr(layer, 'kernel'):
+            return idx
+    return len(model.layers)
\ No newline at end of file
diff --git a/test/pytest/optimization/keras/test_regularizers.py b/test/pytest/optimization/keras/test_regularizers.py
new file mode 100644
index 0000000000..377d986365
--- /dev/null
+++ b/test/pytest/optimization/keras/test_regularizers.py
@@ -0,0 +1,151 @@
+import pytest
+import numpy as np
+import tensorflow as tf
+from qkeras import QDense, QConv2D
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D, Flatten
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.regularizers import DenseRegularizer, Conv2DRegularizer
+
+# Constants
+pattern_offset = 4
+block_shape = (4, 4)
+consecutive_patterns = 2
+
+dense_layers = [Dense, QDense]
+conv2d_layers = [Conv2D, QConv2D]
+
+
+# Sets the loss due to data to zero; train model only on regularization loss
+def zero_loss(y_true, y_pred):
+    return tf.reduce_mean(0 * tf.square(y_true - y_pred), axis=-1)
+
+
+# Helper function, calculates the group norm and variance for a single layer
+def get_norm_and_variance(weights, structure_type, layer='dense'):
+    if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+        norm = np.linalg.norm(weights.flatten(), ord=1)
+        var = np.var(weights)
+        return norm, var
+
+    if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+        if layer == 'conv2d':
+            norm = np.linalg.norm(np.sum(np.linalg.norm(weights, axis=(0, 1), ord='fro'), axis=0), ord=1)
+            var = np.linalg.norm(np.var(weights, axis=(0, 1, 2)), ord=1)
+        else:
+            norm = np.linalg.norm(np.linalg.norm(weights, axis=0, ord=2), ord=1)
+            var = np.linalg.norm(np.var(weights, axis=0), ord=1)
+
+        return norm, var
+
+    if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+        if layer == 'dense':
+            weights = weights.T
+        elif layer == 'conv2d':
+            weights = np.transpose(weights, (3, 0, 1, 2))
+
+        number_of_patterns = np.prod(weights.shape) // pattern_offset
+        target_shape = (pattern_offset, number_of_patterns)
+        reshaped = np.reshape(weights, target_shape)
+        total_blocks = pattern_offset // consecutive_patterns
+        blocks = np.reshape(
+            tf.image.extract_patches(
+                np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                [1, consecutive_patterns, number_of_patterns, 1],
+                [1, consecutive_patterns, number_of_patterns, 1],
+                [1, 1, 1, 1], 'SAME'
+            ).numpy(),
+            (total_blocks, -1)
+        )
+        norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
+        var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
+        return norm, var
+
+    if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+        total_blocks = (weights.shape[0] * weights.shape[1]) // (block_shape[0] * block_shape[1])
+        blocks = np.reshape(
+            tf.image.extract_patches(
+                np.expand_dims(np.expand_dims(weights, 2), 0),
+                [1, block_shape[0], block_shape[1], 1],
+                [1, block_shape[0], block_shape[1], 1],
+                [1, 1, 1, 1], 'SAME'
+            ).numpy(),
+            (total_blocks, block_shape[0] * block_shape[1])
+        )
+
+        norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
+        var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
+        return norm, var
+
+
+@pytest.mark.parametrize('dense', dense_layers)
+@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED,
+                                            SUPPORTED_STRUCTURES.PATTERN, SUPPORTED_STRUCTURES.BLOCK])
+def test_dense_regularizer(structure_type, dense):
+    epochs = 10
+    data_points = 10
+    input_shape = (32,)
+    output_shape = (16,)
+    X = np.random.rand(data_points, *input_shape)
+    y = np.random.rand(data_points, *output_shape)
+    w = np.random.rand(input_shape[0], output_shape[0])
+
+    # First, fit a model without regularization
+    model = Sequential()
+    model.add(dense(output_shape[0], input_shape=input_shape))
+    dense_weights = model.layers[0].get_weights()
+    dense_weights[0] = w
+    model.layers[0].set_weights(dense_weights)
+    model.compile(loss=zero_loss, optimizer=Adam(1.0))
+    model.fit(X, y, epochs=epochs)
+    norm, var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type)
+
+    # Now, fit a model with strong regularization, starting with the same initial weights
+    dense_weights = model.layers[0].get_weights()
+    dense_weights[0] = w
+    model.layers[0].set_weights(dense_weights)
+    regularizer = DenseRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset,
+                                   block_shape=block_shape)
+    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
+    model.compile(loss=zero_loss, optimizer=Adam(1.0))
+    model.fit(X, y, epochs=epochs)
+    reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type)
+
+    # Verify regularization decreased weight magnitude and variance
+    assert (reg_norm < norm)
+    assert (reg_var < var)
+
+
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED,
+                                            SUPPORTED_STRUCTURES.PATTERN])
+def test_conv2d_regularizer(structure_type, conv2d):
+    epochs = 10
+    data_points = 10
+    input_shape = (16, 16, 3)
+    num_filters = 4
+    X = np.random.rand(data_points, *input_shape)
+    y = np.random.rand(data_points, 1)
+
+    # First, fit a model without regularization
+    model = Sequential()
+    model.add(conv2d(num_filters, (3, 3), input_shape=input_shape))
+    model.add(Flatten())
+    model.add(Dense(1))
+    conv_weights = model.layers[0].get_weights()
+    model.compile(loss=zero_loss, optimizer=Adam())
+    model.fit(X, y, epochs=epochs, verbose=True)
+    norm, var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
+
+    # Now, fit a model with strong regularization, starting with the same initial weights
+    model.layers[0].set_weights(conv_weights)
+    regularizer = Conv2DRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset)
+    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
+    model.compile(loss=zero_loss, optimizer=Adam())
+    model.fit(X, y, epochs=epochs, verbose=True)
+    reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
+
+    # Verify regularization decreased weight magnitude and variance
+    assert (reg_norm < norm)
+    assert (reg_var < var)

From de51797bd7a9e58fce14fb2cca75997e75970b68 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 15:41:05 +0300
Subject: [PATCH 04/19] Knapsack solver & unstructured pruning objective

---
 hls4ml/optimization/knapsack.py             | 247 ++++++++++++++++++++
 hls4ml/optimization/objectives/__init__.py  | 122 ++++++++++
 test/pytest/optimization/test_knapsack.py   |  47 ++++
 test/pytest/optimization/test_objectives.py |  42 ++++
 4 files changed, 458 insertions(+)
 create mode 100644 hls4ml/optimization/knapsack.py
 create mode 100644 hls4ml/optimization/objectives/__init__.py
 create mode 100644 test/pytest/optimization/test_knapsack.py
 create mode 100644 test/pytest/optimization/test_objectives.py

diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/knapsack.py
new file mode 100644
index 0000000000..5b31212da5
--- /dev/null
+++ b/hls4ml/optimization/knapsack.py
@@ -0,0 +1,247 @@
+import sys
+import time
+import numpy as np
+
+
+def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs):
+    '''
+    A function for solving the Knapsack problem
+
+    Args:
+        - values (np.array, float): A one-dimensional array, where each entry is the value of an item
+        - weights (np.array, int): An matrix, each row represents the weights of every item, in a given knapsack
+        - capacity (np.array, int): A one-dimensional array, each entry is the maximum weights of a Knapsack
+        - implementation (string): Algorithm to solve Knapsack problem - dynamic programming, greedy, branch and bound
+
+    Kwargs:
+        - time_limit (float): Limit (in seconds) after which the CBC or Branch & Bound should stop looking for a solution and return optimal so far
+        - scaling_factor (float): Scaling factor for floating points values in CBC or Branch & Bound (see function for details)
+
+    Return:
+        - optimal_value (float): The optimal values of elements in the knapsack
+        - selected_items (list): A list of indices, corresponding to the selected elements
+
+    Notes:
+        - The general formulation of the Knapsack problem for N items and M knapsacks is:
+                max v.T @ x
+                s.t. A @ x <= W
+                v ~ (N, 1) x ~ (N, 1) A ~ (M, N) W ~ (M, 1)
+                x_{i, j} = {0, 1} and <= is the generalized, element-wise inequlaity for vectors
+
+        - Supported implementations:
+            - Dynamic programming:
+                - Optimal solution
+                - Time complexity: O(nW)
+                - Suitable for single-dimensional constraints and a medium number of items, with integer weights
+            - Branch and bound:
+                - Optimal
+                - Solved using Google OR-Tools
+                - Suitable for multi-dimensional constraints and a large number of items
+            - Branch and bound:
+                - Solution sub-optimal, but often better than greeedy
+                - Solved using Google OR-Tools, with the CBC MIP Solver
+                - Suitable for multi-dimensional constraints and a very high number of items
+            - Greedy:
+                - Solution sub-optimal
+                - Time complexity: O(mn)
+                - Suitable for highly dimensional constraints or a very high number of items
+
+        - Most implementations require integer values of weights and capacities; for pruning & weight sharing this is never a problem
+            In case non-integer weights and capacities are requires, all of the values should be scaled by an appropriate scaling factor
+    '''
+    if implementation not in ('dynamic', 'greedy', 'branch_bound', 'CBC_MIP'):
+        raise Exception('Unknown algorithm for solving Knapsack')
+
+    if len(values.shape) != 1:
+        raise Exception('Current implementations of Knapsack optimization support single-objective problems. \
+                        Values must be one-dimensional')
+
+    if len(weights.shape) != 2:
+        raise Exception('Current implementation of Knapsack assumes weight vector is 2-dimensional, to allow for multi-dimensional Knapsack problem. \
+                        If solve a one-dimensional Knapsack problem, extend dimensions of weights to a one-row matrix')
+
+    if values.shape[0] != weights.shape[1]:
+        raise Exception('Uneven number of items and weights')
+
+    if not (np.all(values >= 0) and np.all(weights >= 0)):
+        raise Exception('Current implementation of Knapsack problem requires non-negative values and weights')
+
+    if not np.all(np.equal(np.mod(capacity, 1), 0)) or not np.all(np.equal(np.mod(weights, 1), 0)):
+        raise Exception('Current implementation of Knapsack problem requires integer weights and capacities')
+
+    print(f'Starting to solve Knapsack problem with {values.shape[0]} variables and a weight constraint of {capacity}')
+    start = time.time()
+
+    # Special case, empty list
+    if values.shape[0] == 0:
+        return 0, []
+
+    # Special case, the sum of all weights is less than al the capacity constraints
+    if np.all(np.sum(weights, axis=1) <= capacity):
+        return np.sum(values), list(range(0, values.shape[0]))
+
+    # Special case, all the item weights per knapsack are equal, so we can greedily select the ones with the highest value
+    if np.all([weights[i, :] == weights[i, 0] for i in range(weights.shape[0])]):
+        return __solve_knapsack_equal_weights(values, weights, capacity)
+
+    # General cases
+    if implementation == 'dynamic':
+        if weights.shape[0] == 1:
+            optimal_value, selected_items = __solve_1d_knapsack_dp(values, weights[0], capacity[0])
+        else:
+            raise Exception('Solving Knapsack with dynamic programming requires single-dimensional constraints')
+    elif implementation == 'branch_bound':
+        optimal_value, selected_items = __solve_knapsack_branch_and_bound(values, weights, capacity, **kwargs)
+    elif implementation == 'CBC_MIP':
+        optimal_value, selected_items = __solve_knapsack_cbc_mip(values, weights, capacity, **kwargs)
+    else:
+        optimal_value, selected_items = __solve_knapsack_greedy(values, weights, capacity)
+
+    print(f'Time taken to solve Knapsack {time.time() - start}s')
+    return optimal_value, selected_items
+
+
+def __solve_1d_knapsack_dp(values, weights, capacity):
+    '''
+    Helper function to solve the 1-dimensional Knapsack problem exactly through dynamic programming
+    The dynamic programming approach is only suitable for one-dimensional weight constraints
+    Furthermore, it has a high computational complexity and it is not suitable for highly-dimensional arrays
+    NOTE: The weights and corresponding weight constraint need to be integers; if not, the they should be scaled and rounded beforehand
+    '''
+    assert (len(weights.shape) == 1)
+
+    # Build look-up table in bottom-up approach
+    N = values.shape[0]
+    K = [[0 for w in range(capacity + 1)] for i in range(N + 1)]
+    for i in range(1, N + 1):
+        for w in range(1, capacity + 1):
+            if weights[i - 1] <= w:
+                K[i][w] = max(values[i - 1] + K[i - 1][w - weights[i - 1]], K[i - 1][w])
+            else:
+                K[i][w] = K[i - 1][w]
+
+    # Reverse Knapsack to find selected groups
+    i = N
+    w = capacity
+    res = K[N][capacity]
+    selected = []
+    while i >= 0 and res > 0:
+        if res == K[i - 1][w]:
+            pass
+        else:
+            selected.append(i - 1)
+            res = res - values[i - 1]
+            w = w - weights[i - 1]
+        i = i - 1
+
+    return K[N][capacity], selected
+
+
+def __solve_knapsack_greedy(values, weights, capacity):
+    '''
+    Helper function that solves the n-dimensional Knapsack algorithm with a greedy algorithm
+    The greedy approach should only be used for problems with many items or highly dimensional weights
+    The solution can [and often will] be sub-optimal; otherwise, dynamic programming, branch & bound etc. should be used
+    '''
+
+    # For each item, calculate the value per weight ratio (this can be thought of as item efficiency)
+    # The weights are scaled for every dimension, to avoid inherent bias towards large weights in a single dimension
+    weights_rescaled = weights / np.max(weights, axis=1)[:, np.newaxis]
+    ratios = values / weights_rescaled.sum(axis=0)
+    indices = np.argsort(ratios)
+
+    # Greedily select item with the highest ratio (efficiency)
+    optimal = 0
+    selected = []
+    accum = np.zeros_like(capacity)
+    for i in reversed(indices):
+        if np.all((accum + weights[:, i]) <= capacity):
+            selected.append(i)
+            optimal += values[i]
+            accum += weights[:, i]
+        else:
+            break
+
+    # The greedy algorithm can be sub-optimal; however, selecting the above elements or the next element that could not fit into the knapsack
+    # Will lead to solution that is at most (1/2) of the optimal solution; therefore, take whichever is higher and satisfies the constraints
+    if values[i] > optimal and np.all(weights[:, i]) <= capacity:
+        return values[i], [i]
+    else:
+        return optimal, selected
+
+
+def __solve_knapsack_branch_and_bound(values, weights, capacity, time_limit=sys.float_info.max, scaling_factor=10e4):
+    '''
+    Helper function to solve Knapsack problem using Branch and Bound; implemented using Google OR-Tools [weights & capacities need to be integers]
+    The algorithm explores the search space (a tree of all the posible combinations, 2^N nodes), but discards infeasible & sub-optimal solutions
+
+    Additional args:
+        - time_limit - Time limit (seconds) after which Branch & Bound search should stop and return a sub-optimal solution
+        - scaling_factor - Factor to scale floats in values arrays; OR-Tools requires all values & weights to be integers; so all of the values are scaled by a large number
+    '''
+    # TODO - Should we make ortools a hard requirement in setup.cfg?
+    # If so, need to see if there are dependencies issues, when I installed a hard requirement was a protobuf version incompatible with tensorboard (MacOS X)
+    # Installed newer version of protobuf and then downgraded (to work with tensorboard) and ortools worked correctly
+    try:
+        from ortools.algorithms import pywrapknapsack_solver
+    except ModuleNotFoundError:
+        raise Exception('OR-Tools not found. Please insteal Google OR-Tools from pip.')
+
+    solver = pywrapknapsack_solver.KnapsackSolver(
+        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER, 'BB')
+    solver.set_time_limit(time_limit)
+    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(),
+                capacity.astype(int).tolist())
+    optimal = solver.Solve()
+    selected = [i for i in range(values.shape[0]) if solver.BestSolutionContains(i)]
+    return optimal / scaling_factor, selected
+
+
+def __solve_knapsack_cbc_mip(values, weights, capacity, time_limit=sys.float_info.max, scaling_factor=10e4):
+    '''
+    Helper function to solve Knapsack problem using the CBC MIP solver using Google OR-Tools [weights & capacities need to be integers]
+
+    Additional args:
+        - time_limit - Time limit (seconds) after which CBC solver should stop and return a sub-optimal solution
+        - scaling_factor - Factor to scale floats in values arrays; OR-Tools requires all values & weights to be integers; so all of the values are scaled by a large number
+    '''
+    try:
+        from ortools.algorithms import pywrapknapsack_solver
+    except ModuleNotFoundError:
+        raise Exception('OR-Tools not found. Please insteal Google OR-Tools from pip.')
+
+    solver = pywrapknapsack_solver.KnapsackSolver(
+        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_CBC_MIP_SOLVER, 'CBC')
+    solver.set_time_limit(time_limit)
+    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(),
+                capacity.astype(int).tolist())
+    optimal = solver.Solve()
+    selected = [i for i in range(values.shape[0]) if solver.BestSolutionContains(i)]
+    return optimal / scaling_factor, selected
+
+
+def __solve_knapsack_equal_weights(values, weights, capacity):
+    '''
+    Helper function that solves the n-dimensional Knapsack algorithm with a greedy algorithm
+    The assumption is that all the items have the same weight; while this seems a bit artificial
+    It occurs often in pruning - e.g. in pattern pruning, each DSP block saves one DSP; however, as a counter-example
+    In structured pruning, each structure can save a different amount of FLOPs (Conv2D filter vs Dense neuron)
+    '''
+    assert (np.all([weights[i, :] == weights[i, 0] for i in range(weights.shape[0])]))
+
+    # Find items with the highest value
+    indices = np.argsort(values)
+
+    # Greedily select item with the highest ratio
+    optimal = 0
+    selected = []
+    accum = np.zeros_like(capacity)
+    for i in reversed(indices):
+        if np.all((accum + weights[:, i]) <= capacity):
+            selected.append(i)
+            optimal += values[i]
+            accum += weights[:, i]
+        else:
+            break
+
+    return optimal, selected
diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/objectives/__init__.py
new file mode 100644
index 0000000000..fc3a4c618e
--- /dev/null
+++ b/hls4ml/optimization/objectives/__init__.py
@@ -0,0 +1,122 @@
+import logging
+import numpy as np
+from abc import ABC, abstractmethod
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.attributes import OptimizationAttributes
+
+'''
+Pruning & weight sharing are formulated as an optimization problem, with the aim of minimising some metric
+Metrics can include: total number of weights, DSP utilization, latency, FLOPs etc.
+'''
+
+class ObjectiveEstimator(ABC):
+    '''
+    Abstract class with methods for estimating the utilization and savings of a certain layer, with respect to some objective
+    For each objective, an inherited class is written with the correct implementaton of the below methods
+    The objectives can be multi-dimensional, e.g. DSPs and BRAM [care needs to be taken when optimizing several objectives, especially if conflicting]
+    '''
+
+    @abstractmethod
+    def is_layer_optimizable(self, layer_attributes):
+        '''
+        For a given layer, checks whether optimizations make sense, with respect to the given objective(s)
+        Furthermore, it returns the type of optimization (structured, unstructured etc.) most suitable for minimising the objective(s).
+        
+        Args:
+            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+        
+        Return:
+            - optimizable (boolean) - can optimizations be applied to this layer
+            - optimization_attributes (hls4ml.optimiation.attributes.OptimizationAttributes) - most suitable approach for optimization
+        
+        Examples: 
+            - Metric = Total weights, Layer = Dense, shape = (4, 4) -> return True, unstructured 
+            - Metric = DSP, Layer = Dense, Precision = ap_fixed<8, 0> -> return False (Vivado doesn't use DSP when precision < 9)
+            - Metric = DSP, Layer = Dense, Precision = ap_fixed<16, 6> -> return True, pattern structure, both pruning and weight sharing
+        '''
+        pass
+
+    @abstractmethod
+    def layer_resources(self, layer_attributes):
+        '''
+        For a given layer, how many units of the metric are used, given a generic weight matrix
+        
+        Args:
+            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+        
+        Return:
+            - resources (list, int) - total resources (w.r.t every dimension of the objective) used
+                
+        Example: Metric = Total weights, Layer = Dense, shape = (4, 4) -> return [16] [regardless of layer sparsity]
+        '''
+        pass
+    
+    @abstractmethod
+    def layer_savings(self, layer_attributes):
+        '''
+        For a given layer, how many units of the metric are saved, when optimizing one structure 
+        The structure type, alongside its parameters (e.g. block shape) are stored in layer attributes
+        For best results, OptimizationAttributes in layer_attribtues should be obtained from is_layer_optimizable
+        
+        Args:
+            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+        
+        Return:
+            - savings (list, int) - savings achieved (one for every dimenson of objective) with OptimizationAttributes from layer_attributes
+        
+        Example: Metric = Total weights, Layer = Dense, shape = (4, 4):
+            - structure_type == unstructured -> return [1]
+            - structure_type == structured -> return [4] 
+        '''
+        pass
+    
+'''
+A class containing objective estimation with the goal of minimizing the number of non-zero weights in a layer [corresponds to unstructured pruning]
+'''
+class ParameterEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+        else:
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+    
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            return [np.prod(layer_attributes.weight_shape)]
+    
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        
+        structure_type = layer_attributes.optimization_attributes.structure_type 
+        pruning = layer_attributes.optimization_attributes.pruning
+        weight_sharing = layer_attributes.optimization_attributes.weight_sharing
+
+        if weight_sharing:
+            logging.warn('Weight sharing does not decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]
+        
+        if not pruning:
+            logging.warn('Pruning needs to be enabled to decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]
+
+        # In this case, pruning = True and weight_sharing = False, so calculate savings incurred by removing a group of weights
+        if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+            return [1]
+        elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            if 'Dense' in layer_attributes.layer_type.__name__ or 'Conv2D' in layer_attributes.layer_type.__name__:
+                return [np.prod(layer_attributes.weight_shape[:-1])]
+            else:
+                raise Exception('Unknown layer encountered when estimating parameter savings.')
+        elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+            number_of_patterns = np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+            return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns]
+        elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+            return [np.prod(layer_attributes.optimization_attributes.block_shape)]
diff --git a/test/pytest/optimization/test_knapsack.py b/test/pytest/optimization/test_knapsack.py
new file mode 100644
index 0000000000..6c985688c4
--- /dev/null
+++ b/test/pytest/optimization/test_knapsack.py
@@ -0,0 +1,47 @@
+import pytest
+import numpy as np
+from hls4ml.optimization.knapsack import solve_knapsack
+
+# In the simple case below, both implementations give the optimal answer
+# In general, the greedy algorithm will not give the optimal solution
+@pytest.mark.parametrize('implementation', ['dynamic', 'greedy', 'branch_bound', 'CBC_MIP'])
+def test_knapsack_1d(implementation):
+    values = np.array([4, 5, 6, 8, 3])
+    weights = np.array([[2, 5, 3, 2, 5]])
+    capacity = np.array([8])
+
+    optimal, selected = solve_knapsack(values, weights, capacity, implementation=implementation)
+    assert(optimal == 18)
+    assert(0 in selected)
+    assert(2 in selected)
+    assert(3 in selected)
+
+@pytest.mark.parametrize('implementation', ['greedy', 'branch_bound', 'CBC_MIP'])
+def test_multidimensional_knapsack(implementation):
+    values = np.array([10, 2, 6, 12, 3])
+    weights = np.array([[3, 1, 4, 5, 5], [3, 2, 4, 1, 2]])
+    capacity = np.array([8, 7])
+
+    optimal, selected = solve_knapsack(values, weights, capacity, implementation=implementation)
+    assert(optimal == 22)
+    assert(0 in selected)
+    assert(3 in selected)
+
+def test_knapsack_equal_weights():
+    values = np.array([10, 2, 6, 8, 3])
+    weights = np.array([[2, 2, 2, 2, 2], [3, 3, 3, 3, 3]])
+    capacity = np.array([7, 7])
+
+    optimal, selected = solve_knapsack(values, weights, capacity)
+    assert(optimal == 18)
+    assert(0 in selected)
+    assert(3 in selected)
+
+def test_knapsack_all_elements_fit():
+    values = np.array([10, 2, 6, 12, 3])
+    weights = np.array([[3, 1, 4, 5, 5], [3, 2, 4, 1, 2]])
+    capacity = np.array([19, 12])
+
+    optimal, selected = solve_knapsack(values, weights, capacity)
+    assert(optimal == 33)
+    assert(selected == list(range(0, values.shape[0])))
diff --git a/test/pytest/optimization/test_objectives.py b/test/pytest/optimization/test_objectives.py
new file mode 100644
index 0000000000..2991b1c3fc
--- /dev/null
+++ b/test/pytest/optimization/test_objectives.py
@@ -0,0 +1,42 @@
+import numpy as np
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D, Flatten
+from hls4ml.optimization.objectives import ParameterEstimator
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
+
+# Test attempts to verify one of the estimators (parameter) is correctly declared, the functions are static etc. 
+def test_parameter_objective():
+    # Model parameters
+    dense_units = 16
+    conv_filters = 6
+    conv_channels = 3
+    conv_shape = (3, 3)
+    input_shape = (8, 8)
+
+    model = Sequential()
+    model.add(Conv2D(conv_filters, input_shape=(*input_shape, conv_channels), kernel_size=conv_shape, name='conv2d', padding='same', kernel_initializer='ones'))
+    model.add(Flatten(name='flatten'))
+    model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
+    model_attributes = get_attributes_from_keras_model(model)
+
+    # Identify optimizable layers and the suitable structure
+    for layer in model.layers:
+        optimizable, optimization_attributes = ParameterEstimator.is_layer_optimizable(model_attributes[layer.name])
+        model_attributes[layer.name].optimizable = optimizable
+        model_attributes[layer.name].optimization_attributes = optimization_attributes
+
+    # Verify conv2d and dense are optimizable, flatten is not
+    assert(model_attributes['conv2d'].optimizable)
+    assert(not model_attributes['flatten'].optimizable)
+    assert(model_attributes['dense'].optimizable)
+
+    # Verify layer resources (number of parameters) are correct
+    assert([conv_filters * conv_channels * np.prod(conv_shape)] == ParameterEstimator.layer_resources(model_attributes['conv2d']))
+    assert([0] == ParameterEstimator.layer_resources(model_attributes['flatten']))
+    assert([conv_filters * np.prod(input_shape) * dense_units] == ParameterEstimator.layer_resources(model_attributes['dense']))
+
+    # Verify layer savings are correct - is_layer_optimizable should have returned UNSTRUCTURED as the pruning type
+    # Since it wasn't overwritten, each pruning step saves one parameter
+    assert([1] == ParameterEstimator.layer_savings(model_attributes['conv2d']))
+    assert([0] == ParameterEstimator.layer_savings(model_attributes['flatten']))
+    assert([1] == ParameterEstimator.layer_savings(model_attributes['dense']))

From a49a1134d0735f8a3dce8045b11b764d2714f869 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 16:10:21 +0300
Subject: [PATCH 05/19] Keras optimization masking and weight removal logic

---
 hls4ml/optimization/keras/masking.py          | 729 ++++++++++++++++++
 .../pytest/optimization/keras/test_masking.py | 447 +++++++++++
 .../optimization/keras/test_weight_sharing.py | 159 ++++
 3 files changed, 1335 insertions(+)
 create mode 100644 hls4ml/optimization/keras/masking.py
 create mode 100644 test/pytest/optimization/keras/test_masking.py
 create mode 100644 test/pytest/optimization/keras/test_weight_sharing.py

diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/keras/masking.py
new file mode 100644
index 0000000000..50052e7297
--- /dev/null
+++ b/hls4ml/optimization/keras/masking.py
@@ -0,0 +1,729 @@
+import sys
+import logging
+import numpy as np
+import tensorflow as tf
+from qkeras import QDense, QConv2D
+from tensorflow.keras.layers import Dense, Conv2D
+from hls4ml.optimization.knapsack import solve_knapsack
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS
+
+
+def get_model_masks(keras_model, model_attributes, sparsity, objective, metric='l1', local=False, gradients=None,
+                    hessians=None, knapsack_solver='CBC_MIP'):
+    '''
+    Function calculating a binary mask for all optimizable layers
+    Entries equal to one correspond to the weight being updated during the training
+    Entries equal to zero correspond to the weight being frozen during the training
+
+    Masking is such that:
+        * resource_utilization <= (1 - sparsity) * baseline_utilization OR
+        * resource_saving > sparsity * baseline_utilization [equivalent formulation]
+
+    Offsets are used for weight sharing - in the case of weight sharing, the mask is set to zero
+    Therefore, the weights will be frozen during training; however, they still need to be the mean of the group
+    Offsets represent the mean of each weight-shared group - therefore, it is important to have offsets only for
+    frozen weights; that is where the corresponding entry in the mask tensor is zero
+
+    If a layer supports both weight sharing and pruning, both the norm and variance of the group are calculated
+    And the smaller one is considered; so if the norm is smaller, the group will be considered for pruning
+    Otherise, the group will be considered for weight sharing. Both the norm and variance are normalized, to avoid magnitude biases
+
+    Args:
+        - keras_model (keras.model) - Model to be masked
+        - model_attributes (dict) - A layer-wise dictionary of LayerAttributes classes
+        - sparsity (float) - Desired sparsity, with respect to the objective
+        - objective (ObjectiveEstimator) - Objective to be minimized (e.g. DSP, FLOPs etc.)
+        - metric (string) - Weight ranking metric - l1, l2, Oracle, saliency
+        - local (boolean) - Equal layer-wise sparsity
+        - gradients (dict) - A layer-wise dictionary of weight gradients, needed for Oracle ranking
+        - hessians (dict) - A layer-wse dictionary of second gradients, needed for saliency ranking
+        - knapsack_solver (str) - Algorithm for solving Knapsack problem; recommended is to use default, unless dealing with highly dimensional problems, in which case greedy is better
+
+    Return:
+        - masks (dict) - Layer-wise dictionary of binary tensors
+        - offsets (dict) - Layer-wise dictionary of offsets for every weight
+    '''
+
+    if metric not in SUPPORTED_METRICS:
+        raise Exception('Unknown metric for ranking weights')
+
+    if metric == 'oracle' and gradients is None:
+        raise Exception('Oracle ranking requires the gradient of the loss with respect to model weights')
+
+    if metric == 'saliency' and hessians is None:
+        raise Exception('Saliency ranking requires second order derivatives')
+
+    if local:
+        return __get_masks_local(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians,
+                                 knapsack_solver)
+    else:
+        return __get_masks_global(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians,
+                                  knapsack_solver)
+
+
+def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver):
+    '''
+    Function calculating a layer-wise binary mask for all optimizable layers
+    This function performs layer-wise masking, so all layers have the same sparsity (with respect to the objective)
+    '''
+    masks = {}
+    offsets = {}
+
+    for layer in keras_model.layers:
+        # Care needs to be taken if layer_savings = 0
+        # As long as the default attributes are used (from is_layer_optimizable(...)), this should never happen
+        # However, if the optimization attributes are manually changed it would be possible to select the structure type such that savings = 0
+        # In this case, the goal is to keep resource utilization under a certain threshold; and savings are equivalent to resources per single group
+        # Therefore, in the knapsack solver, zero-saving would remain unmasked; as it has a "weight" of zero, it is always stored in the knapsack
+        # So not masking a group without saving is as expected; however, if solved through greedy knaspack an exception will be thrown (division by zero)
+        if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+            layer_savings = objective.layer_savings(model_attributes[layer.name])
+            layer_resources = objective.layer_resources(model_attributes[layer.name])
+            target_resources = ((1 - sparsity) * np.array(layer_resources)).astype(int)
+            structure_type = model_attributes[layer.name].optimization_attributes.structure_type
+
+            # All the groups (structures, patterns, blocks) have the same resource utilisation in one layer
+            # So we can greedily prune the groups with the lowest "loss" (magnitude, saliency etc.)
+            # Greedily pruning the groups with the lowest loss is a special case of the Knapsack problem with equal weights
+            value = layer.get_weights()[0]
+            if metric == 'oracle':
+                value = np.abs(np.multiply(value, gradients[layer.name]))
+                norm = 1
+            elif metric == 'saliency':
+                value = np.multiply(np.square(value), hessians[layer.name])
+                norm = 1
+            elif metric == 'l1':
+                norm = 1
+            else:
+                norm = 2
+
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    logging.warn('Weight sharing not suitable for unstructured pruning. Ignoring....')
+
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    # Since no norm is taken, calculate absolute value [avoids removing large negative weights]
+                    value = np.abs(value)
+
+                    # Get all posible indices in the weight tensor
+                    indices = np.indices(value.shape).reshape(value.ndim, -1).T
+
+                    # Find weights with the lowest loss
+                    groups = []
+                    for i in indices:
+                        groups.append(__WeightGroups__(value[tuple(i)], layer_savings, tuple(i)))
+                    _, selected = solve_knapsack(
+                        np.array([g.value for g in groups]),
+                        np.array([g.resources for g in groups]).T,
+                        target_resources,
+                        implementation=knapsack_solver
+                    )
+
+                    # Selected weights are not masked
+                    mask = np.zeros(value.shape, dtype=value.dtype)
+                    for i in selected:
+                        mask[groups[i].layer_position] = 1
+
+                    # Offsets are always zero (weight sharing not applicable to unstructured)
+                    masks[layer.name] = mask
+                    offsets[layer.name] = np.zeros(value.shape, dtype=value.dtype)
+
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                # Dense -> Masking neurons (columns)
+                if isinstance(layer, (Dense, QDense)):
+                    # If pruning enabled, find cost associated with pruning each neuron
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(value, axis=0, ord=norm)
+                    else:
+                        vals_norm = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(value, axis=0)
+                    else:
+                        vals_var = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    # Choose min(pruning, weight sharing)
+                    groups = []
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                        else:
+                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                    # Select neurons with the lowest loss
+                    _, selected = solve_knapsack(
+                        np.array([g.value for g in groups]),
+                        np.array([g.resources for g in groups]).T,
+                        target_resources,
+                        implementation=knapsack_solver
+                    )
+
+                    # Selected neurons are not masked
+                    mask = np.zeros(value.shape, value.dtype)
+                    for i in selected:
+                        mask[:, groups[i].layer_position] = 1
+
+                    # Masked neurons can either be pruned or quantized
+                    # If quantized, add the corresponding offset
+                    offset = np.zeros(value.shape, value.dtype)
+                    zeros = np.where(~np.all(mask, axis=0))[0]
+                    for i in zeros:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, i] = np.mean(layer.get_weights()[0][:, i])
+                    masks[layer.name] = mask
+                    offsets[layer.name] = offset
+
+                # Conv2D -> Masking filters (W x H x C)
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0, ord=norm)
+                    else:
+                        vals_norm = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0)
+                    else:
+                        vals_var = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    groups = []
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                        else:
+                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                    _, selected = solve_knapsack(
+                        np.array([g.value for g in groups]),
+                        np.array([g.resources for g in groups]).T,
+                        target_resources,
+                        implementation=knapsack_solver
+                    )
+
+                    mask = np.zeros(value.shape, value.dtype)
+                    for i in selected:
+                        mask[:, :, :, groups[i].layer_position] = 1
+
+                    offset = np.zeros(value.shape, value.dtype)
+                    zeros = np.where(~np.all(mask, axis=(0, 1, 2)))[0]
+                    for i in zeros:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, :, :, i] = np.mean(layer.get_weights()[0][:, :, :, i])
+
+                    masks[layer.name] = mask
+                    offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = model_attributes[layer.name].optimization_attributes.consecutive_patterns
+
+                if (np.prod(value.shape)) % pattern_offset != 0:
+                    raise Exception('Pattern offset needs to be a factor of matrix size')
+
+                if pattern_offset % consecutive_patterns != 0:
+                    raise Exception('Consecutive patterns need to be a factor of matrix size')
+
+                # Transpose, as done in hls4ml Resource strategy
+                if isinstance(layer, (Dense, QDense)):
+                    value = value.T
+                    transposed_shape = value.shape
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    value = np.transpose(value, axes=[3, 0, 1, 2])
+                    transposed_shape = value.shape
+
+                # Reshape weight matrix into [number_of_patterns, pattern_offset]
+                # Note, swapping the axis will mess up the weight order
+                # In the case of hls4ml, number_of_patterns is equivalent to reuse factor
+                # And, pattern_offset, is the number of multiplications done in parallel
+                number_of_patterns = np.prod(transposed_shape) // pattern_offset
+                target_shape = (pattern_offset, number_of_patterns)
+                reshaped = np.reshape(value, target_shape)
+
+                # Group consecutive patterns (rows) into blocks and reshape
+                total_blocks = pattern_offset // consecutive_patterns
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, 1, 1, 1], 'SAME'
+                    ).numpy(), (total_blocks, -1)
+                )
+
+                # If pruning enabled, find cost associated with pruning each neuron
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # Choose min(pruning, weight sharing)
+                groups = []
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                    else:
+                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                # Select groups with highest importance
+                _, selected = solve_knapsack(
+                    np.array([g.value for g in groups]),
+                    np.array([g.resources for g in groups]).T,
+                    target_resources,
+                    implementation=knapsack_solver
+                )
+
+                # Decode masked groups into transposed shape and set selected groups to one
+                mask = np.zeros((np.prod(transposed_shape),), value.dtype)
+                for i in selected:
+                    pos = i * number_of_patterns * consecutive_patterns
+                    for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                        mask[range(j, j + number_of_patterns)] = 1
+
+                        # Decode offset
+                not_selected = [i for i in range(len(groups)) if i not in selected]
+                offset = np.zeros((np.prod(transposed_shape),), value.dtype)
+                for i in not_selected:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[i, :])
+                        pos = i * number_of_patterns * consecutive_patterns
+                        for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                            offset[range(j, j + number_of_patterns)] = mean
+
+                            # Reshape into original shape and store result
+                if isinstance(layer, (Dense, QDense)):
+                    mask = np.reshape(mask, transposed_shape).T
+                    offset = np.reshape(offset, transposed_shape).T
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    mask = np.transpose(np.reshape(mask, transposed_shape), (1, 2, 3, 0))
+                    offset = np.transpose(np.reshape(offset, transposed_shape), (1, 2, 3, 0))
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                if len(value.shape) != 2:
+                    raise Exception('Block pruning is supported for 2-dimensional weight matrices')
+
+                block_shape = model_attributes[layer.name].optimization_attributes.block_shape
+                if (value.shape[0] % block_shape[0]) != 0 or (value.shape[1] % block_shape[1] != 0):
+                    raise Exception('Block sizes need to be fators of weight matrix dimensions')
+
+                # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
+                # This method is commonly used to perform im2col, docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
+                total_blocks = (value.shape[0] * value.shape[1]) // (block_shape[0] * block_shape[1])
+                blocks_in_row = value.shape[1] // block_shape[1]
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(value, 2), 0),
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, 1, 1, 1], 'SAME'
+                    ).numpy(),
+                    (total_blocks, block_shape[0] * block_shape[1])
+                )
+
+                # If pruning enabled, find cost associated with pruning each neuron
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # Choose min(pruning, weight sharing)
+                groups = []
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                    else:
+                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                # Select groups with highest importance
+                _, selected = solve_knapsack(
+                    np.array([g.value for g in groups]),
+                    np.array([g.resources for g in groups]).T,
+                    target_resources,
+                    implementation=knapsack_solver
+                )
+
+                # Decode position of masked weights and set selected weights to one
+                mask = np.zeros(value.shape, value.dtype)
+                for i in selected:
+                    row = block_shape[0] * (i // blocks_in_row)
+                    col = block_shape[1] * (i % blocks_in_row)
+                    cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                    rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                    zeros = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                    mask[zeros[:, 0], zeros[:, 1]] = 1
+
+                # Calculate offset
+                not_selected = [i for i in range(len(groups)) if i not in selected]
+                offset = np.zeros(value.shape, value.dtype)
+                for i in not_selected:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[i, :])
+                        row = block_shape[0] * (i // blocks_in_row)
+                        col = block_shape[1] * (i % blocks_in_row)
+                        cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                        rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                        pos = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                        offset[pos[:, 0], pos[:, 1]] = mean
+
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+    return masks, offsets
+
+
+def __get_masks_global(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians,
+                       knapsack_solver):
+    '''
+    Function calculating a layer-wise binary mask for all optimizable layers
+    Global masking, with layers of different sparsity; masks are calculated by solving a Knapsack problem
+    Most of the logic remains similar to local masking; comments describing implementation are given in the function above
+    '''
+    groups = []
+    total_resources = []
+
+    # Iterate through all layers and create a list of all the optimizable groups (single weight, structure, pattern, block)
+    # Each entry contains the value associated with the group, alongside the layer it belongs to and its location in the layer
+    # The values is normalised w.r.t to the to largest element in the group, to avoid bias towards large layers
+    # We also keep track of total model resources, with respect to the objective
+    # A detailed comment in the local masking function is given for considerations on exception of layer savings and how to address them
+    for layer in keras_model.layers:
+        # Optimizable should be always enabled if either pruning or weight sharing are enabled
+        # However, if the objectives are implemented incorrectly, it is possible to have optimizatons enabled without any types of optimization (pruning, weight sharing etc.) enabled
+        layer_optimizable = model_attributes[layer.name].optimizable and (
+                    model_attributes[layer.name].optimization_attributes.weight_sharing or model_attributes[
+                layer.name].optimization_attributes.pruning)
+        if isinstance(layer, SUPPORTED_LAYERS) and layer_optimizable:
+            value = layer.get_weights()[0]
+            structure_type = model_attributes[layer.name].optimization_attributes.structure_type
+            layer_savings = objective.layer_savings(model_attributes[layer.name])
+            total_resources.append(objective.layer_resources(model_attributes[layer.name]))
+
+            if metric == 'oracle':
+                value = np.abs(np.multiply(value, gradients[layer.name]))
+                norm = 1
+            elif metric == 'saliency':
+                value = np.multiply(np.square(value), hessians[layer.name])
+                norm = 1
+            elif metric == 'l1':
+                norm = 1
+            else:
+                norm = 2
+
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    logging.warn('Weight sharing not suitable for unstructured pruning. Ignoring....')
+
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    value = np.abs(value)
+                    value = value / np.max(value)
+                    indices = np.indices(value.shape).reshape(value.ndim, -1).T
+                    for i in indices:
+                        group = __WeightGroups__(value[tuple(i)], layer_savings, tuple(i), structure_type, layer.name,
+                                                 'pruning')
+                        groups.append(group)
+
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                if isinstance(layer, (Dense, QDense)):
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(value, axis=0, ord=norm)
+                        vals_norm = vals_norm / np.max(vals_norm)
+                    else:
+                        vals_norm = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(value, axis=0)
+                    else:
+                        vals_var = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
+                                                           optimization_type='pruning'))
+                        else:
+                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
+                                                           optimization_type='sharing'))
+
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0, ord=norm)
+                        vals_norm = vals_norm / np.max(vals_norm)
+                    else:
+                        vals_norm = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0)
+                    else:
+                        vals_var = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
+                                                           optimization_type='pruning'))
+                        else:
+                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
+                                                           optimization_type='sharing'))
+
+            if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = model_attributes[layer.name].optimization_attributes.consecutive_patterns
+                if (np.prod(value.shape)) % pattern_offset != 0:
+                    raise Exception('Pattern offset needs to be a factor of matrix size')
+
+                if pattern_offset % consecutive_patterns != 0:
+                    raise Exception('Consecutive patterns need to be a factor of matrix size')
+
+                # Transpose, as done in hls4ml Resource strategy
+                if isinstance(layer, (Dense, QDense)):
+                    value = value.T
+                    transposed_shape = value.shape
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    value = np.transpose(value, axes=[3, 0, 1, 2])
+                    transposed_shape = value.shape
+
+                # Reshape weight matrix into [number_of_patterns, pattern_offset]
+                # Note, swapping the axis will mess up the weight order
+                # In the case of hls4ml, number_of_patterns is equivalent to reuse factor
+                # And, pattern_offset, is the number of multiplications done in parallel
+                number_of_patterns = np.prod(transposed_shape) // pattern_offset
+                target_shape = (pattern_offset, number_of_patterns)
+                reshaped = np.reshape(value, target_shape)
+
+                # Group consecutive patterns (rows) into blocks and reshape
+                total_blocks = pattern_offset // consecutive_patterns
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, 1, 1, 1], 'SAME'
+                    ).numpy(), (total_blocks, -1)
+                )
+
+                # If pruning enabled, find cost associated with pruning each neuron
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # Choose min(pruning, weight sharing)
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
+                                                       optimization_type='pruning'))
+                    else:
+                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
+                                                       optimization_type='sharing'))
+
+            if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                if len(value.shape) != 2:
+                    raise Exception('Block pruning is supported for 2-dimensional weight matrices')
+
+                block_shape = model_attributes[layer.name].optimization_attributes.block_shape
+                if (value.shape[0] % block_shape[0]) != 0 or (value.shape[1] % block_shape[1] != 0):
+                    raise Exception('Block sizes need to be fators of weight matrix dimensions')
+
+                total_blocks = (value.shape[0] * value.shape[1]) // (block_shape[0] * block_shape[1])
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(value, 2), 0),
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, 1, 1, 1], 'SAME'
+                    ).numpy(),
+                    (total_blocks, block_shape[0] * block_shape[1])
+                )
+
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                    vals_norm = vals_norm / np.max(vals_norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0]), sys.float_info.max, dtype=value.dtype)
+
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0]), sys.float_info.max, dtype=value.dtype)
+
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
+                                                       optimization_type='pruning'))
+                    else:
+                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
+                                                       optimization_type='sharing'))
+
+    # The goal is to maximize network accuracy (values) subject to resorces (objective) staying under some threshold
+    # This is a Knapsack problem; several implementations are provided in the helper functions
+    # The selected values correspond to weight / groups being kept in the network; the rest are pruned / weight shared
+    total_resources = np.sum(np.array(total_resources), axis=0)
+    target_resources = ((1 - sparsity) * np.array(total_resources)).astype(int)
+    _, selected = solve_knapsack(
+        np.array([s.value for s in groups]),
+        np.array([s.resources for s in groups]).T,
+        target_resources,
+        implementation=knapsack_solver
+    )
+    # Update masks and offsets
+    masks = {}
+    offsets = {}
+
+    for layer in keras_model.layers:
+        if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+            structure_type = model_attributes[layer.name].optimization_attributes.structure_type
+            selected_layer = [i for i in selected if groups[i].layer_name == layer.name]
+            not_selected_layer = [i for i in range(len(groups)) if
+                                  groups[i].layer_name == layer.name and i not in selected_layer]
+
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                for i in selected_layer:
+                    mask[groups[i].layer_position] = 1
+                masks[layer.name] = mask
+                offsets[layer.name] = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                offset = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                if isinstance(layer, (Dense, QDense)):
+                    for i in selected_layer:
+                        mask[:, groups[i].layer_position] = 1
+                    for i in not_selected_layer:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, groups[i].layer_position] = np.mean(layer.get_weights()[0][:, i])
+                if isinstance(layer, (Conv2D, QConv2D)):
+                    for i in selected_layer:
+                        mask[:, :, :, groups[i].layer_position] = 1
+                    for i in not_selected_layer:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, :, :, groups[i].layer_position, ] = np.mean(layer.get_weights()[0][:, :, :, i])
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = model_attributes[layer.name].optimization_attributes.consecutive_patterns
+                number_of_patterns = np.prod(model_attributes[layer.name].weight_shape) // pattern_offset
+
+                # Transpose shape, as done in hls4ml Resource strategy
+                # We need the weights to recalculate the block means
+                weight_shape = model_attributes[layer.name].weight_shape
+                if isinstance(layer, (Dense, QDense)):
+                    value = layer.get_weights()[0].T
+                    transposed_shape = (weight_shape[1], weight_shape[0])
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    value = np.transpose(layer.get_weights()[0], (3, 0, 1, 2))
+                    transposed_shape = (weight_shape[3], weight_shape[0], weight_shape[1], weight_shape[2])
+
+                # Decode masks
+                mask = np.zeros((np.prod(transposed_shape),), layer.get_weights()[0].dtype)
+                for i in selected_layer:
+                    pos = groups[i].layer_position * number_of_patterns * consecutive_patterns
+                    for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                        mask[range(j, j + number_of_patterns)] = 1
+
+                        # Decode offsets
+                offset = np.zeros((np.prod(transposed_shape),), layer.get_weights()[0].dtype)
+
+                # Re-calculate the blocks, they are needed to calculate block means
+                target_shape = (pattern_offset, number_of_patterns)
+                reshaped = np.reshape(value, target_shape)
+
+                total_blocks = pattern_offset // consecutive_patterns
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, 1, 1, 1], 'SAME'
+                    ).numpy(), (total_blocks, -1)
+                )
+
+                for i in not_selected_layer:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[groups[i].layer_position, :])
+                        pos = groups[i].layer_position * number_of_patterns * consecutive_patterns
+                        for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                            offset[range(j, j + number_of_patterns)] = mean
+
+                            # Reshape into original shape and store result
+                if isinstance(layer, (Dense, QDense)):
+                    mask = np.reshape(mask, transposed_shape).T
+                    offset = np.reshape(offset, transposed_shape).T
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    mask = np.transpose(np.reshape(mask, transposed_shape), (1, 2, 3, 0))
+                    offset = np.transpose(np.reshape(offset, transposed_shape), (1, 2, 3, 0))
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                block_shape = model_attributes[layer.name].optimization_attributes.block_shape
+                total_blocks = np.prod(model_attributes[layer.name].weight_shape) // np.prod(block_shape)
+                blocks_in_row = model_attributes[layer.name].weight_shape[1] // block_shape[1]
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(layer.get_weights()[0], 2), 0),
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, 1, 1, 1], 'SAME'
+                    ).numpy(),
+                    (total_blocks, block_shape[0] * block_shape[1])
+                )
+
+                mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                for i in selected_layer:
+                    row = block_shape[0] * (groups[i].layer_position // blocks_in_row)
+                    col = block_shape[1] * (groups[i].layer_position % blocks_in_row)
+                    cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                    rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                    zeros = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                    mask[zeros[:, 0], zeros[:, 1]] = 1
+
+                offset = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                for i in not_selected_layer:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[groups[i].layer_position, :])
+                        row = block_shape[0] * (groups[i].layer_position // blocks_in_row)
+                        col = block_shape[1] * (groups[i].layer_position % blocks_in_row)
+                        cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                        rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                        pos = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                        offset[pos[:, 0], pos[:, 1]] = mean
+
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+    return masks, offsets
+
+
+class __WeightGroups__:
+    '''
+    A helper class containing information about a group of weights
+    '''
+
+    def __init__(self, value, resources, layer_position, structure_type=None, layer_name=None, optimization_type=None):
+        self.value = value
+        self.resources = resources
+        self.layer_position = layer_position
+        self.structure_type = structure_type
+        self.layer_name = layer_name
+        self.optimization_type = optimization_type
\ No newline at end of file
diff --git a/test/pytest/optimization/keras/test_masking.py b/test/pytest/optimization/keras/test_masking.py
new file mode 100644
index 0000000000..6726dcc638
--- /dev/null
+++ b/test/pytest/optimization/keras/test_masking.py
@@ -0,0 +1,447 @@
+import pytest
+import numpy as np
+from qkeras import QDense, QConv2D
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D, Flatten
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.objectives import ParameterEstimator
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
+
+'''
+In all the tests, an artifical network with one Dense/Conv2D layer and pre-determined weights is created
+Then, the tests assert zeros occur in the correct places, based on the masking structure (unstructured, block etc.)
+Furthermore, tests assert the masks are binary, so only zeros and ones occur
+Masking is such that:
+        * non_zero_params <= (1 - sparsity) * total_params OR
+        * zero_params > sparsity * total_params
+Since the targetted objective is ParameterEstimator, weight sharing is not suitable [does not decrease the number of weights]
+Therefore, all the test verify offsets are zero
+'''
+sparsity = 0.33
+local_masking = [True, False]
+dense_layers = [Dense, QDense]
+conv2d_layers = [Conv2D, QConv2D]
+
+
+# Create a Dense layer with artificial weights, so that (1, 1) and (2, 3) (matrix indexing) are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_unstructured(local_masking, dense):
+    weight_shape = (2, 3)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][0, 0] = 1e-6
+    weights[0][1, 2] = 1e-6
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.UNSTRUCTURED
+
+    # 33% sparsity - zero 2 out of 6 blocks with lowest norm [0.33 * 6 = 1.98 -> next largest int is 2]
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    zeros = np.array([
+        [0, 0],
+        [1, 2]
+    ],
+        dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert (not np.any(offsets['dense']))
+    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
+    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Create a Dense layer with artificial weights, so that the 1st and 3rd column (neuron) are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_structured(local_masking, dense):
+    weight_shape = (3, 6)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][:, 0] = 1e-6
+    weights[0][:, 2] = 1e-6
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
+
+    # 33% sparsity - zero 2 out of 6 blocks with lowest norm [0.33 * 6 = 1.98 -> next largest int is 2]
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    zeros = np.array([
+        [0, 0], [1, 0], [2, 0],  # First neuron
+        [0, 2], [1, 2], [2, 2],  # Third neuron
+    ],
+        dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert (not np.any(offsets['dense']))
+    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
+    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Create a Dense layer with artificial weights, so that some patterns are pruned
+# Set pattern offset to 4, which is equivalent to RF = 3 [4 * 3 / 4]
+# In this case consecutive patterns are one, so pruning per DSP block
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_pattern(local_masking, dense):
+    weight_shape = (3, 4)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+
+    # Set 1st block low
+    weights[0][0, 0] = 1e-6
+    weights[0][1, 0] = 1e-6
+    weights[0][2, 0] = 1e-6
+
+    # Set 3rd block low
+    weights[0][0, 2] = 1e-6
+    weights[0][1, 2] = 1e-6
+    weights[0][2, 2] = 1e-6
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['dense'].optimization_attributes.pattern_offset = 4
+    model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
+
+    # 33% sparsity - zero 4 out of 12 weights, group by pattern [0.33 * 12 = 3.96] - so will select 2 patterns, 6 weights (>=)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    zeros = np.array([
+        [0, 0], [1, 0], [2, 0],
+        [0, 2], [1, 2], [2, 2]
+    ],
+        dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert (not np.any(offsets['dense']))
+    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
+    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Create a Dense layer with artificial weights, so that the 1st and 4th block are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_block(local_masking, dense):
+    weight_shape = (4, 6)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+
+    # Set 1st block low
+    weights[0][0, 0] = 1e-6
+    weights[0][0, 1] = 1e-6
+    weights[0][1, 0] = 1e-6
+    weights[0][1, 2] = 1e-6
+
+    # Set 4th block low
+    weights[0][2, 2] = 1e-6
+    weights[0][2, 3] = 1e-6
+    weights[0][3, 2] = 1e-6
+    weights[0][3, 3] = 1e-6
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
+    model_attributes['dense'].optimization_attributes.block_shape = (2, 2)
+
+    # 33% sparsity - zero 2 out of 6 blocks with lowest norm
+    # The first block is the smallest, the fourth block is set to zero
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    zeros = np.array([
+        [0, 0], [0, 1], [1, 0], [1, 1],
+        [2, 2], [2, 3], [3, 2], [3, 3]
+    ],
+        dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert (not np.any(offsets['dense']))
+    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
+    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Create a Conv2D layer with artificial weights and mask some small weights
+# Target sparsity is 0.33, so there should be <= (1 - 0.33) * 16 = 10.72 non-zero params
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_masking_unstructured(local_masking, conv2d):
+    filt_width = 2
+    filt_height = 2
+    n_channels = 2
+    n_filters = 2
+
+    model = Sequential()
+    model.add(conv2d(n_filters, input_shape=(8, 8, n_channels), kernel_size=(filt_width, filt_height), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][0, 0, 0, 0] = 1e-6
+    weights[0][1, 0, 0, 0] = 1e-6
+    weights[0][0, 1, 0, 0] = 1e-6
+    weights[0][0, 0, 1, 0] = 1e-6
+    weights[0][0, 0, 0, 1] = 1e-6
+    weights[0][1, 1, 1, 1] = 1e-6
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.UNSTRUCTURED
+
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    zeros = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1]],
+                     dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
+
+    assert (not np.any(offsets['conv2d']))
+    assert (not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]]))
+    assert ((filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Create a Conv2D layer with artificial weights, so that second and last filter are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_masking_structured(local_masking, conv2d):
+    filt_width = 3
+    filt_height = 3
+    n_channels = 4
+    n_filters = 6
+
+    model = Sequential()
+    model.add(conv2d(n_filters, input_shape=(8, 8, n_channels), kernel_size=(filt_width, filt_height), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][:, :, :, 1] = 1e-3
+    weights[0][:, :, :, 5] = 1e-3
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
+
+    # 33% sparsity - zero 2 out of 6 filters with lowest norm
+    # Generate all possible combinations of width and height pixels with channel using np.meshgrid()
+    # This represents all the positions for a single filter; then append filter position to the last columns
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    width_pixels = np.array(range(0, filt_width))
+    height_pixels = np.array(range(0, filt_height))
+    channels = np.array(range(0, n_channels))
+    combinations = np.array(np.meshgrid(width_pixels, height_pixels, channels)).T.reshape(-1, 3)
+    zeros = np.array(np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 1), axis=1).tolist() +
+                     np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 5), axis=1).tolist(),
+                     dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
+
+    assert (not np.any(offsets['conv2d']))
+    assert (not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]]))
+    assert ((filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Create a Conv2D layer with artificial weights, so that the first and second pattern are pruned
+# Set pattern offset to 4, which is equivalent to RF = 2 [2 * 2 * 2 / 4]
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_masking_pattern(local_masking, conv2d):
+    filt_width = 2
+    filt_height = 1
+    n_channels = 2
+    n_filters = 2
+
+    model = Sequential()
+    model.add(conv2d(n_filters, input_shape=(8, 8, n_channels), kernel_size=(filt_width, filt_height), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+
+    # Set the first DSP block to be approximately zero
+    weights[0][0, 0, 0, 0] = 1e-6
+    weights[0][0, 0, 1, 0] = 1e-6
+
+    # Set the third DSP block to be approximately zero
+    weights[0][0, 0, 0, 1] = 1e-6
+    weights[0][0, 0, 1, 1] = 1e-6
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['conv2d'].optimization_attributes.pattern_offset = 4
+
+    # 33% sparsity - zero out the two of the lowest groups
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    print(masks['conv2d'].shape)
+    print(weights[0].shape)
+    zeros = np.array([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 1]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
+
+    assert (not np.any(offsets['conv2d']))
+    assert (not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]]))
+    assert ((filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0]))
+
+
+# Block pruning is only allowed for 2-dimensional matrices, so assert a correct exception is raised when pruning with Conv2D
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_block_masking_raises_exception(local_masking, conv2d):
+    model = Sequential()
+    model.add(conv2d(4, input_shape=(8, 8, 3), kernel_size=(3, 3), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
+
+    try:
+        get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    except Exception:
+        assert (True)
+        return
+    assert (False)
+
+
+# Test edge cases: 0% and 100% sparsity
+# Test 50% sparsity with two layers
+@pytest.mark.parametrize('s', [0, 0.5, 1])
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED,
+                                  SUPPORTED_STRUCTURES.PATTERN, SUPPORTED_STRUCTURES.BLOCK])
+def test_multi_layer_masking(s, local_masking, type):
+    dense_units = 16
+    conv_filters = 6
+    conv_channels = 4
+    conv_shape = (2, 2)  # Using (2, 2) instead of (3, 3) as it's an even number of weights
+    input_shape = (8, 8)
+
+    # Simple model, Conv2D weight shape (2, 2, 4, 6) and Dense weight shape (384, 16)
+    model = Sequential()
+    model.add(Conv2D(conv_filters, input_shape=(*input_shape, conv_channels), kernel_size=conv_shape, name='conv2d',
+                     padding='same', kernel_initializer='ones'))
+    model.add(Flatten())
+    model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
+
+    # Make 'dense' and 'conv2d' optimizable
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = type
+    model_attributes['dense'].optimization_attributes.pattern_offset = 1024  # Equivalent to RF = 6 (384 * 16 / 1024)
+
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes[
+        'conv2d'].optimization_attributes.structure_type = type if type != SUPPORTED_STRUCTURES.BLOCK else SUPPORTED_STRUCTURES.UNSTRUCTURED
+    model_attributes['conv2d'].optimization_attributes.pattern_offset = 4  # Equivalent to RF = 4 (2 * 2 * 4 * 6 / 4)
+
+    masks, offsets = get_model_masks(model, model_attributes, s, ParameterEstimator, metric='l1', local=local_masking)
+    if s == 1:  # 100% sparsity - all masks are zero
+        print(np.count_nonzero(masks['dense'].flatten()))
+        assert (not np.any(masks['dense']))
+        assert (not np.any(masks['conv2d']))
+    elif s == 0.5:
+        conv2d_weights = conv_channels * conv_filters * np.prod(conv_shape)
+        dense_weights = dense_units * np.prod(input_shape) * conv_filters
+        if local_masking:
+            assert (np.count_nonzero(masks['conv2d']) == int((1 - s) * conv2d_weights))
+            assert (np.count_nonzero(masks['dense']) == int((1 - s) * dense_weights))
+        else:
+            # Less than or equal to, since Knapsack problem imposes a hard constrain on the active resources (ones)
+            assert (np.count_nonzero(masks['conv2d']) + np.count_nonzero(masks['dense']) <= int(
+                (1 - s) * (conv2d_weights + dense_weights)))
+    elif s == 0:  # 0% sparsity - all masks are one
+        assert (np.all(masks['dense']))
+        assert (np.all(masks['conv2d']))
+
+    assert (not np.any(offsets['dense']))
+    assert (not np.any(offsets['conv2d']))
+
+
+# Create a Dense layer with artificial weights, so that some consecutive patterns are pruned
+# Set consecutive patterns to 2, so that the 1st block [1st and 2nd pattern] are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_consecutive_pattern_masking(local_masking, dense):
+    weight_shape = (3, 4)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+    weights = model.layers[0].get_weights()
+
+    weights[0] = np.arange(np.prod(weight_shape)).reshape(weight_shape)
+
+    # Set 1st and 2nd pattern low
+    weights[0][0, 0] = 1e-6
+    weights[0][1, 0] = 1e-6
+    weights[0][2, 0] = 1e-6
+    weights[0][0, 1] = 1e-4
+    weights[0][1, 1] = 1e-4
+    weights[0][2, 1] = 1e-4
+
+    # Set 4th pattern lower than second
+    # This pattern should still remain unmasked [even if it has a lower value than the 2nd pattern],
+    # As its neigbouring block has a larger value than the 2nd pattern
+    weights[0][0, 3] = 1e-6
+    weights[0][1, 3] = 1e-6
+    weights[0][2, 3] = 1e-6
+
+    print(weights)
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['dense'].optimization_attributes.pattern_offset = 4
+    model_attributes['dense'].optimization_attributes.consecutive_patterns = 2
+
+    # 33% sparsity - zero 4 out of 12 weight, group by pattern [0.33 * 12 = 3.96]
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
+                                     local=local_masking)
+    zeros = np.array([
+        [0, 0], [1, 0], [2, 0],
+        [0, 1], [1, 1], [2, 1]
+    ],
+        dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert (not np.any(offsets['dense']))
+    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
+    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
diff --git a/test/pytest/optimization/keras/test_weight_sharing.py b/test/pytest/optimization/keras/test_weight_sharing.py
new file mode 100644
index 0000000000..63718ce08f
--- /dev/null
+++ b/test/pytest/optimization/keras/test_weight_sharing.py
@@ -0,0 +1,159 @@
+import pytest
+import numpy as np
+from qkeras import QDense
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.models import Sequential
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.objectives import ObjectiveEstimator
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
+
+# Similar tests in test_masking.py, weight sharing instead of pruning
+sparsity = 0.33
+local_masking = [True, False]
+dense_layers = [Dense, QDense]
+
+'''
+A mock objective class for weight sharing
+When a group of weights is quantized to the mean value, resource savings are equal to the number of weights quantized
+This is similar to ParameterEstimator, but instead of pruning, weight sharing is performed and
+No savings are incurred with unstructured type (unstructured weight sharing doesn't make sense)
+'''
+
+class MockWeightSharingEstimator(ObjectiveEstimator):
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            return [np.prod(layer_attributes.weight_shape)]
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+
+        if layer_attributes.optimization_attributes.weight_sharing:
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                if 'Dense' in layer_attributes.layer_type.__name__:
+                    return [layer_attributes.weight_shape[1]]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                number_of_patterns = np.prod(
+                    layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                return [np.prod(layer_attributes.optimization_attributes.block_shape)]
+        return [0]
+
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_weight_sharing_structured(local_masking, dense):
+    weight_shape = (4, 3)
+
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    weights = model.layers[0].get_weights()
+
+    weights[0][:, 1] = 0.5
+    weights[0][0, 1] -= 1e-4
+    weights[0][2, 1] += 1e-4
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = False
+    model_attributes['dense'].optimization_attributes.weight_sharing = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
+
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1',
+                                     local=local_masking)
+    frozen = np.array([
+        [0, 1], [1, 1], [2, 1]
+    ],
+        dtype=np.int32)
+
+    assert (not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]]))
+    assert (np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5))
+
+
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_weight_sharing_pattern(local_masking, dense):
+    weight_shape = (3, 4)
+
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    weights = model.layers[0].get_weights()
+    weights[0][0, 0] = 0.5 + 1e-4
+    weights[0][1, 0] = 0.5 - 1e-4
+    weights[0][2, 0] = 0.5
+
+    weights[0][0, 2] = 0.5 + 1e-4
+    weights[0][1, 2] = 0.5 - 1e-4
+    weights[0][2, 2] = 0.5
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = False
+    model_attributes['dense'].optimization_attributes.weight_sharing = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['dense'].optimization_attributes.pattern_offset = 4
+    model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
+
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1',
+                                     local=local_masking)
+    frozen = np.array([
+        [0, 0], [1, 0], [2, 0],
+        [0, 2], [1, 2], [2, 2]
+    ],
+        dtype=np.int32)
+
+    assert (not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]]))
+    assert (np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5))
+
+
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_weight_sharing_block(local_masking, dense):
+    weight_shape = (4, 6)
+
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    weights = model.layers[0].get_weights()
+
+    weights[0][0, 0] = 0.5 + 1e-3
+    weights[0][0, 1] = 0.5 + 1e-3
+    weights[0][1, 0] = 0.5 - 1e-3
+    weights[0][1, 1] = 0.5 - 1e-3
+
+    weights[0][2, 2] = 0.5 + 1e-3
+    weights[0][2, 3] = 0.5 + 1e-3
+    weights[0][3, 2] = 0.5 - 1e-3
+    weights[0][3, 3] = 0.5 - 1e-3
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = False
+    model_attributes['dense'].optimization_attributes.weight_sharing = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
+    model_attributes['dense'].optimization_attributes.block_shape = (2, 2)
+
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1',
+                                     local=local_masking)
+    frozen = np.array([
+        [0, 0], [0, 1], [1, 0], [1, 1],
+        [2, 2], [2, 3], [3, 2], [3, 3]
+    ],
+        dtype=np.int32)
+
+    assert (not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]]))
+    assert (np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5))
\ No newline at end of file

From e655ab6049686c655a294a74a67bcfab74ff57f1 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 16:21:46 +0300
Subject: [PATCH 06/19] Remove unused channels w/ Keras Surgeon

---
 hls4ml/optimization/keras/reduction.py        |  59 ++++++++
 .../optimization/keras/test_reduction.py      | 134 ++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 hls4ml/optimization/keras/reduction.py
 create mode 100644 test/pytest/optimization/keras/test_reduction.py

diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/keras/reduction.py
new file mode 100644
index 0000000000..3ccf691959
--- /dev/null
+++ b/hls4ml/optimization/keras/reduction.py
@@ -0,0 +1,59 @@
+import numpy as np
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D
+from hls4ml.optimization.keras.utils import get_last_layer_with_weights
+
+'''
+Function for removing zero neurons & filters from a model and rewiring the model graph
+This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon 
+Keras Surgeon is no longer under active development and does not work for TensorFlow 2.3+ and QKeras
+The baseline version was forked and updated, available at: https://github.com/bo3z/keras-surgeon
+
+Args:
+    - model (keras.model): Input model
+    
+Return:
+    - reduced (keras.model): Modified model, with redundant structures removed
+
+    '''
+def reduce_model(model):
+    # TODO - Should we make Keras Surgeon a hard requirement in setup.cfg? If so, needs to be installed from git, @bo3z fork
+    try:
+        from kerassurgeon import Surgeon
+    except ModuleNotFoundError:
+        raise Exception('Keras Surgeon not installed. Unable to reduce model footprint '\
+                         'Please install up-to-date Keras Surgeon compatible wit TensorFlow 2.3+ and QKeras '\
+                         'Installation from git: https://github.com/bo3z/keras-surgeon')
+    
+    # Initiate surgeon
+    surgeon = Surgeon(model)
+
+    # Iterate through layers and identify neurons (columns) and filters (tensors, W x H x C) to be removed
+    last_idx = get_last_layer_with_weights(model)
+    for idx, layer in enumerate(model.layers):
+        # Last layer with weights cannot be removed, as it maps to data set labels
+        if (idx == last_idx):
+            break
+
+        # Currently supported Dense and Conv2D; these two can be combined in a single if-statement
+        # Keras Surgeon has a full range of support for Conv1D / Conv3D, reucurrent etc. - might extend in the future
+        if isinstance(layer, Dense):
+            weights = layer.get_weights()[0]
+            zeros = np.where(~weights.any(axis=0))[0].tolist()
+            surgeon.add_job('delete_channels', layer, channels=zeros)
+
+        elif isinstance(layer, Conv2D):
+            weights = layer.get_weights()[0]
+            zeros = np.where(~weights.reshape(-1, weights.shape[-1]).any(axis=0))[0].tolist()
+            surgeon.add_job('delete_channels', layer, channels=zeros)
+    
+    # Reduce model
+    reduced = surgeon.operate()
+
+    # By default, Keras surgeon returns a Functional model
+    # If the original was a Sequential, convert back
+    is_sequential =  model.__class__.__name__ == 'Sequential'
+    if is_sequential:
+        return Sequential(layers=reduced.layers)
+    else:
+        return reduced
diff --git a/test/pytest/optimization/keras/test_reduction.py b/test/pytest/optimization/keras/test_reduction.py
new file mode 100644
index 0000000000..3b9e75cd6e
--- /dev/null
+++ b/test/pytest/optimization/keras/test_reduction.py
@@ -0,0 +1,134 @@
+from qkeras import quantized_bits
+from qkeras import QDense, QActivation, QConv2D
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Softmax, BatchNormalization, ReLU, Flatten, AveragePooling2D
+from hls4ml.optimization.keras.reduction import reduce_model
+from hls4ml.optimization.keras.utils import get_model_sparsity
+
+'''
+Set some neurons / filters to zero and verify that these are removed
+Even is some neurons (columns) in the output layer are zero, these should not be removed (to match data set labels)
+Test verify the above property, by setting some zeros in the last layer and verifying these remain in place
+'''
+
+def test_keras_model_reduction():
+    model = Sequential()
+    model.add(Conv2D(8, (3, 3), input_shape=(64, 64, 1), name='conv2d_1', padding='same'))
+    model.add(MaxPooling2D())
+    model.add(BatchNormalization())
+    model.add(ReLU())
+    model.add(Conv2D(32, (5, 5), padding='same', name='conv2d_2'))
+    model.add(AveragePooling2D())
+    model.add(BatchNormalization())
+    model.add(ReLU())
+    model.add(Flatten())
+    model.add(Dense(32, input_shape=(16, ), name = 'dense_1', activation='relu'))
+    model.add(BatchNormalization())
+    model.add(Dense(14, name = 'dense_2', activation='relu'))
+    model.add(BatchNormalization())
+    model.add(Dense(5, name = 'dense_3'))
+    model.add(Softmax())
+
+    indices = {
+        'conv2d_1': [2, 4, 7],
+        'conv2d_2': [0, 1, 2, 3, 4, 5],
+        'dense_1': [0, 5, 17, 28],
+        'dense_2': [1, 9, 4],
+        'dense_3': [3],
+    }
+    for layer in model.layers:
+        if isinstance(layer, Dense):
+            weights = layer.get_weights()
+            weights[0][:, indices[layer.name]] = 0
+            layer.set_weights(weights)
+        if isinstance(layer, Conv2D):
+            weights = layer.get_weights()
+            weights[0][:, :, :, indices[layer.name]] = 0
+            layer.set_weights(weights)
+   
+    sparsity, _ = get_model_sparsity(model)
+    assert(sparsity > 0)
+
+    reduced = reduce_model(model)
+    assert(reduced.get_layer('conv2d_1').get_weights()[0].shape == (3, 3, 1, 5))
+    assert(reduced.get_layer('conv2d_2').get_weights()[0].shape == (5, 5, 5, 26))
+    assert(reduced.get_layer('dense_1').get_weights()[0].shape == (6656, 28))
+    assert(reduced.get_layer('dense_2').get_weights()[0].shape == (28, 11))
+    assert(reduced.get_layer('dense_3').get_weights()[0].shape == (11, 5))
+
+    _, layer_sparsity = get_model_sparsity(reduced)
+    assert(layer_sparsity['conv2d_1'] == 0)
+    assert(layer_sparsity['conv2d_2'] == 0)
+    assert(layer_sparsity['dense_1'] == 0)
+    assert(layer_sparsity['dense_2'] == 0)
+    assert(layer_sparsity['dense_3'] > 0)
+
+def test_qkeras_model_reduction():
+    bits = 8
+    activation = 'quantized_relu(4)'
+    quantizer = quantized_bits(bits, 0)
+    
+    model = Sequential()
+    model.add(QConv2D(8, (3, 3), input_shape=(64, 64, 1), name='qconv2d_1', padding='same', kernel_quantizer=quantizer))
+    model.add(MaxPooling2D())
+    model.add(BatchNormalization())
+    model.add(QActivation(activation, name='qrelu_1'))
+    model.add(QConv2D(32, (5, 5), padding='same', name='qconv2d_2', kernel_quantizer=quantizer))
+    model.add(AveragePooling2D())
+    model.add(BatchNormalization())
+    model.add(QActivation(activation, name='qrelu_2'))
+    model.add(Flatten())
+    model.add(QDense(32, input_shape=(16, ), name = 'qdense_1', kernel_quantizer=quantizer))
+    model.add(QActivation(activation, name='qrelu_3'))
+    model.add(BatchNormalization())
+    model.add(QDense(14, name = 'qdense_2', kernel_quantizer=quantizer))
+    model.add(QActivation(activation, name='qrelu_4'))
+    model.add(BatchNormalization())
+    model.add(QDense(5, name = 'qdense_3', kernel_quantizer=quantizer))
+    model.add(Softmax())
+
+    indices = {
+        'qconv2d_1': [2, 4, 7],
+        'qconv2d_2': [0, 1, 2, 3, 4, 5],
+        'qdense_1': [0, 5, 17, 28],
+        'qdense_2': [1, 9, 4],
+        'qdense_3': [3],
+    }
+    for layer in model.layers:
+        if isinstance(layer, QDense):
+            weights = layer.get_weights()
+            weights[0][:, indices[layer.name]] = 0
+            layer.set_weights(weights)
+        if isinstance(layer, QConv2D):
+            weights = layer.get_weights()
+            weights[0][:, :, :, indices[layer.name]] = 0
+            layer.set_weights(weights)
+   
+    sparsity, _ = get_model_sparsity(model)
+    assert(sparsity > 0)
+
+    reduced = reduce_model(model)
+    assert(reduced.get_layer('qconv2d_1').get_weights()[0].shape == (3, 3, 1, 5))
+    assert(reduced.get_layer('qconv2d_2').get_weights()[0].shape == (5, 5, 5, 26))
+    assert(reduced.get_layer('qdense_1').get_weights()[0].shape == (6656, 28))
+    assert(reduced.get_layer('qdense_2').get_weights()[0].shape == (28, 11))
+    assert(reduced.get_layer('qdense_3').get_weights()[0].shape == (11, 5))
+
+    _, layer_sparsity = get_model_sparsity(reduced)
+    assert(layer_sparsity['qconv2d_1'] == 0)
+    assert(layer_sparsity['qconv2d_2'] == 0)
+    assert(layer_sparsity['qdense_1'] == 0)
+    assert(layer_sparsity['qdense_2'] == 0)
+    assert(layer_sparsity['qdense_3'] > 0)
+
+    # Verify network surgery has no impact on quantization
+    assert(isinstance(reduced.get_layer('qrelu_1'), QActivation))
+    assert(isinstance(reduced.get_layer('qrelu_2'), QActivation))
+    assert(isinstance(reduced.get_layer('qrelu_3'), QActivation))
+    assert(isinstance(reduced.get_layer('qrelu_4'), QActivation))
+    assert(reduced.get_layer('qconv2d_1').kernel_quantizer['config']['bits'] == bits)
+    assert(reduced.get_layer('qconv2d_2').kernel_quantizer['config']['bits'] == bits)
+    assert(reduced.get_layer('qdense_1').kernel_quantizer['config']['bits'] == bits)
+    assert(reduced.get_layer('qdense_2').kernel_quantizer['config']['bits'] == bits)
+    assert(reduced.get_layer('qdense_3').kernel_quantizer['config']['bits'] == bits)
+    
\ No newline at end of file

From 399a98d41615846ce6c1e6f24b38b0302e9edc26 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 17:04:17 +0300
Subject: [PATCH 07/19] Hyperparameter tuning for pruning & weight sharing
 regularization

---
 hls4ml/optimization/keras/builder.py          | 187 ++++++++++++++++++
 .../optimization/keras/test_regularizers.py   | 105 +++++-----
 2 files changed, 242 insertions(+), 50 deletions(-)
 create mode 100644 hls4ml/optimization/keras/builder.py

diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/keras/builder.py
new file mode 100644
index 0000000000..c87b15c604
--- /dev/null
+++ b/hls4ml/optimization/keras/builder.py
@@ -0,0 +1,187 @@
+import re
+import numpy as np
+import tensorflow as tf
+import keras_tuner as kt
+from qkeras import QDense, QConv2D
+from tensorflow.keras.layers import Dense, Conv2D
+from tensorflow.keras.callbacks import EarlyStopping
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, TMP_DIRECTORY
+from hls4ml.optimization.keras.regularizers import DenseRegularizer, Conv2DRegularizer
+
+from qkeras.utils import _add_supported_quantized_objects; co = {}; _add_supported_quantized_objects(co)
+
+class HyperOptimizationModel(kt.HyperModel):
+    '''
+    Helper class for Keras Tuner
+
+    Args:
+        - model (keras.Model): Baseline model
+        - attributes (dict): Layer-wise dictionary of attributes
+        - optimizer (keras.optimizers.Optimizer or equvialent string description): Model optimizer
+        - loss_fn (keras.losses.Loss or equivalent string description): Model loss function
+        - validation_metric (keras.metrics.Metric or equivalent string description): Model validation metric
+        - regularization_range (list): List of suitable hyperparameters for weight decay
+    '''
+    
+    def __init__(self, model, attributes, optimizer, loss_fn, validation_metric, regularization_range):
+        self.model = model
+        self.attributes = attributes
+        self.optimizer = optimizer
+        self.loss_fn = loss_fn
+        self.validation_metric = validation_metric
+        self.regularization_range = regularization_range
+
+    def build(self, hp):
+        model_to_prune = tf.keras.models.clone_model(self.model)
+        default_regularizaton = self.regularization_range[len(self.regularization_range) // 2]
+
+        # Make regularization loss a tunable hyperparameter
+        for layer in model_to_prune.layers:
+            if isinstance(layer, SUPPORTED_LAYERS) and self.attributes[layer.name].optimizable:        
+                structure_type = self.attributes[layer.name].optimization_attributes.structure_type
+                block_shape = self.attributes[layer.name].optimization_attributes.block_shape
+                pattern_offset = self.attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = self.attributes[layer.name].optimization_attributes.consecutive_patterns
+                
+                pruning = self.attributes[layer.name].optimization_attributes.pruning
+                weight_sharing = self.attributes[layer.name].optimization_attributes.weight_sharing         
+            
+                alpha = hp.Choice(f'{layer.name}_alpha', values=self.regularization_range, default=default_regularizaton) if pruning else 0
+                beta = hp.Choice(f'{layer.name}_beta', values=self.regularization_range, default=default_regularizaton) if weight_sharing else 0
+
+                if isinstance(layer, (Dense, QDense)) and self.attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = DenseRegularizer(alpha, beta, norm=1, structure_type=structure_type, block_shape=block_shape, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+                elif isinstance(layer, (Conv2D, QConv2D)) and self.attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = Conv2DRegularizer(alpha, beta, norm=1, structure_type=structure_type, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+
+        # Rebuild model graph
+        model_to_prune = tf.keras.models.model_from_json(model_to_prune.to_json(), custom_objects=co)
+        model_to_prune.set_weights(self.model.get_weights())
+        model_to_prune.compile(optimizer=self.optimizer, loss=self.loss_fn, metrics=[self.validation_metric]) 
+
+        return model_to_prune
+
+def build_optimizable_model(
+    model, attributes, optimizer, loss_fn, validation_metric, increasing,
+    train_dataset, validation_dataset, batch_size, epochs, 
+    verbose=False, directory=TMP_DIRECTORY, tuner='Bayesian',
+    regularization_range=np.logspace(-6, -2, num=15).tolist()
+):
+    '''
+    Function identifying optimizable layers and adding a regularization loss
+
+    Args:
+    - model (keras.Model): Model to be optimized
+    - attributes (dict): Layer-wise model attributes, obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
+    - optimizer (keras.optimizers.Optimizer): Optimizer used during training
+    - loss_fn (keras.losses.Loss): Loss function used during training
+    - validation_metric (keras.metrics.Metric): Validation metric, used as a baseline
+    - train_dataset (tf.Dataset): Training inputs and labels, in the form of an iterable TF Dataset
+    - validation_dataset (tf.Dataset): Validation inputs and labels, in the form of an iterable TF Dataset
+    - batch_size (int): Batch size during training
+    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+
+    Kwargs:
+    - verbose (bool): Whether to log tuner outputs to the console
+    - directory (string): Directory to store tuning results
+    - tuner (str): Tuning alogorithm, choose between Bayesian and Hyperband
+    - regularization_range (list): List of suitable hyperparameters for weight decay
+    - learning_rate_range (list): List of suitable hyperparameters for learning rate
+
+    Notes:
+    - In general, the regularization and learning rate ranges do not need to be provided, as the implementation sets a generic enough range.
+      However, if the user has an idea on the possible range on hyperparameter ranges (eg. VGG-16 weight decay ~10^-5), the tuning will complete faster
+    - The default tuner is Bayesian & when coupled with the correct ranges of hyperparameters, it performs quite well, fast. However, older version of Keras Tuner had a crashing bug with Bayesian Tuner
+    - In general, the directory does not need to be specified. However, if pruning several models simultaneously, to avoid conflicting intermediate results, it is useful to specify directory
+    '''
+    # User provided manual hyper-parameters for regularisation loss
+    # TODO - Maybe we could extend this to be hyper-parameters per layer? or layer-type?
+    # Currently, the same (manually-set) hyper-parameter is set for every layer
+    if tuner == 'Manual':
+        model_to_prune = tf.keras.models.clone_model(model)
+        for layer in model_to_prune.layers:
+            if isinstance(layer, SUPPORTED_LAYERS) and attributes[layer.name].optimizable:        
+                structure_type = attributes[layer.name].optimization_attributes.structure_type
+                block_shape = attributes[layer.name].optimization_attributes.block_shape
+                pattern_offset = attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = attributes[layer.name].optimization_attributes.consecutive_patterns
+
+                pruning = attributes[layer.name].optimization_attributes.pruning
+                weight_sharing = attributes[layer.name].optimization_attributes.weight_sharing         
+
+                alpha = regularization_range[0] if pruning else 0
+                beta = regularization_range[0] if weight_sharing else 0
+
+                if isinstance(layer, (Dense, QDense)) and attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = DenseRegularizer(alpha, beta, norm=1, structure_type=structure_type, block_shape=block_shape, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+                elif isinstance(layer, (Conv2D, QConv2D)) and attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = Conv2DRegularizer(alpha, beta, norm=1, structure_type=structure_type, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+
+        # Rebuild model graph
+        model_to_prune = tf.keras.models.model_from_json(model_to_prune.to_json(), custom_objects=co)
+        model_to_prune.set_weights(model.get_weights())
+        model_to_prune.compile(optimizer=optimizer, loss=loss_fn, metrics=[validation_metric]) 
+
+        return model_to_prune    
+
+    # User opted for hyper-parameter tuning
+    else:
+        objective_direction = 'max' if increasing else 'min'
+        if isinstance(validation_metric, str):
+            objective_name = validation_metric
+        else:
+            objective_name = re.sub(r'(?<!^)(?=[A-Z])', '_', validation_metric.__class__.__name__).lower()
+        if tuner == 'Bayesian':
+            tuner = kt.BayesianOptimization(
+                hypermodel = HyperOptimizationModel(model, attributes, optimizer, loss_fn, validation_metric, regularization_range),
+                objective = kt.Objective(objective_name, objective_direction),
+                max_trials = 10,
+                overwrite = True,
+                directory = directory + '/tuning',
+            )
+        elif tuner == 'Hyperband':
+            tuner = kt.Hyperband(
+                hypermodel = HyperOptimizationModel(model, attributes, optimizer, loss_fn, validation_metric, regularization_range),
+                objective = kt.Objective(objective_name, objective_direction),
+                max_epochs = epochs,
+                factor = 3,
+                hyperband_iterations = 1,
+                overwrite = True,
+                directory = directory + '/tuning',
+            )
+        else:
+            raise Exception('Unknown tuner; possible options are Bayesian and Hyperband')
+
+        if verbose:
+            tuner.search_space_summary()
+        
+        tuner.search(
+            train_dataset, 
+            epochs = epochs, 
+            batch_size = batch_size, 
+            validation_data = validation_dataset,
+            callbacks = [ EarlyStopping(monitor='val_loss', patience=1) ]
+        )
+        
+        if verbose:
+            tuner.results_summary()
+
+        return tuner.get_best_models(num_models=1)[0]
+
+def remove_custom_regularizers(model):
+    '''
+    Helper function to remove custom regularizers (DenseRegularizer & Conv2DRegularizer)
+    This makes it possible to load the model in a different environment without hls4ml installed
+
+    Args:
+        - model (keras.Model): Baseline model
+    '''
+    weights = model.get_weights()
+    for layer in model.layers:
+        if hasattr(layer, 'kernel_regularizer'):
+            if isinstance(layer.kernel_regularizer, (DenseRegularizer, Conv2DRegularizer)):
+                layer.kernel_regularizer = None
+
+    model = tf.keras.models.model_from_json(model.to_json(), custom_objects=co)
+    model.set_weights(weights)
+    return model
diff --git a/test/pytest/optimization/keras/test_regularizers.py b/test/pytest/optimization/keras/test_regularizers.py
index 377d986365..64d199a37e 100644
--- a/test/pytest/optimization/keras/test_regularizers.py
+++ b/test/pytest/optimization/keras/test_regularizers.py
@@ -6,21 +6,20 @@
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Conv2D, Flatten
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.builder import remove_custom_regularizers
 from hls4ml.optimization.keras.regularizers import DenseRegularizer, Conv2DRegularizer
 
 # Constants
 pattern_offset = 4
-block_shape = (4, 4)
 consecutive_patterns = 2
+block_shape = (4, 4)
 
 dense_layers = [Dense, QDense]
 conv2d_layers = [Conv2D, QConv2D]
 
-
 # Sets the loss due to data to zero; train model only on regularization loss
 def zero_loss(y_true, y_pred):
-    return tf.reduce_mean(0 * tf.square(y_true - y_pred), axis=-1)
-
+    return tf.reduce_mean(0 * tf.square(y_true - y_pred), axis=-1) 
 
 # Helper function, calculates the group norm and variance for a single layer
 def get_norm_and_variance(weights, structure_type, layer='dense'):
@@ -28,7 +27,7 @@ def get_norm_and_variance(weights, structure_type, layer='dense'):
         norm = np.linalg.norm(weights.flatten(), ord=1)
         var = np.var(weights)
         return norm, var
-
+    
     if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
         if layer == 'conv2d':
             norm = np.linalg.norm(np.sum(np.linalg.norm(weights, axis=(0, 1), ord='fro'), axis=0), ord=1)
@@ -36,61 +35,54 @@ def get_norm_and_variance(weights, structure_type, layer='dense'):
         else:
             norm = np.linalg.norm(np.linalg.norm(weights, axis=0, ord=2), ord=1)
             var = np.linalg.norm(np.var(weights, axis=0), ord=1)
-
+        
         return norm, var
-
+    
     if structure_type == SUPPORTED_STRUCTURES.PATTERN:
-        if layer == 'dense':
-            weights = weights.T
-        elif layer == 'conv2d':
-            weights = np.transpose(weights, (3, 0, 1, 2))
-
         number_of_patterns = np.prod(weights.shape) // pattern_offset
-        target_shape = (pattern_offset, number_of_patterns)
+        target_shape = (number_of_patterns, pattern_offset)
         reshaped = np.reshape(weights, target_shape)
         total_blocks = pattern_offset // consecutive_patterns
         blocks = np.reshape(
-            tf.image.extract_patches(
-                np.expand_dims(np.expand_dims(reshaped, 2), 0),
-                [1, consecutive_patterns, number_of_patterns, 1],
-                [1, consecutive_patterns, number_of_patterns, 1],
-                [1, 1, 1, 1], 'SAME'
-            ).numpy(),
-            (total_blocks, -1)
-        )
+                            tf.image.extract_patches(
+                                np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                                [1, number_of_patterns, consecutive_patterns, 1],
+                                [1, number_of_patterns, consecutive_patterns, 1], 
+                                [1, 1, 1, 1], 'SAME'
+                            ).numpy(),
+                            (total_blocks, number_of_patterns * consecutive_patterns)
+                        )
         norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
         var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
         return norm, var
 
-    if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+    if structure_type == SUPPORTED_STRUCTURES.BLOCK: 
         total_blocks = (weights.shape[0] * weights.shape[1]) // (block_shape[0] * block_shape[1])
         blocks = np.reshape(
-            tf.image.extract_patches(
-                np.expand_dims(np.expand_dims(weights, 2), 0),
-                [1, block_shape[0], block_shape[1], 1],
-                [1, block_shape[0], block_shape[1], 1],
-                [1, 1, 1, 1], 'SAME'
-            ).numpy(),
-            (total_blocks, block_shape[0] * block_shape[1])
-        )
-
+                            tf.image.extract_patches(
+                                np.expand_dims(np.expand_dims(weights, 2), 0),
+                                [1, block_shape[0], block_shape[1], 1],
+                                [1, block_shape[0], block_shape[1], 1], 
+                                [1, 1, 1, 1], 'SAME'
+                            ).numpy(),
+                            (total_blocks, block_shape[0] * block_shape[1])
+                        )
+        
         norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
         var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
         return norm, var
 
-
 @pytest.mark.parametrize('dense', dense_layers)
-@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED,
-                                            SUPPORTED_STRUCTURES.PATTERN, SUPPORTED_STRUCTURES.BLOCK])
+@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED, SUPPORTED_STRUCTURES.PATTERN, SUPPORTED_STRUCTURES.BLOCK])
 def test_dense_regularizer(structure_type, dense):
     epochs = 10
     data_points = 10
-    input_shape = (32,)
-    output_shape = (16,)
+    input_shape = (32, )
+    output_shape = (16, )
     X = np.random.rand(data_points, *input_shape)
     y = np.random.rand(data_points, *output_shape)
     w = np.random.rand(input_shape[0], output_shape[0])
-
+    
     # First, fit a model without regularization
     model = Sequential()
     model.add(dense(output_shape[0], input_shape=input_shape))
@@ -105,21 +97,18 @@ def test_dense_regularizer(structure_type, dense):
     dense_weights = model.layers[0].get_weights()
     dense_weights[0] = w
     model.layers[0].set_weights(dense_weights)
-    regularizer = DenseRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset,
-                                   block_shape=block_shape)
-    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
+    regularizer = DenseRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset, block_shape=block_shape)
+    model.layers[0].add_loss(lambda layer = model.layers[0] : regularizer(layer.kernel))
     model.compile(loss=zero_loss, optimizer=Adam(1.0))
     model.fit(X, y, epochs=epochs)
     reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type)
 
     # Verify regularization decreased weight magnitude and variance
-    assert (reg_norm < norm)
-    assert (reg_var < var)
-
+    assert(reg_norm < norm)
+    assert(reg_var < var)
 
 @pytest.mark.parametrize('conv2d', conv2d_layers)
-@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED,
-                                            SUPPORTED_STRUCTURES.PATTERN])
+@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED, SUPPORTED_STRUCTURES.PATTERN])
 def test_conv2d_regularizer(structure_type, conv2d):
     epochs = 10
     data_points = 10
@@ -127,7 +116,7 @@ def test_conv2d_regularizer(structure_type, conv2d):
     num_filters = 4
     X = np.random.rand(data_points, *input_shape)
     y = np.random.rand(data_points, 1)
-
+    
     # First, fit a model without regularization
     model = Sequential()
     model.add(conv2d(num_filters, (3, 3), input_shape=input_shape))
@@ -137,15 +126,31 @@ def test_conv2d_regularizer(structure_type, conv2d):
     model.compile(loss=zero_loss, optimizer=Adam())
     model.fit(X, y, epochs=epochs, verbose=True)
     norm, var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
-
+    
     # Now, fit a model with strong regularization, starting with the same initial weights
     model.layers[0].set_weights(conv_weights)
     regularizer = Conv2DRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset)
-    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
+    model.layers[0].add_loss(lambda layer = model.layers[0] : regularizer(layer.kernel))
     model.compile(loss=zero_loss, optimizer=Adam())
     model.fit(X, y, epochs=epochs, verbose=True)
     reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
 
     # Verify regularization decreased weight magnitude and variance
-    assert (reg_norm < norm)
-    assert (reg_var < var)
+    assert(reg_norm < norm)
+    assert(reg_var < var)
+
+def test_removal_of_custom_regularizer():
+    model = Sequential()
+    model.add(Conv2D(8, (3, 3), input_shape=(16, 16, 3), kernel_regularizer=Conv2DRegularizer(1e-3)))
+    model.add(Flatten())
+    model.add(Dense(1, kernel_regularizer=DenseRegularizer(1e-3)))
+    weights = model.get_weights()
+
+    assert(isinstance(model.layers[0].kernel_regularizer, Conv2DRegularizer))
+    assert(isinstance(model.layers[2].kernel_regularizer, DenseRegularizer))
+
+    model = remove_custom_regularizers(model)
+
+    assert(model.layers[0].kernel_regularizer is None)
+    for i in range(len(weights)):
+        assert(np.all(weights[i] == model.get_weights()[i]))

From 47392ba9352fd82386ca608bf129a1161f1caabc Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 18:33:50 +0300
Subject: [PATCH 08/19] Top-level Keras optimization function & GPU FLOPs
 optimization

---
 hls4ml/optimization/keras/__init__.py         | 282 ++++++++++++++++++
 .../optimization/objectives/gpu_objectives.py |  57 ++++
 2 files changed, 339 insertions(+)
 create mode 100644 hls4ml/optimization/objectives/gpu_objectives.py

diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/keras/__init__.py
index e69de29bb2..9f4f348ecb 100644
--- a/hls4ml/optimization/keras/__init__.py
+++ b/hls4ml/optimization/keras/__init__.py
@@ -0,0 +1,282 @@
+import os
+import time
+import numpy as np
+import tensorflow as tf
+import hls4ml.optimization.keras.utils as utils
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.reduction import reduce_model
+from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.scheduler import OptimizationScheduler
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY
+from hls4ml.optimization.keras.builder import build_optimizable_model, remove_custom_regularizers
+
+# Enables printing of loss tensors during custom training loop
+from tensorflow.python.ops.numpy_ops import np_config
+np_config.enable_numpy_behavior()
+
+def optimize_model(
+    model, model_attributes, objective, scheduler, X_train, y_train, X_val, y_val, 
+    batch_size, epochs, optimizer, loss_fn, validation_metric, increasing, rtol,
+    callbacks=[], ranking_metric='l1', local=False, verbose=False, rewinding_epochs=1, cutoff_bad_trials=1,
+    directory=TMP_DIRECTORY, tuner='Bayesian', knapsack_solver='CBC_MIP',
+    regularization_range=np.logspace(-6, -2, num=16).tolist()
+  ):
+    '''
+    Top-level function for optimizing a Keras model, given objectives 
+
+    Args:
+    - model (keras.Model): Model to be optimized
+    - model_attributes (dict): Layer-wise model attributes, obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
+    - objective (hls4ml.optimization.objectives.ObjectiveEstimator): Parameter, hardware or user-defined objective of optimization
+    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler): Sparsity scheduler, choose between constant, polynomial and binary
+    - X_train (np.array): Training inputs
+    - y_train (np.array): Training labels
+    - X_val (np.array): Validation inputs
+    - y_val (np.array): Validation labels
+    - batch_size (int): Batch size during training
+    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+    - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
+    - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
+    - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
+    - increasing (boolean): If the metric improves with increased values; e.g. accuracy -> increasing = True, MSE -> increasing = False
+    - rtol (float): Relative tolerance; pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+    
+    Kwargs:
+    - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
+    - ranking_metric (string): Metric used for rannking weights and structures; currently supported l1, l2, saliency and Oracle
+    - local (boolean): Layer-wise or global pruning
+    - verbose (boolean): Display debug logs during model optimization
+    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing, allows regrowth of previously pruned weights
+    - cutoff_bad_trials (int): After how many bad trials (performance below threshold), should model pruning / weight sharing stop
+    - directory (string): Directory to store temporary results
+    - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
+    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; default usually works well; for very large networks, greedy algorithm might be more suitable
+    - regularization_range (list): List of suitable hyperparameters for weight decay
+    '''
+
+    if not isinstance(scheduler, OptimizationScheduler):
+        raise Exception('Scheduler must be an instance of from hls4ml.optimization.scheduler.OptimizationScheduler' +\
+                        'If you provided string description (e.g. \'constant\'), please use an object instance (i.e. ConstantScheduler())'
+                        'For a full list of supported schedulers and their description, refer to hls4ml.optimization.scheduler.'
+                      )
+
+    if epochs <= rewinding_epochs:
+       raise Exception('Please increase the number of epochs. \
+                       The current epoch number is too small to perform effective pruning & weight rewinding'
+                      )
+  
+    if ranking_metric not in SUPPORTED_METRICS:
+        raise Exception('Unknown metric for ranking weights')
+    
+    # Loss function needs to be converted to a function, string description cannot be used during custom training loop
+    if isinstance(loss_fn, str):
+      loss_fn = tf.keras.losses.get(loss_fn)
+
+    # Split data set into batches
+    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
+    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
+    validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
+    validation_dataset = validation_dataset.shuffle(buffer_size=1024).batch(batch_size)
+
+    # Evaluate baseline performance
+    # Use built-in function, and return as list - the metric is the second element (loss if first)
+    model.compile(optimizer, loss_fn, metrics=[validation_metric])
+    baseline_performance = model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+    if verbose:
+      print('Baseline performance on validation set: {}'.format(baseline_performance))
+
+    # Save best weights
+    # Always save weights to a file, to reduce memory utilization
+    if not os.path.isdir(directory):
+      os.mkdir(directory)
+    if not os.path.isdir(f'{directory}/optimization'):
+      os.mkdir(f'{directory}/optimization')
+    model.save_weights(f'{directory}/optimization/best_weights.h5')
+
+    # Identify optimizable layers, given the current objective
+    last_optimizable_layer = utils.get_last_layer_with_weights(model)
+    for i, layer in enumerate(model.layers):
+        if isinstance(layer, SUPPORTED_LAYERS): 
+          optimizable, optimization_attributes = objective.is_layer_optimizable(model_attributes[layer.name])
+          model_attributes[layer.name].optimizable = optimizable
+          model_attributes[layer.name].optimization_attributes = optimization_attributes
+          
+          # In the last layer, structured pruning can't be applied, as it removes output labels
+          # Weight sharing, as well as all other types of pruning (unstructured, block etc.) are applicable
+          if i >= last_optimizable_layer and optimization_attributes.structure_type == SUPPORTED_STRUCTURES.STRUCTURED and optimization_attributes.pruning:
+            model_attributes[layer.name].optimization_attributes.pruning = False
+            model_attributes[layer.name].optimizable = model_attributes[layer.name].optimization_attributes.weight_sharing
+        else: 
+          model_attributes[layer.name].optimizable = False
+          model_attributes[layer.name].optimization_attributes = None
+        
+    # Add regularization loss to optimizable layers
+    optimizable_model = build_optimizable_model(
+      model, model_attributes, optimizer, loss_fn, validation_metric, 
+      increasing, train_dataset, validation_dataset, batch_size, epochs // 2, 
+      verbose=verbose, directory=directory, tuner=tuner,
+      regularization_range=regularization_range, 
+    )
+
+    # Create class for masked backprop (weight freezing)
+    masked_backprop = MaskedBackprop(optimizable_model, loss_fn, model_attributes)
+
+    # In certain cases, the model might underperform at the current sparsity level, but perform better at a higher sparsity
+    # Therefore, monitor the models performance over several sparsity levels and only stop pruning after high loss over several trials
+    bad_trials = 0  
+    sparsity_conditions = True
+    target_sparsity = scheduler.get_sparsity()
+
+    while sparsity_conditions:
+      # TODO - This might cause OOM issues on large models / data sets, since it is not done in batches
+      gradients = utils.get_model_gradients(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'gradients' else {}
+      hessians = utils.get_model_hessians(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'saliency' else {}
+      
+      # Mask weights
+      masks, offsets = get_model_masks(
+        optimizable_model, model_attributes, target_sparsity, 
+        objective, metric=ranking_metric, local=local, 
+        gradients=gradients, hessians=hessians, knapsack_solver=knapsack_solver
+      )
+      for layer in optimizable_model.layers:
+        if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+          layer_weights = layer.get_weights()
+          layer_weights[0] = np.multiply(layer_weights[0], masks[layer.name]) + offsets[layer.name]
+          layer.set_weights(layer_weights)
+      
+      # Mask gradients
+      # Before training the model at the next sparsity level, reset internal states
+      # Furthemore, modern optimizers (e.g. Adam) accumulate gradients during backprop
+      # Therefore, even if the gradient for a weight is zero, it might be updated, due to previous gradients
+      # Avoid this by resetting the internal variables of an optimizer
+      optimizable_model.reset_metrics()
+      optimizable_model.reset_states()
+      for x in optimizable_model.optimizer.variables():
+        x.assign(tf.zeros_like(x))
+      masked_backprop.update_masks(masks)
+
+      # Train model with weight freezing [pruning]
+      if verbose:
+        print(f'Pruning with a target sparsity of {target_sparsity * 100.0}% [relative to objective]')
+      for epoch in range(epochs - rewinding_epochs):
+        start_time = time.time()
+        epoch_loss_avg = tf.keras.metrics.Mean()
+
+        # Masked backprop
+        for (X, y) in train_dataset:
+          loss_value = masked_backprop(tf.convert_to_tensor(X), tf.convert_to_tensor(y), target_sparsity)
+          epoch_loss_avg.update_state(loss_value)
+        
+        # Evaluate on validation set and print epoch summary
+        if verbose:
+          val_res = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)
+          print(f'Epoch: {epoch + 1} - Time: {time.time() - start_time}s - Average training loss: {round(epoch_loss_avg.result(), 3)}')
+          print(f'Epoch: {epoch + 1} - learning_rate: {optimizable_model.optimizer.learning_rate.numpy()}')
+          print(f'Epoch: {epoch + 1} - Loss on validation set: {val_res[0]} - Performance on validation set: {val_res[1]}')
+
+      # Check if model works after pruning
+      pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+      if verbose:
+        print(f'Optimized model performance on validation set, after fine-tuning: {pruned_performance}')
+
+      if __compare__(pruned_performance, rtol * baseline_performance, not increasing):
+        bad_trials = 0
+        sparsity_conditions, target_sparsity = scheduler.update_step()
+        optimizable_model.save_weights(f'{directory}/optimization/best_weights.h5')
+      else:
+        bad_trials += 1
+        sparsity_conditions, target_sparsity = scheduler.repair_step()
+              
+      # If the model performed poorly over several sparsity levels, stop optimization [maximum sparsity reached]
+      if bad_trials > cutoff_bad_trials:
+        break
+
+      # Train model without weight freezing [rewinding] 
+      if verbose:
+        print(f'Starting weight rewinding for {rewinding_epochs} epochs')
+      optimizable_model.fit(
+        train_dataset, validation_data=validation_dataset,
+        batch_size=batch_size, epochs=rewinding_epochs, 
+        callbacks=callbacks, verbose=verbose
+      )
+
+    # Load best weights 
+    optimizable_model.load_weights(f'{directory}/optimization/best_weights.h5')
+    
+    # Remove regularizers and save best model
+    optimizable_model = remove_custom_regularizers(optimizable_model)
+    optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
+
+    # In GPU FLOP Optimization, remove structures to achieve speed-up & fine-tune the smaller architecture
+    # TODO - Extend for Resource strategy in hls4ml FF optimisation
+    if objective.__name__ in ('GPUFLOPEstimator'):
+      optimizable_model = reduce_model(optimizable_model)
+      optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
+      optimizable_model.fit(
+        train_dataset, validation_data=validation_dataset,
+        batch_size=batch_size, epochs=int(1.5 * epochs), 
+        callbacks=callbacks
+      )
+
+    # Evaluate final optimized model [purely for debugging / informative purposes]
+    if verbose:
+      pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]   
+      print(f'Optimized model performance on validation set: {pruned_performance}')
+
+    return optimizable_model
+
+class MaskedBackprop():
+    '''
+    A helper class to perform masked backprop (training with frozen weights)
+    The important function is __call__ as it masks gradients, based on frozen weights
+    While this function can exist without a class, taking masks as input would deplete memory
+    Since a new graph is created for every call, causing a large run-time
+    The trick is to set the masks, models etc. as class variables and then pass the sparsity
+    As the sparsity changes, a new graph of the function is created
+    '''
+    def __init__(self, model, loss_fn, attributes):
+      self.model = model
+      self.loss_fn = loss_fn
+      self.attributes = attributes
+      self.masks = {}
+
+    def update_masks(self, masks):
+      self.masks = masks
+      
+    @tf.function
+    def __call__(self, X, y, s):
+      '''
+      Helper function performing backprop
+
+      Args:
+          - X (tf.Tensor): Input data
+          - y (tf.Tensor): Output data
+          - s (float): Sparsity
+      
+      Return:
+          - loss (tf.Varilable): Model loss with input X and output y
+      '''
+      grads = []
+      with tf.GradientTape(persistent=True) as tape:
+        output = self.model(X, training=True)
+        loss = self.loss_fn(y, output) 
+        loss += tf.add_n(self.model.losses) 
+        for layer in self.model.layers:
+          if layer.trainable_weights:
+            grad = tape.gradient(loss, layer.trainable_weights)
+            if self.attributes[layer.name].optimizable:
+              grad[0] = tf.multiply(grad[0], self.masks[layer.name])
+            grads += grad
+      self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
+      return loss
+
+def __compare__(x, y, leq=False):
+  '''
+  Helper function for comparing two values, x & y
+  Sometimes, we use the >= sign - e.g. pruned_accuracy >= tolerance * baseline_accuracy [ 0 <= tolerance <= 1]
+  Other times, use the <= sign - e.g. pruned_mse <= tolerance * baseline_mse [tolerance >= 1]
+  '''
+  if leq:
+    return x <= y
+  else:
+    return x >= y
\ No newline at end of file
diff --git a/hls4ml/optimization/objectives/gpu_objectives.py b/hls4ml/optimization/objectives/gpu_objectives.py
new file mode 100644
index 0000000000..0de608865e
--- /dev/null
+++ b/hls4ml/optimization/objectives/gpu_objectives.py
@@ -0,0 +1,57 @@
+import logging
+import numpy as np
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.objectives import ObjectiveEstimator
+from hls4ml.optimization.attributes import OptimizationAttributes
+
+class GPUFLOPEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+        else:
+            return True, OptimizationAttributes(structure_type=SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False)
+    
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            if 'Dense' in layer_attributes.layer_type.__name__:
+                return [2 * np.prod(layer_attributes.weight_shape) + layer_attributes.weight_shape[1]]
+            elif 'Conv2D' in layer_attributes.layer_type.__name__:
+                return [2 * np.prod(layer_attributes.weight_shape) * layer_attributes.output_shape[0] * layer_attributes.output_shape[1] + layer_attributes.weight_shape[3]]
+            else:
+                raise Exception('Unknown layer encountered when estimating FLOP utilization.')
+    
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        
+        structure_type = layer_attributes.optimization_attributes.structure_type 
+        pruning = layer_attributes.optimization_attributes.pruning
+        weight_sharing = layer_attributes.optimization_attributes.weight_sharing
+
+        if weight_sharing:
+            logging.warn('Weight sharing does not decrease FLOPs. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]
+        
+        if not pruning:
+            logging.warn('Pruning needs to be enabled to decrease FLOPs. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]
+
+        # TODO - The below formulas underestimate FLOP savings [removing a filter in a lyer removes channels / neurons in subsequent layers]
+        if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            if 'Dense' in layer_attributes.layer_type.__name__:
+                return [2 * layer_attributes.weight_shape[0] + 1]
+            elif 'Conv2D' in layer_attributes.layer_type.__name__:
+                return [2 * np.prod(layer_attributes.weight_shape[0:3]) * layer_attributes.output_shape[0] * layer_attributes.output_shape[1] + 1]
+            else:
+                raise Exception('Unknown layer encountered when estimating FLOP savings.')
+        else:
+            logging.warn('FLOP savings occur with structured pruning. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]   
\ No newline at end of file

From a778e393e8d31ffea476bdac3c675b4047d60e47 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 20:05:43 +0300
Subject: [PATCH 09/19] hls4ml objectives & top-level optimization function

---
 hls4ml/optimization/__init__.py               |  57 ++++
 .../objectives/vivado_objectives.py           | 256 ++++++++++++++++++
 2 files changed, 313 insertions(+)
 create mode 100644 hls4ml/optimization/objectives/vivado_objectives.py

diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
index e69de29bb2..46caa33dcc 100644
--- a/hls4ml/optimization/__init__.py
+++ b/hls4ml/optimization/__init__.py
@@ -0,0 +1,57 @@
+import numpy as np
+from hls4ml.optimization.keras import optimize_model
+from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
+
+def optimize_keras_for_hls4ml(
+    keras_model, hls_config, objective, scheduler, X_train, y_train, X_val, y_val, 
+    batch_size, epochs, optimizer, loss_fn, validation_metric, increasing, rtol, 
+    callbacks=[], ranking_metric='l1', local=False, verbose=False, rewinding_epochs=1, cutoff_bad_trials=3,
+    directory='hls4ml-optimization', tuner='Bayesian', knapsack_solver='CBC_MIP',
+    regularization_range=np.logspace(-6, -2, num=16).tolist()
+):
+    '''
+    Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s)
+
+    Args:
+    - keras_model (keras.Model): Model to be optimized
+    - hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...)
+    - objective (hls4ml.optimization.objectives.ObjectiveEstimator): Parameter, hardware or user-defined objective of optimization
+    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler): Sparsity scheduler, choose between constant, polynomial and binary
+    - X_train (np.array): Training inputs
+    - y_train (np.array): Training labels
+    - X_val (np.array): Validation inputs
+    - y_val (np.array): Validation labels
+    - batch_size (int): Batch size during training
+    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+    - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
+    - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
+    - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
+    - increasing (boolean): If the metric improves with increased values; e.g. accuracy -> increasing = True, MSE -> increasing = False
+    - rtol (float): Relative tolerance; pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+    
+    Kwargs:
+    - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
+    - ranking_metric (string): Metric used for rannking weights and structures; currently supported l1, l2, saliency and Oracle
+    - local (boolean): Layer-wise or global pruning
+    - verbose (boolean): Display debug logs during model optimization
+    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing, allows regrowth of previously pruned weights
+    - cutoff_bad_trials (int): After how many bad trials (performance below threshold), should model pruning / weight sharing stop
+    - directory (string): Directory to store temporary results
+    - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
+    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; default usually works well; for very large networks, greedy algorithm might be more suitable
+    - regularization_range (list): List of suitable hyperparameters for weight decay
+    '''
+
+    # Extract model attributes
+    model_attributes = get_attributes_from_keras_model_and_hls4ml_config(keras_model, hls_config)
+
+    # Optimize model
+    return optimize_model(
+        keras_model, model_attributes, objective, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs, 
+        optimizer, loss_fn, validation_metric, increasing, rtol, 
+        callbacks=callbacks, ranking_metric=ranking_metric, local=local, verbose=verbose, 
+        rewinding_epochs=rewinding_epochs, cutoff_bad_trials=cutoff_bad_trials,
+        directory=directory, tuner=tuner, knapsack_solver=knapsack_solver,
+        regularization_range=regularization_range
+    )
diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/objectives/vivado_objectives.py
new file mode 100644
index 0000000000..07a1ad0352
--- /dev/null
+++ b/hls4ml/optimization/objectives/vivado_objectives.py
@@ -0,0 +1,256 @@
+import math
+import logging 
+import numpy as np
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.objectives import ObjectiveEstimator
+from hls4ml.optimization.attributes import OptimizationAttributes
+
+# Optimizes DSP utilisation for Vivado backend
+class VivadoDSPEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
+            return False, None
+        else:
+            if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+                return True, OptimizationAttributes(
+                            SUPPORTED_STRUCTURES.PATTERN, pruning = True, weight_sharing = False,
+                            pattern_offset=np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor, 
+                            consecutive_patterns=1
+                        )
+            else:
+                return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
+    
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
+            return [0]
+        else:
+            # TOOD - Extend for parallelisation factor
+            return [np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor]
+    
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
+            return [0]
+
+        # TODO - Once we know how to implement constant coefficient multiplication via LUT, enable for weight sharing
+        pruning = layer_attributes.optimization_attributes.pruning
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of DSPs used. \
+                It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+        
+        structure_type = layer_attributes.optimization_attributes.structure_type 
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                return [1]
+            else:
+                return [0]
+        else:
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                    return [1]
+                else:
+                    return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                if layer_attributes.args['hls4ml_attributes'].reuse_factor == layer_attributes.args['hls4ml_attributes'].n_in:
+                    return [1]
+                else:
+                    return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = layer_attributes.optimization_attributes.pattern_offset
+                number_of_patterns = np.prod(layer_attributes.weight_shape)// pattern_offset
+
+                if number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor:
+                    return [layer_attributes.optimization_attributes.consecutive_patterns]
+                else:
+                    return [0]  
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                logging.warn('hls4ml does not support block sparsity patterns...setting layer savings to zero')
+                return [0]
+
+# Optimizes BRAM and DSP for Vivado backend     
+class VivadoMultiObjectiveEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+        
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource' and layer_attributes.args['hls4ml_attributes'].reuse_factor > 1 :
+            if 36 % layer_attributes.args['hls4ml_attributes'].weight_precision.width == 0:
+                consecutive_patterns = int(36 // layer_attributes.args['hls4ml_attributes'].weight_precision.width)
+            else:
+                consecutive_patterns = int(math.ceil(2 * 36 / layer_attributes.args['hls4ml_attributes'].weight_precision.width))
+            
+            return True, OptimizationAttributes(
+                SUPPORTED_STRUCTURES.PATTERN, pruning = True, weight_sharing = False,
+                    pattern_offset=int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor), 
+                    consecutive_patterns=consecutive_patterns
+                )
+        else:
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
+    
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            # TOOD - Extend for parallelisation factor
+            # For RF > 1, BRAM utilised by weights can be estimated by (bit_width * n_in * n_out) / (RF * 36)
+            return [
+                int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
+                int(math.ceil(np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width / (layer_attributes.args['hls4ml_attributes'].reuse_factor * 36))),
+            ]
+    
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        
+        # TODO - Once we know how to implement constant coefficient multiplication via LUT, enable for weight sharing
+        pruning = layer_attributes.optimization_attributes.pruning
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of DSPs used. \
+                It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+        
+        structure_type = layer_attributes.optimization_attributes.structure_type 
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                    return [1]
+            else:
+                    return [0]
+        else:
+            if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource' and layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                return [1, 0]
+            else:
+                if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                    pattern_offset = layer_attributes.optimization_attributes.pattern_offset
+                    consecutive_patterns = layer_attributes.optimization_attributes.consecutive_patterns
+                    weight_precision = layer_attributes.args['hls4ml_attributes'].weight_precision.width
+
+                    number_of_patterns = np.prod(layer_attributes.weight_shape) // pattern_offset
+                    saved_one_bram_block = \
+                        (36 == consecutive_patterns * weight_precision and 36 % weight_precision == 0) \
+                        or (72 == consecutive_patterns * weight_precision)
+
+                    if number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor and saved_one_bram_block:
+                        return [consecutive_patterns, 1]
+                    else:
+                        logging.warn('Support for multi-objective optimisation is not fully implemented yet....')
+                        return [0, 0]  
+                else:
+                    logging.warn('Support for multi-objective optimisation is not fully implemented yet....')
+                    return [0, 0]
+                
+class VivadoFFEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+        
+        # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; skipping
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+            logging.warn('FFs are at minimum utilization with io_stream and Resource strategy')
+            return False, None
+
+        # With io_stream in Latency, weight are stored in FFs, so unstructured pruning will benefit the most
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
+
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so structured pruning is the most suitable, it reduces the size out output before compile-time
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.STRUCTURED, pruning = True, weight_sharing = False)
+
+        # In io_parallel with Latency, weights and activation tensors are all stored in FFs, so it is equivalent to unstructured, high sparsity pruning
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
+    
+    # TODO - This method is inaccurate (accross all cases); in general, estimating FFs is hard, but as long as it is consistent(ly wrong), it should not matter for the pruning
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        
+        # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; minimal FF utilization
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+            return [0]
+
+        # With io_stream in Latency, weight are stored in FFs, so FF ~ number_of_weights x weight_precision
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            return [np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width ]
+
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so FF ~ number_of_outputs x weight_precision
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+            return [np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width ]
+    
+        # In io_parallel with Latency, weights and latency are all stored in FFs, so it is equivalent to the sum of the above two cases
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            return [
+                np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width +
+                np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width
+            ]
+    
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        
+        structure_type = layer_attributes.optimization_attributes.structure_type 
+        pruning = layer_attributes.optimization_attributes.pruning
+        weight_sharing = layer_attributes.optimization_attributes.weight_sharing
+        
+        if weight_sharing:
+            logging.warn('Weight sharing does not decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]
+        
+        if not pruning:
+            logging.warn('Pruning needs to be enabled to decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            return [0]
+
+        # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; minimal FF utilization
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+            return [0]
+
+        # With io_stream in Latency, weight are stored in FFs, so any type of pruning will help:
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].n_in * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                number_of_patterns = np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                return [np.prod(layer_attributes.optimization_attributes.block_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so only structured pruning helps
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].output_precision.width]
+            else:
+                return [0]
+            
+        # In io_parallel with Latency, weights and latency are all stored in FFs, so any type of pruning helps
+        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            # This is a significant under-estimate, as some savings are incurred due to less intermediate results
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                print('here')
+                return [
+                    layer_attributes.args['hls4ml_attributes'].n_in * layer_attributes.args['hls4ml_attributes'].weight_precision.width +
+                    layer_attributes.args['hls4ml_attributes'].output_precision.width
+                    ]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                number_of_patterns = np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                return [np.prod(layer_attributes.optimization_attributes.block_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width]

From 7cd25a016ff28a673cca37b4310deaf9df331d69 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 14 Apr 2023 20:38:22 +0300
Subject: [PATCH 10/19] Add docs for hls4ml Optimization API

---
 docs/advanced/model_optimization.rst | 120 +++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 docs/advanced/model_optimization.rst

diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
new file mode 100644
index 0000000000..75367c3088
--- /dev/null
+++ b/docs/advanced/model_optimization.rst
@@ -0,0 +1,120 @@
+========================
+hls4ml Optimization API
+========================
+
+Pruning and weight sharing are effective techniques to reduce model footprint and computational requirements. The hls4ml Optimization API introduces hardware-aware pruning and weight sharing.
+By defining custom objectives, the algorithm solves a Knapsack optimization problem aimed at maximizing model performance, while keeping the target resource(s) at a minimum. Out-of-the box objectives include network sparsity, GPU FLOPs, Vivado DSPs, memory utilization etc. 
+
+The code block below showcases three use cases of the hls4ml Optimization API - network sparsity (unstructured pruning), GPU FLOPs (structured pruning) and Vivado DSP utilization (pattern pruning). First, we start with unstructured pruning:
+
+.. code-block:: Python
+    from sklearn.metrics import accuracy_score
+    from tensorflow.keras.optimizers import Adam
+    from tensorflow.keras.metrics import CategoricalAccuracy
+    from tensorflow.keras.losses import CategoricalCrossentropy
+    from hls4ml.optimization.keras import optimize_model
+    from hls4ml.optimization.keras.utils import get_model_sparsity
+    from hls4ml.optimization.attributes import get_attributes_from_keras_model
+    from hls4ml.optimization.objectives import ParameterEstimator
+    from hls4ml.optimization.scheduler import PolynomialScheduler
+    # Define baseline model and load data
+    # X_train, y_train = ...
+    # X_val, y_val = ...
+    # X_test, y_test = ...
+    # baseline_model = ...
+    # Evaluate baseline model
+    y_baseline = baseline_model.predict(X_test)
+    acc_base = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_baseline, axis=1))
+    sparsity, layers = get_model_sparsity(baseline_model)
+    print(f'Baseline Keras accuracy: {acc_base}')
+    print(f'Baseline Keras sparsity, overall: {sparsity}')
+    print(f'Baseline Keras sparsity, per-layer: {layers}')
+    # Defining training parameters
+    # Epochs refers to the number of maximum epochs to train a model, after imposing some sparsity
+    # If the model is pre-trained, a good rule of thumb is to use between a 1/3 and 1/2 of the number of epochs used to train baseline model
+    epochs = 10
+    batch_size = 128
+    metric = 'accuracy'
+    optimizer = Adam()
+    loss_fn = CategoricalCrossentropy(from_logits=True)
+
+    # Define the metric to monitor, as well as if its increasing or decreasing
+    # This disctinction allows us to optimize both regression and classification models
+    # In regression, e.g. minimize validation MSE & for classification e.g. maximize accuracy
+    metric, increasing = CategoricalAccuracy(), True
+    # Relative tolerance (rtol) is the the relative loss in metric the optimized model is allowed to incur
+    rtol = 0.975
+
+    # A scheduler defines how the sparsity is incremented at each step
+    # In this case, the maximum sparsity is 50% and it will be applied at a polynomially decreasing rate, for 10 steps
+    # If the final sparsity is unspecified, it is set to 100%
+    # The optimization algorithm stops either when (i) the relative drop in performance is below threshold or (ii) final sparsity reached
+    scheduler = PolynomialScheduler(5, final_sparsity=0.5)
+    # Get model attributes
+    model_attributes = get_attributes_from_keras_model(baseline_model)
+
+    # Optimize model
+    # ParameterEstimator is the objective and, in this case, the objective is to minimize the total number of parameters
+    optimized_model = optimize_model(
+        baseline_model, model_attributes, ParameterEstimator, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
+    )
+    # Evaluate optimized model
+    y_optimized = optimized_model.predict(X_test)
+    acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
+    sparsity, layers = get_model_sparsity(optimized_model)
+    print(f'Optimized Keras accuracy: {acc_optimized}')
+    print(f'Optimized Keras sparsity, overall: {sparsity}')
+    print(f'Opimized Keras sparsity, per-layer: {layers}')
+
+In a similar manner, it is possible to target GPU FLOPs or Vivado DSPs. However, in that case, sparsity is not equivalent to model sparsity.
+Instead, it is the sparsity of the target resource. As an example: Starting with a network utilizing 512 DSPs and a final sparsity of 50%; the optimized network will use 256 DSPs.
+
+To optimize GPU FLOPs, the code is similar to above:
+.. code-block:: Python
+    from hls4ml.optimization.objectives.gpu_objectives import GPUFLOPEstimator
+
+    # Optimize model
+    # Note the change from ParameterEstimator to GPUFLOPEstimator
+    optimized_model = optimize_model(
+        baseline_model, model_attributes, GPUFLOPEstimator, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
+    )
+    # Evaluate optimized model
+    y_optimized = optimized_model.predict(X_test)
+    acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
+    print(f'Optimized Keras accuracy: {acc_optimized}')
+    # Note the difference in total number of parameters
+    # Optimizing GPU FLOPs is equivalent to removing entire structures (filters, neurons) from the network
+    print(baseline_model.summary())
+    print(optimized_model.summary())
+
+Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
+.. code-block:: Python
+    from hls4ml.utils.config import config_from_keras_model
+    from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator
+
+    # Note the change from optimize_model to optimize_keras_for_hls4ml
+    # The function optimize_keras_for_hls4ml acts as a wrapper for the function, parsing hls4ml config to model attributes
+    from hls4ml.optimization import optimize_keras_for_hls4ml
+
+    # Create hls4ml config
+    default_reuse_factor = 4
+    default_precision = 'ac_fixed<16, 6>'
+    hls_config = config_from_keras_model(baseline_model, granularity='name', default_precision=default_precision, default_reuse_factor=default_reuse_factor)
+    hls_config['IOType'] = 'io_parallel'
+     hls_config['Model']['Strategy'] = 'Resource'   # Strategy must be present for optimisation
+
+    # Optimize model
+    # Note the change from ParameterEstimator to VivadoDSPEstimator
+    optimized_model = optimize_keras_for_hls4ml(
+        baseline_model, model_attributes, VivadoDSPEstimator, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
+    )
+
+There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation.
+Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config:
+.. code-block:: Python
+    hls_config = config_from_keras_model(optimized_model)     
+    hls_config['Model']['DenseResourceImplementation'] = 'Unrolled'
+    # Any addition hls4ml config, such as strategy, reuse factor etc...

From f792ea6acb8cc6026f853abbd26e671373021a99 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 16 Jun 2023 11:51:45 +0100
Subject: [PATCH 11/19] Full support for multi-objective Vivado optimisation

---
 .../objectives/vivado_objectives.py           | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/objectives/vivado_objectives.py
index 07a1ad0352..7589ffed26 100644
--- a/hls4ml/optimization/objectives/vivado_objectives.py
+++ b/hls4ml/optimization/objectives/vivado_objectives.py
@@ -97,13 +97,26 @@ def is_layer_optimizable(self, layer_attributes):
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
-        else:
-            # TOOD - Extend for parallelisation factor
-            # For RF > 1, BRAM utilised by weights can be estimated by (bit_width * n_in * n_out) / (RF * 36)
+
+        # TOOD - Extend for parallelisation factor
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
             return [
                 int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
-                int(math.ceil(np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width / (layer_attributes.args['hls4ml_attributes'].reuse_factor * 36))),
+                0,
             ]
+        else:
+            # Resource strategy, RF == 1 is similar to Latency strategy (but slower)
+            if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                return [
+                    int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
+                    0,
+                ]
+            else:
+                # For RF > 1, BRAM utilised by weights can be estimated by (bit_width * n_in * n_out) / (RF * 36)
+                return [
+                    int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
+                    int(math.ceil(np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width / (layer_attributes.args['hls4ml_attributes'].reuse_factor * 36))),
+                ]
     
     @classmethod
     def layer_savings(self, layer_attributes):
@@ -122,9 +135,9 @@ def layer_savings(self, layer_attributes):
         structure_type = layer_attributes.optimization_attributes.structure_type 
         if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
             if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
-                    return [1]
+                    return [1, 0]
             else:
-                    return [0]
+                    return [0, 0]
         else:
             if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource' and layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
                 return [1, 0]
@@ -244,7 +257,6 @@ def layer_savings(self, layer_attributes):
             if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
                 return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
             elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
-                print('here')
                 return [
                     layer_attributes.args['hls4ml_attributes'].n_in * layer_attributes.args['hls4ml_attributes'].weight_precision.width +
                     layer_attributes.args['hls4ml_attributes'].output_precision.width

From 82779ff7a1da60bc54a7891e6eaf88428096e905 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 16 Jun 2023 12:10:20 +0100
Subject: [PATCH 12/19] part of pre-commit of Optimization API pt.1

---
 hls4ml/optimization/__init__.py               |  68 ++-
 hls4ml/optimization/attributes.py             | 136 +++---
 hls4ml/optimization/config.py                 |  10 +-
 hls4ml/optimization/keras/__init__.py         | 396 ++++++++++--------
 hls4ml/optimization/keras/builder.py          | 147 +++++--
 hls4ml/optimization/keras/config.py           |   4 +-
 hls4ml/optimization/keras/masking.py          | 161 ++++---
 hls4ml/optimization/keras/reduction.py        |  25 +-
 hls4ml/optimization/keras/regularizers.py     |  57 ++-
 hls4ml/optimization/keras/utils.py            |  40 +-
 hls4ml/optimization/knapsack.py               |  29 +-
 hls4ml/optimization/objectives/__init__.py    |  66 +--
 .../optimization/objectives/gpu_objectives.py |  55 ++-
 .../objectives/vivado_objectives.py           | 262 ++++++++----
 hls4ml/optimization/scheduler.py              |  43 +-
 .../pytest/optimization/keras/test_masking.py | 185 ++++----
 .../optimization/keras/test_reduction.py      |  89 ++--
 .../optimization/keras/test_regularizers.py   | 110 +++--
 .../optimization/keras/test_weight_sharing.py |  56 ++-
 test/pytest/optimization/test_attributes.py   | 111 ++---
 test/pytest/optimization/test_knapsack.py     |  31 +-
 test/pytest/optimization/test_objectives.py   |  41 +-
 test/pytest/optimization/test_scheduler.py    |  22 +-
 23 files changed, 1318 insertions(+), 826 deletions(-)

diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
index 46caa33dcc..4f18998c1c 100644
--- a/hls4ml/optimization/__init__.py
+++ b/hls4ml/optimization/__init__.py
@@ -1,13 +1,35 @@
 import numpy as np
-from hls4ml.optimization.keras import optimize_model
+
 from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
+from hls4ml.optimization.keras import optimize_model
+
 
 def optimize_keras_for_hls4ml(
-    keras_model, hls_config, objective, scheduler, X_train, y_train, X_val, y_val, 
-    batch_size, epochs, optimizer, loss_fn, validation_metric, increasing, rtol, 
-    callbacks=[], ranking_metric='l1', local=False, verbose=False, rewinding_epochs=1, cutoff_bad_trials=3,
-    directory='hls4ml-optimization', tuner='Bayesian', knapsack_solver='CBC_MIP',
-    regularization_range=np.logspace(-6, -2, num=16).tolist()
+    keras_model,
+    hls_config,
+    objective,
+    scheduler,
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    batch_size,
+    epochs,
+    optimizer,
+    loss_fn,
+    validation_metric,
+    increasing,
+    rtol,
+    callbacks=[],
+    ranking_metric='l1',
+    local=False,
+    verbose=False,
+    rewinding_epochs=1,
+    cutoff_bad_trials=3,
+    directory='hls4ml-optimization',
+    tuner='Bayesian',
+    knapsack_solver='CBC_MIP',
+    regularization_range=np.logspace(-6, -2, num=16).tolist(),
 ):
     '''
     Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s)
@@ -28,7 +50,7 @@ def optimize_keras_for_hls4ml(
     - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
     - increasing (boolean): If the metric improves with increased values; e.g. accuracy -> increasing = True, MSE -> increasing = False
     - rtol (float): Relative tolerance; pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
-    
+
     Kwargs:
     - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
     - ranking_metric (string): Metric used for rannking weights and structures; currently supported l1, l2, saliency and Oracle
@@ -47,11 +69,29 @@ def optimize_keras_for_hls4ml(
 
     # Optimize model
     return optimize_model(
-        keras_model, model_attributes, objective, scheduler,
-        X_train, y_train, X_val, y_val, batch_size, epochs, 
-        optimizer, loss_fn, validation_metric, increasing, rtol, 
-        callbacks=callbacks, ranking_metric=ranking_metric, local=local, verbose=verbose, 
-        rewinding_epochs=rewinding_epochs, cutoff_bad_trials=cutoff_bad_trials,
-        directory=directory, tuner=tuner, knapsack_solver=knapsack_solver,
-        regularization_range=regularization_range
+        keras_model,
+        model_attributes,
+        objective,
+        scheduler,
+        X_train,
+        y_train,
+        X_val,
+        y_val,
+        batch_size,
+        epochs,
+        optimizer,
+        loss_fn,
+        validation_metric,
+        increasing,
+        rtol,
+        callbacks=callbacks,
+        ranking_metric=ranking_metric,
+        local=local,
+        verbose=verbose,
+        rewinding_epochs=rewinding_epochs,
+        cutoff_bad_trials=cutoff_bad_trials,
+        directory=directory,
+        tuner=tuner,
+        knapsack_solver=knapsack_solver,
+        regularization_range=regularization_range,
     )
diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/attributes.py
index d19572163b..02154931a5 100644
--- a/hls4ml/optimization/attributes.py
+++ b/hls4ml/optimization/attributes.py
@@ -1,8 +1,10 @@
-import hls4ml
 import numpy as np
+
+import hls4ml
+from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.keras.config import SUPPORTED_LAYERS
-from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType
+
 
 class hls4mlAttributes:
     '''
@@ -18,19 +20,22 @@ class hls4mlAttributes:
     - reuse_factor (int): Layer reuse factor
     - parallelization_factor (int): Layer parallelization factor - [applicable to io_parallel Conv2D]
     '''
-    def __init__(self, n_in, n_out, io_type, strategy, weight_precision, output_precision, reuse_factor, parallelization_factor=1):    
+
+    def __init__(
+        self, n_in, n_out, io_type, strategy, weight_precision, output_precision, reuse_factor, parallelization_factor=1
+    ):
         if not isinstance(weight_precision, (FixedPrecisionType, IntegerPrecisionType)):
             raise Exception('Layer weight precision is not in valid format')
-    
+
         if not isinstance(output_precision, (FixedPrecisionType, IntegerPrecisionType)):
             raise Exception('Layer weight precision is not in valid format')
 
         if not strategy in ('Latency', 'latency', 'Resource', 'resource'):
             raise Exception('Unknown layer strategy')
-        
+
         if not io_type in ('io_parallel', 'io_stream'):
             raise Exception('Unknown IO type')
-        
+
         self.n_in = n_in
         self.n_out = n_out
         self.io_type = io_type
@@ -40,26 +45,36 @@ def __init__(self, n_in, n_out, io_type, strategy, weight_precision, output_prec
         self.reuse_factor = reuse_factor
         self.parallelization_factor = parallelization_factor
 
+
 class OptimizationAttributes:
     '''
     A class for storing layer optimization attributes
-    
+
     Args:
         - structure_type (enum): Targeted structure - unstructured, structured, pattern, block
         - pruning (boolean): Should pruning be applied to the layer
         - weight_sharing (boolean): Should weight sharing be applied to the layer
         - block_shape (tuple): Block shape if structure_type == block
-        - pattern_offset (int): Length of each pattern if structure_type == pattern 
-        - consecutive_patterns (int): How many consecutive patterns are grouped together if structure_type == pattern 
-    
+        - pattern_offset (int): Length of each pattern if structure_type == pattern
+        - consecutive_patterns (int): How many consecutive patterns are grouped together if structure_type == pattern
+
     Notes:
         - In the case of hls4ml, pattern_offset is equivalent to the number of weights processed in parallel
         - The pattern_offset is n_in * n_out / reuse_factor; default case (=1) is equivalent to no unrolling
     '''
-    def __init__(self, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=False, weight_sharing=False, block_shape=(1, 1), pattern_offset=1, consecutive_patterns=1):
+
+    def __init__(
+        self,
+        structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        pruning=False,
+        weight_sharing=False,
+        block_shape=(1, 1),
+        pattern_offset=1,
+        consecutive_patterns=1,
+    ):
         if not isinstance(structure_type, SUPPORTED_STRUCTURES):
             raise Exception(f'{self.__class__.__name__} unknown structure type')
-    
+
         self.structure_type = structure_type
         self.pruning = pruning
         self.weight_sharing = weight_sharing
@@ -67,10 +82,11 @@ def __init__(self, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=Fal
         self.pattern_offset = pattern_offset
         self.consecutive_patterns = consecutive_patterns
 
+
 class LayerAttributes:
     '''
     A class for storing layer information
-    
+
     Args:
         - name (string): Layer name
         - layer_type (keras.Layer): Layer type (e.g. Dense, Conv2D etc.)
@@ -82,7 +98,19 @@ class LayerAttributes:
         - optimization_attributes (OptimizationAttributes): Type of optimization, optimization vs weight sharing, block shape and pattern offset
         - args (dict): Additional information, e.g. hls4mlAttributes; dictionary so it can be generic enough for different platforms
     '''
-    def __init__(self, name, layer_type, inbound_layers, weight_shape, input_shape,output_shape,optimizable, optimization_attributes, args):
+
+    def __init__(
+        self,
+        name,
+        layer_type,
+        inbound_layers,
+        weight_shape,
+        input_shape,
+        output_shape,
+        optimizable,
+        optimization_attributes,
+        args,
+    ):
         self.name = name
         self.layer_type = layer_type
         self.inbound_layers = inbound_layers
@@ -90,69 +118,79 @@ def __init__(self, name, layer_type, inbound_layers, weight_shape, input_shape,o
         self.input_shape = input_shape
         self.output_shape = output_shape
         self.optimizable = optimizable
-        self.optimization_attributes = optimization_attributes 
+        self.optimization_attributes = optimization_attributes
         self.args = args
 
     def update_args(self, updates):
         self.args.update(updates)
 
     def __str__(self):
-        return  f'name: {self.name}, ' \
-                f'layer_type: {self.layer_type}, ' \
-                f'inbound_layers: {self.inbound_layers}, ' \
-                f'weight_shape: {self.weight_shape}, ' \
-                f'input_shape: {self.input_shape}, ' \
-                f'output_shape: {self.output_shape}, ' \
-                f'optimizable: {self.optimizable}, ' \
-                f'optimization_attributes: {self.optimization_attributes}, ' \
-                f'args: {self.args}, ' \
+        return (
+            f'name: {self.name}, '
+            f'layer_type: {self.layer_type}, '
+            f'inbound_layers: {self.inbound_layers}, '
+            f'weight_shape: {self.weight_shape}, '
+            f'input_shape: {self.input_shape}, '
+            f'output_shape: {self.output_shape}, '
+            f'optimizable: {self.optimizable}, '
+            f'optimization_attributes: {self.optimization_attributes}, '
+            f'args: {self.args}, '
+        )
+
 
 def get_attributes_from_keras_model(model):
     '''
     Given a Keras model, builds a dictionary of class attributes
     Additional arguments (e.g. reuse factor), depend on the target hardware platform and are inserted later
     Per-layer pruning sype (structured, pattern etc.), depend on the pruning objective and are inserted later
-    
+
     Args:
         - model (keras.model): Model to extract attributes from
 
     Return:
-        - model_attributes (dict): Each key corresponds to a layer name, values are instances of LayerAttribute 
+        - model_attributes (dict): Each key corresponds to a layer name, values are instances of LayerAttribute
     '''
-    is_sequential =  model.__class__.__name__ == 'Sequential'
+    is_sequential = model.__class__.__name__ == 'Sequential'
     model_attributes = {}
 
     for i, layer in enumerate(model.layers):
         inbound_layers = []
         if is_sequential and i > 0:
-            inbound_layers.append(model.layers[i-1])
+            inbound_layers.append(model.layers[i - 1])
         elif not is_sequential:
             nodes = model.get_config()['layers'][i]['inbound_nodes']
             if len(nodes) > 0:
                 inbound_layers.append(node[0] for node in nodes[0])
-        
+
         layer_weights = layer.get_weights()
         weight_shape = layer_weights[0].shape if len(layer_weights) > 0 else ()
 
         model_attributes[layer.name] = LayerAttributes(
-            layer.name, layer.__class__, inbound_layers,
-            weight_shape, layer.input_shape[1:], layer.output_shape[1:],
-            False, OptimizationAttributes(), {}
+            layer.name,
+            layer.__class__,
+            inbound_layers,
+            weight_shape,
+            layer.input_shape[1:],
+            layer.output_shape[1:],
+            False,
+            OptimizationAttributes(),
+            {},
         )
-    
+
     return model_attributes
 
+
 def get_attributes_from_keras_model_and_hls4ml_config(model, config):
     '''
     Given a Keras model and hls4ml configuration, builds a dictionary of class attributes
     Per-layer pruning sype (structured, pruning etc.), depend on the pruning objective and are inserted later
-    
+
     Args:
         - model (keras.model): Model to extract attributes from
-        - config (dict): hls4ml dictionary 
-    
+        - config (dict): hls4ml dictionary
+
     Return:
-        - model_attributes (dict): Each key corresponds to a layer name, values are LayerAttribute instances 
+        - model_attributes (dict): Each key corresponds to a layer name, values are LayerAttribute instances
     '''
 
     # Extract Keras attributes
@@ -163,30 +201,32 @@ def get_attributes_from_keras_model_and_hls4ml_config(model, config):
     default_reuse_factor = config['Model']['ReuseFactor']
     default_strategy = config['Model']['Strategy']
     default_precision = config['Model']['Precision']
-    
+
     # Build dictionary
     for layer in model_attributes:
         if model_attributes[layer].layer_type in SUPPORTED_LAYERS:
             n_in, n_out = __get_layer_mult_size(model_attributes[layer])
             layer_config = config['LayerName'][layer] if layer in config['LayerName'] else {}
-            reuse_factor = layer_config['ReuseFactor'] if 'ReuseFactor' in layer_config else default_reuse_factor 
-            parallelization_factor =  layer_config['ParallelizationFactor'] if 'ParallelizationFactor' in layer_config else 1
+            reuse_factor = layer_config['ReuseFactor'] if 'ReuseFactor' in layer_config else default_reuse_factor
+            parallelization_factor = layer_config['ParallelizationFactor'] if 'ParallelizationFactor' in layer_config else 1
             strategy = layer_config['Strategy'] if 'Strategy' in layer_config else default_strategy
-            weight_precision = layer_config['Precision']['weight'] if 'weight' in layer_config['Precision'] else default_precision
+            weight_precision = (
+                layer_config['Precision']['weight'] if 'weight' in layer_config['Precision'] else default_precision
+            )
             weight_precision = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(weight_precision)
-            output_precision = layer_config['Precision']['result'] if 'result' in layer_config['Precision'] else default_precision
+            output_precision = (
+                layer_config['Precision']['result'] if 'result' in layer_config['Precision'] else default_precision
+            )
             output_precision = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(output_precision)
-        
+
             hls4ml_attributes = hls4mlAttributes(
-                n_in, n_out,
-                io_type, strategy,
-                weight_precision, output_precision,
-                reuse_factor, parallelization_factor
+                n_in, n_out, io_type, strategy, weight_precision, output_precision, reuse_factor, parallelization_factor
             )
             model_attributes[layer].update_args({'hls4ml_attributes': hls4ml_attributes})
-    
+
     return model_attributes
 
+
 def __get_layer_mult_size(attributes):
     '''
     Helper function to calculate layer multiplication size
diff --git a/hls4ml/optimization/config.py b/hls4ml/optimization/config.py
index fb4e9c4de0..90fb4acceb 100644
--- a/hls4ml/optimization/config.py
+++ b/hls4ml/optimization/config.py
@@ -16,7 +16,7 @@
     - Description: Zeroes out or quantizes all the weights in a structure:
         - Dense: Neurons, determined by their outgoing connections (columns in Keras weight tensors)
         - Conv2D: Filters (structures of size filt_width x filt_height x n_chan)
-        - Notes: 
+        - Notes:
             - For Dense, it was also possible optimize by incoming connections (rows); however, removing zero neurons becomes harder
             - For Conv2D, significant literature explored pruning channels; currently not supported
     - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
@@ -36,8 +36,10 @@
     - Supports: All rank-2 (e.g. Dense, but not Conv2D) layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
 
 '''
+
+
 class SUPPORTED_STRUCTURES(Enum):
     UNSTRUCTURED = 'unstructured'
-    STRUCTURED   = 'structured'
-    PATTERN      = 'pattern'
-    BLOCK        = 'block' 
+    STRUCTURED = 'structured'
+    PATTERN = 'pattern'
+    BLOCK = 'block'
diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/keras/__init__.py
index 9f4f348ecb..c9088d50b5 100644
--- a/hls4ml/optimization/keras/__init__.py
+++ b/hls4ml/optimization/keras/__init__.py
@@ -1,28 +1,52 @@
 import os
 import time
+
 import numpy as np
 import tensorflow as tf
+
+# Enables printing of loss tensors during custom training loop
+from tensorflow.python.ops.numpy_ops import np_config
+
 import hls4ml.optimization.keras.utils as utils
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
-from hls4ml.optimization.keras.reduction import reduce_model
+from hls4ml.optimization.keras.builder import build_optimizable_model, remove_custom_regularizers
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY
 from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.keras.reduction import reduce_model
 from hls4ml.optimization.scheduler import OptimizationScheduler
-from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY
-from hls4ml.optimization.keras.builder import build_optimizable_model, remove_custom_regularizers
 
-# Enables printing of loss tensors during custom training loop
-from tensorflow.python.ops.numpy_ops import np_config
 np_config.enable_numpy_behavior()
 
+
 def optimize_model(
-    model, model_attributes, objective, scheduler, X_train, y_train, X_val, y_val, 
-    batch_size, epochs, optimizer, loss_fn, validation_metric, increasing, rtol,
-    callbacks=[], ranking_metric='l1', local=False, verbose=False, rewinding_epochs=1, cutoff_bad_trials=1,
-    directory=TMP_DIRECTORY, tuner='Bayesian', knapsack_solver='CBC_MIP',
-    regularization_range=np.logspace(-6, -2, num=16).tolist()
-  ):
+    model,
+    model_attributes,
+    objective,
+    scheduler,
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    batch_size,
+    epochs,
+    optimizer,
+    loss_fn,
+    validation_metric,
+    increasing,
+    rtol,
+    callbacks=[],
+    ranking_metric='l1',
+    local=False,
+    verbose=False,
+    rewinding_epochs=1,
+    cutoff_bad_trials=1,
+    directory=TMP_DIRECTORY,
+    tuner='Bayesian',
+    knapsack_solver='CBC_MIP',
+    regularization_range=np.logspace(-6, -2, num=16).tolist(),
+):
     '''
-    Top-level function for optimizing a Keras model, given objectives 
+    Top-level function for optimizing a Keras model, given objectives
 
     Args:
     - model (keras.Model): Model to be optimized
@@ -40,7 +64,7 @@ def optimize_model(
     - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
     - increasing (boolean): If the metric improves with increased values; e.g. accuracy -> increasing = True, MSE -> increasing = False
     - rtol (float): Relative tolerance; pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
-    
+
     Kwargs:
     - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
     - ranking_metric (string): Metric used for rannking weights and structures; currently supported l1, l2, saliency and Oracle
@@ -55,22 +79,24 @@ def optimize_model(
     '''
 
     if not isinstance(scheduler, OptimizationScheduler):
-        raise Exception('Scheduler must be an instance of from hls4ml.optimization.scheduler.OptimizationScheduler' +\
-                        'If you provided string description (e.g. \'constant\'), please use an object instance (i.e. ConstantScheduler())'
-                        'For a full list of supported schedulers and their description, refer to hls4ml.optimization.scheduler.'
-                      )
+        raise Exception(
+            'Scheduler must be an instance of from hls4ml.optimization.scheduler.OptimizationScheduler'
+            + 'If you provided string description (e.g. \'constant\'), please use an object instance (i.e. ConstantScheduler())'
+            'For a full list of supported schedulers and their description, refer to hls4ml.optimization.scheduler.'
+        )
 
     if epochs <= rewinding_epochs:
-       raise Exception('Please increase the number of epochs. \
+        raise Exception(
+            'Please increase the number of epochs. \
                        The current epoch number is too small to perform effective pruning & weight rewinding'
-                      )
-  
+        )
+
     if ranking_metric not in SUPPORTED_METRICS:
         raise Exception('Unknown metric for ranking weights')
-    
+
     # Loss function needs to be converted to a function, string description cannot be used during custom training loop
     if isinstance(loss_fn, str):
-      loss_fn = tf.keras.losses.get(loss_fn)
+        loss_fn = tf.keras.losses.get(loss_fn)
 
     # Split data set into batches
     train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
@@ -83,39 +109,55 @@ def optimize_model(
     model.compile(optimizer, loss_fn, metrics=[validation_metric])
     baseline_performance = model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
     if verbose:
-      print('Baseline performance on validation set: {}'.format(baseline_performance))
+        print(f'Baseline performance on validation set: {baseline_performance}')
 
     # Save best weights
     # Always save weights to a file, to reduce memory utilization
     if not os.path.isdir(directory):
-      os.mkdir(directory)
+        os.mkdir(directory)
     if not os.path.isdir(f'{directory}/optimization'):
-      os.mkdir(f'{directory}/optimization')
+        os.mkdir(f'{directory}/optimization')
     model.save_weights(f'{directory}/optimization/best_weights.h5')
 
     # Identify optimizable layers, given the current objective
     last_optimizable_layer = utils.get_last_layer_with_weights(model)
     for i, layer in enumerate(model.layers):
-        if isinstance(layer, SUPPORTED_LAYERS): 
-          optimizable, optimization_attributes = objective.is_layer_optimizable(model_attributes[layer.name])
-          model_attributes[layer.name].optimizable = optimizable
-          model_attributes[layer.name].optimization_attributes = optimization_attributes
-          
-          # In the last layer, structured pruning can't be applied, as it removes output labels
-          # Weight sharing, as well as all other types of pruning (unstructured, block etc.) are applicable
-          if i >= last_optimizable_layer and optimization_attributes.structure_type == SUPPORTED_STRUCTURES.STRUCTURED and optimization_attributes.pruning:
-            model_attributes[layer.name].optimization_attributes.pruning = False
-            model_attributes[layer.name].optimizable = model_attributes[layer.name].optimization_attributes.weight_sharing
-        else: 
-          model_attributes[layer.name].optimizable = False
-          model_attributes[layer.name].optimization_attributes = None
-        
+        if isinstance(layer, SUPPORTED_LAYERS):
+            optimizable, optimization_attributes = objective.is_layer_optimizable(model_attributes[layer.name])
+            model_attributes[layer.name].optimizable = optimizable
+            model_attributes[layer.name].optimization_attributes = optimization_attributes
+
+            # In the last layer, structured pruning can't be applied, as it removes output labels
+            # Weight sharing, as well as all other types of pruning (unstructured, block etc.) are applicable
+            if (
+                i >= last_optimizable_layer
+                and optimization_attributes.structure_type == SUPPORTED_STRUCTURES.STRUCTURED
+                and optimization_attributes.pruning
+            ):
+                model_attributes[layer.name].optimization_attributes.pruning = False
+                model_attributes[layer.name].optimizable = model_attributes[
+                    layer.name
+                ].optimization_attributes.weight_sharing
+        else:
+            model_attributes[layer.name].optimizable = False
+            model_attributes[layer.name].optimization_attributes = None
+
     # Add regularization loss to optimizable layers
     optimizable_model = build_optimizable_model(
-      model, model_attributes, optimizer, loss_fn, validation_metric, 
-      increasing, train_dataset, validation_dataset, batch_size, epochs // 2, 
-      verbose=verbose, directory=directory, tuner=tuner,
-      regularization_range=regularization_range, 
+        model,
+        model_attributes,
+        optimizer,
+        loss_fn,
+        validation_metric,
+        increasing,
+        train_dataset,
+        validation_dataset,
+        batch_size,
+        epochs // 2,
+        verbose=verbose,
+        directory=directory,
+        tuner=tuner,
+        regularization_range=regularization_range,
     )
 
     # Create class for masked backprop (weight freezing)
@@ -123,86 +165,103 @@ def optimize_model(
 
     # In certain cases, the model might underperform at the current sparsity level, but perform better at a higher sparsity
     # Therefore, monitor the models performance over several sparsity levels and only stop pruning after high loss over several trials
-    bad_trials = 0  
+    bad_trials = 0
     sparsity_conditions = True
     target_sparsity = scheduler.get_sparsity()
 
     while sparsity_conditions:
-      # TODO - This might cause OOM issues on large models / data sets, since it is not done in batches
-      gradients = utils.get_model_gradients(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'gradients' else {}
-      hessians = utils.get_model_hessians(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'saliency' else {}
-      
-      # Mask weights
-      masks, offsets = get_model_masks(
-        optimizable_model, model_attributes, target_sparsity, 
-        objective, metric=ranking_metric, local=local, 
-        gradients=gradients, hessians=hessians, knapsack_solver=knapsack_solver
-      )
-      for layer in optimizable_model.layers:
-        if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
-          layer_weights = layer.get_weights()
-          layer_weights[0] = np.multiply(layer_weights[0], masks[layer.name]) + offsets[layer.name]
-          layer.set_weights(layer_weights)
-      
-      # Mask gradients
-      # Before training the model at the next sparsity level, reset internal states
-      # Furthemore, modern optimizers (e.g. Adam) accumulate gradients during backprop
-      # Therefore, even if the gradient for a weight is zero, it might be updated, due to previous gradients
-      # Avoid this by resetting the internal variables of an optimizer
-      optimizable_model.reset_metrics()
-      optimizable_model.reset_states()
-      for x in optimizable_model.optimizer.variables():
-        x.assign(tf.zeros_like(x))
-      masked_backprop.update_masks(masks)
-
-      # Train model with weight freezing [pruning]
-      if verbose:
-        print(f'Pruning with a target sparsity of {target_sparsity * 100.0}% [relative to objective]')
-      for epoch in range(epochs - rewinding_epochs):
-        start_time = time.time()
-        epoch_loss_avg = tf.keras.metrics.Mean()
-
-        # Masked backprop
-        for (X, y) in train_dataset:
-          loss_value = masked_backprop(tf.convert_to_tensor(X), tf.convert_to_tensor(y), target_sparsity)
-          epoch_loss_avg.update_state(loss_value)
-        
-        # Evaluate on validation set and print epoch summary
+        # TODO - This might cause OOM issues on large models / data sets, since it is not done in batches
+        gradients = (
+            utils.get_model_gradients(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'gradients' else {}
+        )
+        hessians = (
+            utils.get_model_hessians(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'saliency' else {}
+        )
+
+        # Mask weights
+        masks, offsets = get_model_masks(
+            optimizable_model,
+            model_attributes,
+            target_sparsity,
+            objective,
+            metric=ranking_metric,
+            local=local,
+            gradients=gradients,
+            hessians=hessians,
+            knapsack_solver=knapsack_solver,
+        )
+        for layer in optimizable_model.layers:
+            if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+                layer_weights = layer.get_weights()
+                layer_weights[0] = np.multiply(layer_weights[0], masks[layer.name]) + offsets[layer.name]
+                layer.set_weights(layer_weights)
+
+        # Mask gradients
+        # Before training the model at the next sparsity level, reset internal states
+        # Furthemore, modern optimizers (e.g. Adam) accumulate gradients during backprop
+        # Therefore, even if the gradient for a weight is zero, it might be updated, due to previous gradients
+        # Avoid this by resetting the internal variables of an optimizer
+        optimizable_model.reset_metrics()
+        optimizable_model.reset_states()
+        for x in optimizable_model.optimizer.variables():
+            x.assign(tf.zeros_like(x))
+        masked_backprop.update_masks(masks)
+
+        # Train model with weight freezing [pruning]
+        if verbose:
+            print(f'Pruning with a target sparsity of {target_sparsity * 100.0}% [relative to objective]')
+        for epoch in range(epochs - rewinding_epochs):
+            start_time = time.time()
+            epoch_loss_avg = tf.keras.metrics.Mean()
+
+            # Masked backprop
+            for X, y in train_dataset:
+                loss_value = masked_backprop(tf.convert_to_tensor(X), tf.convert_to_tensor(y), target_sparsity)
+                epoch_loss_avg.update_state(loss_value)
+
+            # Evaluate on validation set and print epoch summary
+            if verbose:
+                val_res = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)
+                print(
+                    f'Epoch: {epoch + 1} - Time: {time.time() - start_time}s - Average training loss: {round(epoch_loss_avg.result(), 3)}'
+                )
+                print(f'Epoch: {epoch + 1} - learning_rate: {optimizable_model.optimizer.learning_rate.numpy()}')
+                print(
+                    f'Epoch: {epoch + 1} - Loss on validation set: {val_res[0]} - Performance on validation set: {val_res[1]}'
+                )
+
+        # Check if model works after pruning
+        pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+        if verbose:
+            print(f'Optimized model performance on validation set, after fine-tuning: {pruned_performance}')
+
+        if __compare__(pruned_performance, rtol * baseline_performance, not increasing):
+            bad_trials = 0
+            sparsity_conditions, target_sparsity = scheduler.update_step()
+            optimizable_model.save_weights(f'{directory}/optimization/best_weights.h5')
+        else:
+            bad_trials += 1
+            sparsity_conditions, target_sparsity = scheduler.repair_step()
+
+        # If the model performed poorly over several sparsity levels, stop optimization [maximum sparsity reached]
+        if bad_trials > cutoff_bad_trials:
+            break
+
+        # Train model without weight freezing [rewinding]
         if verbose:
-          val_res = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)
-          print(f'Epoch: {epoch + 1} - Time: {time.time() - start_time}s - Average training loss: {round(epoch_loss_avg.result(), 3)}')
-          print(f'Epoch: {epoch + 1} - learning_rate: {optimizable_model.optimizer.learning_rate.numpy()}')
-          print(f'Epoch: {epoch + 1} - Loss on validation set: {val_res[0]} - Performance on validation set: {val_res[1]}')
-
-      # Check if model works after pruning
-      pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
-      if verbose:
-        print(f'Optimized model performance on validation set, after fine-tuning: {pruned_performance}')
-
-      if __compare__(pruned_performance, rtol * baseline_performance, not increasing):
-        bad_trials = 0
-        sparsity_conditions, target_sparsity = scheduler.update_step()
-        optimizable_model.save_weights(f'{directory}/optimization/best_weights.h5')
-      else:
-        bad_trials += 1
-        sparsity_conditions, target_sparsity = scheduler.repair_step()
-              
-      # If the model performed poorly over several sparsity levels, stop optimization [maximum sparsity reached]
-      if bad_trials > cutoff_bad_trials:
-        break
-
-      # Train model without weight freezing [rewinding] 
-      if verbose:
-        print(f'Starting weight rewinding for {rewinding_epochs} epochs')
-      optimizable_model.fit(
-        train_dataset, validation_data=validation_dataset,
-        batch_size=batch_size, epochs=rewinding_epochs, 
-        callbacks=callbacks, verbose=verbose
-      )
-
-    # Load best weights 
+            print(f'Starting weight rewinding for {rewinding_epochs} epochs')
+        optimizable_model.fit(
+            train_dataset,
+            validation_data=validation_dataset,
+            batch_size=batch_size,
+            epochs=rewinding_epochs,
+            callbacks=callbacks,
+            verbose=verbose,
+        )
+
+    # Load best weights
     optimizable_model.load_weights(f'{directory}/optimization/best_weights.h5')
-    
+
     # Remove regularizers and save best model
     optimizable_model = remove_custom_regularizers(optimizable_model)
     optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
@@ -210,22 +269,25 @@ def optimize_model(
     # In GPU FLOP Optimization, remove structures to achieve speed-up & fine-tune the smaller architecture
     # TODO - Extend for Resource strategy in hls4ml FF optimisation
     if objective.__name__ in ('GPUFLOPEstimator'):
-      optimizable_model = reduce_model(optimizable_model)
-      optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
-      optimizable_model.fit(
-        train_dataset, validation_data=validation_dataset,
-        batch_size=batch_size, epochs=int(1.5 * epochs), 
-        callbacks=callbacks
-      )
+        optimizable_model = reduce_model(optimizable_model)
+        optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
+        optimizable_model.fit(
+            train_dataset,
+            validation_data=validation_dataset,
+            batch_size=batch_size,
+            epochs=int(1.5 * epochs),
+            callbacks=callbacks,
+        )
 
     # Evaluate final optimized model [purely for debugging / informative purposes]
     if verbose:
-      pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]   
-      print(f'Optimized model performance on validation set: {pruned_performance}')
+        pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+        print(f'Optimized model performance on validation set: {pruned_performance}')
 
     return optimizable_model
 
-class MaskedBackprop():
+
+class MaskedBackprop:
     '''
     A helper class to perform masked backprop (training with frozen weights)
     The important function is __call__ as it masks gradients, based on frozen weights
@@ -234,49 +296,51 @@ class MaskedBackprop():
     The trick is to set the masks, models etc. as class variables and then pass the sparsity
     As the sparsity changes, a new graph of the function is created
     '''
+
     def __init__(self, model, loss_fn, attributes):
-      self.model = model
-      self.loss_fn = loss_fn
-      self.attributes = attributes
-      self.masks = {}
+        self.model = model
+        self.loss_fn = loss_fn
+        self.attributes = attributes
+        self.masks = {}
 
     def update_masks(self, masks):
-      self.masks = masks
-      
+        self.masks = masks
+
     @tf.function
     def __call__(self, X, y, s):
-      '''
-      Helper function performing backprop
-
-      Args:
-          - X (tf.Tensor): Input data
-          - y (tf.Tensor): Output data
-          - s (float): Sparsity
-      
-      Return:
-          - loss (tf.Varilable): Model loss with input X and output y
-      '''
-      grads = []
-      with tf.GradientTape(persistent=True) as tape:
-        output = self.model(X, training=True)
-        loss = self.loss_fn(y, output) 
-        loss += tf.add_n(self.model.losses) 
-        for layer in self.model.layers:
-          if layer.trainable_weights:
-            grad = tape.gradient(loss, layer.trainable_weights)
-            if self.attributes[layer.name].optimizable:
-              grad[0] = tf.multiply(grad[0], self.masks[layer.name])
-            grads += grad
-      self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
-      return loss
+        '''
+        Helper function performing backprop
+
+        Args:
+            - X (tf.Tensor): Input data
+            - y (tf.Tensor): Output data
+            - s (float): Sparsity
+
+        Return:
+            - loss (tf.Varilable): Model loss with input X and output y
+        '''
+        grads = []
+        with tf.GradientTape(persistent=True) as tape:
+            output = self.model(X, training=True)
+            loss = self.loss_fn(y, output)
+            loss += tf.add_n(self.model.losses)
+            for layer in self.model.layers:
+                if layer.trainable_weights:
+                    grad = tape.gradient(loss, layer.trainable_weights)
+                    if self.attributes[layer.name].optimizable:
+                        grad[0] = tf.multiply(grad[0], self.masks[layer.name])
+                    grads += grad
+        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
+        return loss
+
 
 def __compare__(x, y, leq=False):
-  '''
-  Helper function for comparing two values, x & y
-  Sometimes, we use the >= sign - e.g. pruned_accuracy >= tolerance * baseline_accuracy [ 0 <= tolerance <= 1]
-  Other times, use the <= sign - e.g. pruned_mse <= tolerance * baseline_mse [tolerance >= 1]
-  '''
-  if leq:
-    return x <= y
-  else:
-    return x >= y
\ No newline at end of file
+    '''
+    Helper function for comparing two values, x & y
+    Sometimes, we use the >= sign - e.g. pruned_accuracy >= tolerance * baseline_accuracy [ 0 <= tolerance <= 1]
+    Other times, use the <= sign - e.g. pruned_mse <= tolerance * baseline_mse [tolerance >= 1]
+    '''
+    if leq:
+        return x <= y
+    else:
+        return x >= y
diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/keras/builder.py
index c87b15c604..9bb4db6626 100644
--- a/hls4ml/optimization/keras/builder.py
+++ b/hls4ml/optimization/keras/builder.py
@@ -1,14 +1,19 @@
 import re
+
+import keras_tuner as kt
 import numpy as np
 import tensorflow as tf
-import keras_tuner as kt
-from qkeras import QDense, QConv2D
-from tensorflow.keras.layers import Dense, Conv2D
+from qkeras import QConv2D, QDense
+from qkeras.utils import _add_supported_quantized_objects
 from tensorflow.keras.callbacks import EarlyStopping
+from tensorflow.keras.layers import Conv2D, Dense
+
 from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, TMP_DIRECTORY
-from hls4ml.optimization.keras.regularizers import DenseRegularizer, Conv2DRegularizer
+from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer
+
+co = {}
+_add_supported_quantized_objects(co)
 
-from qkeras.utils import _add_supported_quantized_objects; co = {}; _add_supported_quantized_objects(co)
 
 class HyperOptimizationModel(kt.HyperModel):
     '''
@@ -22,7 +27,7 @@ class HyperOptimizationModel(kt.HyperModel):
         - validation_metric (keras.metrics.Metric or equivalent string description): Model validation metric
         - regularization_range (list): List of suitable hyperparameters for weight decay
     '''
-    
+
     def __init__(self, model, attributes, optimizer, loss_fn, validation_metric, regularization_range):
         self.model = model
         self.attributes = attributes
@@ -37,35 +42,69 @@ def build(self, hp):
 
         # Make regularization loss a tunable hyperparameter
         for layer in model_to_prune.layers:
-            if isinstance(layer, SUPPORTED_LAYERS) and self.attributes[layer.name].optimizable:        
+            if isinstance(layer, SUPPORTED_LAYERS) and self.attributes[layer.name].optimizable:
                 structure_type = self.attributes[layer.name].optimization_attributes.structure_type
                 block_shape = self.attributes[layer.name].optimization_attributes.block_shape
                 pattern_offset = self.attributes[layer.name].optimization_attributes.pattern_offset
                 consecutive_patterns = self.attributes[layer.name].optimization_attributes.consecutive_patterns
-                
+
                 pruning = self.attributes[layer.name].optimization_attributes.pruning
-                weight_sharing = self.attributes[layer.name].optimization_attributes.weight_sharing         
-            
-                alpha = hp.Choice(f'{layer.name}_alpha', values=self.regularization_range, default=default_regularizaton) if pruning else 0
-                beta = hp.Choice(f'{layer.name}_beta', values=self.regularization_range, default=default_regularizaton) if weight_sharing else 0
+                weight_sharing = self.attributes[layer.name].optimization_attributes.weight_sharing
+
+                alpha = (
+                    hp.Choice(f'{layer.name}_alpha', values=self.regularization_range, default=default_regularizaton)
+                    if pruning
+                    else 0
+                )
+                beta = (
+                    hp.Choice(f'{layer.name}_beta', values=self.regularization_range, default=default_regularizaton)
+                    if weight_sharing
+                    else 0
+                )
 
                 if isinstance(layer, (Dense, QDense)) and self.attributes[layer.name].optimizable:
-                    layer.kernel_regularizer = DenseRegularizer(alpha, beta, norm=1, structure_type=structure_type, block_shape=block_shape, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+                    layer.kernel_regularizer = DenseRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        block_shape=block_shape,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
                 elif isinstance(layer, (Conv2D, QConv2D)) and self.attributes[layer.name].optimizable:
-                    layer.kernel_regularizer = Conv2DRegularizer(alpha, beta, norm=1, structure_type=structure_type, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+                    layer.kernel_regularizer = Conv2DRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
 
         # Rebuild model graph
         model_to_prune = tf.keras.models.model_from_json(model_to_prune.to_json(), custom_objects=co)
         model_to_prune.set_weights(self.model.get_weights())
-        model_to_prune.compile(optimizer=self.optimizer, loss=self.loss_fn, metrics=[self.validation_metric]) 
+        model_to_prune.compile(optimizer=self.optimizer, loss=self.loss_fn, metrics=[self.validation_metric])
 
         return model_to_prune
 
+
 def build_optimizable_model(
-    model, attributes, optimizer, loss_fn, validation_metric, increasing,
-    train_dataset, validation_dataset, batch_size, epochs, 
-    verbose=False, directory=TMP_DIRECTORY, tuner='Bayesian',
-    regularization_range=np.logspace(-6, -2, num=15).tolist()
+    model,
+    attributes,
+    optimizer,
+    loss_fn,
+    validation_metric,
+    increasing,
+    train_dataset,
+    validation_dataset,
+    batch_size,
+    epochs,
+    verbose=False,
+    directory=TMP_DIRECTORY,
+    tuner='Bayesian',
+    regularization_range=np.logspace(-6, -2, num=15).tolist(),
 ):
     '''
     Function identifying optimizable layers and adding a regularization loss
@@ -100,29 +139,44 @@ def build_optimizable_model(
     if tuner == 'Manual':
         model_to_prune = tf.keras.models.clone_model(model)
         for layer in model_to_prune.layers:
-            if isinstance(layer, SUPPORTED_LAYERS) and attributes[layer.name].optimizable:        
+            if isinstance(layer, SUPPORTED_LAYERS) and attributes[layer.name].optimizable:
                 structure_type = attributes[layer.name].optimization_attributes.structure_type
                 block_shape = attributes[layer.name].optimization_attributes.block_shape
                 pattern_offset = attributes[layer.name].optimization_attributes.pattern_offset
                 consecutive_patterns = attributes[layer.name].optimization_attributes.consecutive_patterns
 
                 pruning = attributes[layer.name].optimization_attributes.pruning
-                weight_sharing = attributes[layer.name].optimization_attributes.weight_sharing         
+                weight_sharing = attributes[layer.name].optimization_attributes.weight_sharing
 
                 alpha = regularization_range[0] if pruning else 0
                 beta = regularization_range[0] if weight_sharing else 0
 
                 if isinstance(layer, (Dense, QDense)) and attributes[layer.name].optimizable:
-                    layer.kernel_regularizer = DenseRegularizer(alpha, beta, norm=1, structure_type=structure_type, block_shape=block_shape, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+                    layer.kernel_regularizer = DenseRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        block_shape=block_shape,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
                 elif isinstance(layer, (Conv2D, QConv2D)) and attributes[layer.name].optimizable:
-                    layer.kernel_regularizer = Conv2DRegularizer(alpha, beta, norm=1, structure_type=structure_type, pattern_offset=pattern_offset, consecutive_patterns=consecutive_patterns)
+                    layer.kernel_regularizer = Conv2DRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
 
         # Rebuild model graph
         model_to_prune = tf.keras.models.model_from_json(model_to_prune.to_json(), custom_objects=co)
         model_to_prune.set_weights(model.get_weights())
-        model_to_prune.compile(optimizer=optimizer, loss=loss_fn, metrics=[validation_metric]) 
+        model_to_prune.compile(optimizer=optimizer, loss=loss_fn, metrics=[validation_metric])
 
-        return model_to_prune    
+        return model_to_prune
 
     # User opted for hyper-parameter tuning
     else:
@@ -133,41 +187,46 @@ def build_optimizable_model(
             objective_name = re.sub(r'(?<!^)(?=[A-Z])', '_', validation_metric.__class__.__name__).lower()
         if tuner == 'Bayesian':
             tuner = kt.BayesianOptimization(
-                hypermodel = HyperOptimizationModel(model, attributes, optimizer, loss_fn, validation_metric, regularization_range),
-                objective = kt.Objective(objective_name, objective_direction),
-                max_trials = 10,
-                overwrite = True,
-                directory = directory + '/tuning',
+                hypermodel=HyperOptimizationModel(
+                    model, attributes, optimizer, loss_fn, validation_metric, regularization_range
+                ),
+                objective=kt.Objective(objective_name, objective_direction),
+                max_trials=10,
+                overwrite=True,
+                directory=directory + '/tuning',
             )
         elif tuner == 'Hyperband':
             tuner = kt.Hyperband(
-                hypermodel = HyperOptimizationModel(model, attributes, optimizer, loss_fn, validation_metric, regularization_range),
-                objective = kt.Objective(objective_name, objective_direction),
-                max_epochs = epochs,
-                factor = 3,
-                hyperband_iterations = 1,
-                overwrite = True,
-                directory = directory + '/tuning',
+                hypermodel=HyperOptimizationModel(
+                    model, attributes, optimizer, loss_fn, validation_metric, regularization_range
+                ),
+                objective=kt.Objective(objective_name, objective_direction),
+                max_epochs=epochs,
+                factor=3,
+                hyperband_iterations=1,
+                overwrite=True,
+                directory=directory + '/tuning',
             )
         else:
             raise Exception('Unknown tuner; possible options are Bayesian and Hyperband')
 
         if verbose:
             tuner.search_space_summary()
-        
+
         tuner.search(
-            train_dataset, 
-            epochs = epochs, 
-            batch_size = batch_size, 
-            validation_data = validation_dataset,
-            callbacks = [ EarlyStopping(monitor='val_loss', patience=1) ]
+            train_dataset,
+            epochs=epochs,
+            batch_size=batch_size,
+            validation_data=validation_dataset,
+            callbacks=[EarlyStopping(monitor='val_loss', patience=1)],
         )
-        
+
         if verbose:
             tuner.results_summary()
 
         return tuner.get_best_models(num_models=1)[0]
 
+
 def remove_custom_regularizers(model):
     '''
     Helper function to remove custom regularizers (DenseRegularizer & Conv2DRegularizer)
diff --git a/hls4ml/optimization/keras/config.py b/hls4ml/optimization/keras/config.py
index 1d651cd480..08bf67aa7b 100644
--- a/hls4ml/optimization/keras/config.py
+++ b/hls4ml/optimization/keras/config.py
@@ -1,5 +1,5 @@
-from qkeras import QDense, QConv2D
-from tensorflow.keras.layers import Dense, Conv2D
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense
 
 '''
 Optimizable layers in Keras / QKeras
diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/keras/masking.py
index 50052e7297..76d84dbea2 100644
--- a/hls4ml/optimization/keras/masking.py
+++ b/hls4ml/optimization/keras/masking.py
@@ -1,16 +1,27 @@
-import sys
 import logging
+import sys
+
 import numpy as np
 import tensorflow as tf
-from qkeras import QDense, QConv2D
-from tensorflow.keras.layers import Dense, Conv2D
-from hls4ml.optimization.knapsack import solve_knapsack
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense
+
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS
+from hls4ml.optimization.knapsack import solve_knapsack
 
 
-def get_model_masks(keras_model, model_attributes, sparsity, objective, metric='l1', local=False, gradients=None,
-                    hessians=None, knapsack_solver='CBC_MIP'):
+def get_model_masks(
+    keras_model,
+    model_attributes,
+    sparsity,
+    objective,
+    metric='l1',
+    local=False,
+    gradients=None,
+    hessians=None,
+    knapsack_solver='CBC_MIP',
+):
     '''
     Function calculating a binary mask for all optimizable layers
     Entries equal to one correspond to the weight being updated during the training
@@ -55,11 +66,13 @@ def get_model_masks(keras_model, model_attributes, sparsity, objective, metric='
         raise Exception('Saliency ranking requires second order derivatives')
 
     if local:
-        return __get_masks_local(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians,
-                                 knapsack_solver)
+        return __get_masks_local(
+            keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver
+        )
     else:
-        return __get_masks_global(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians,
-                                  knapsack_solver)
+        return __get_masks_global(
+            keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver
+        )
 
 
 def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver):
@@ -117,7 +130,7 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                         np.array([g.value for g in groups]),
                         np.array([g.resources for g in groups]).T,
                         target_resources,
-                        implementation=knapsack_solver
+                        implementation=knapsack_solver,
                     )
 
                     # Selected weights are not masked
@@ -157,7 +170,7 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                         np.array([g.value for g in groups]),
                         np.array([g.resources for g in groups]).T,
                         target_resources,
-                        implementation=knapsack_solver
+                        implementation=knapsack_solver,
                     )
 
                     # Selected neurons are not masked
@@ -198,7 +211,7 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                         np.array([g.value for g in groups]),
                         np.array([g.resources for g in groups]).T,
                         target_resources,
-                        implementation=knapsack_solver
+                        implementation=knapsack_solver,
                     )
 
                     mask = np.zeros(value.shape, value.dtype)
@@ -247,8 +260,10 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                         np.expand_dims(np.expand_dims(reshaped, 2), 0),
                         [1, consecutive_patterns, number_of_patterns, 1],
                         [1, consecutive_patterns, number_of_patterns, 1],
-                        [1, 1, 1, 1], 'SAME'
-                    ).numpy(), (total_blocks, -1)
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, -1),
                 )
 
                 # If pruning enabled, find cost associated with pruning each neuron
@@ -276,7 +291,7 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                     np.array([g.value for g in groups]),
                     np.array([g.resources for g in groups]).T,
                     target_resources,
-                    implementation=knapsack_solver
+                    implementation=knapsack_solver,
                 )
 
                 # Decode masked groups into transposed shape and set selected groups to one
@@ -323,9 +338,10 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                         np.expand_dims(np.expand_dims(value, 2), 0),
                         [1, block_shape[0], block_shape[1], 1],
                         [1, block_shape[0], block_shape[1], 1],
-                        [1, 1, 1, 1], 'SAME'
+                        [1, 1, 1, 1],
+                        'SAME',
                     ).numpy(),
-                    (total_blocks, block_shape[0] * block_shape[1])
+                    (total_blocks, block_shape[0] * block_shape[1]),
                 )
 
                 # If pruning enabled, find cost associated with pruning each neuron
@@ -353,7 +369,7 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                     np.array([g.value for g in groups]),
                     np.array([g.resources for g in groups]).T,
                     target_resources,
-                    implementation=knapsack_solver
+                    implementation=knapsack_solver,
                 )
 
                 # Decode position of masked weights and set selected weights to one
@@ -385,8 +401,7 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
     return masks, offsets
 
 
-def __get_masks_global(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians,
-                       knapsack_solver):
+def __get_masks_global(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver):
     '''
     Function calculating a layer-wise binary mask for all optimizable layers
     Global masking, with layers of different sparsity; masks are calculated by solving a Knapsack problem
@@ -404,8 +419,9 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
         # Optimizable should be always enabled if either pruning or weight sharing are enabled
         # However, if the objectives are implemented incorrectly, it is possible to have optimizatons enabled without any types of optimization (pruning, weight sharing etc.) enabled
         layer_optimizable = model_attributes[layer.name].optimizable and (
-                    model_attributes[layer.name].optimization_attributes.weight_sharing or model_attributes[
-                layer.name].optimization_attributes.pruning)
+            model_attributes[layer.name].optimization_attributes.weight_sharing
+            or model_attributes[layer.name].optimization_attributes.pruning
+        )
         if isinstance(layer, SUPPORTED_LAYERS) and layer_optimizable:
             value = layer.get_weights()[0]
             structure_type = model_attributes[layer.name].optimization_attributes.structure_type
@@ -432,8 +448,9 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                     value = value / np.max(value)
                     indices = np.indices(value.shape).reshape(value.ndim, -1).T
                     for i in indices:
-                        group = __WeightGroups__(value[tuple(i)], layer_savings, tuple(i), structure_type, layer.name,
-                                                 'pruning')
+                        group = __WeightGroups__(
+                            value[tuple(i)], layer_savings, tuple(i), structure_type, layer.name, 'pruning'
+                        )
                         groups.append(group)
 
             if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
@@ -451,11 +468,17 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
 
                     for i in range(vals_norm.shape[0]):
                         if vals_norm[i] <= vals_var[i]:
-                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
-                                                           optimization_type='pruning'))
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                                )
+                            )
                         else:
-                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
-                                                           optimization_type='sharing'))
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                                )
+                            )
 
                 elif isinstance(layer, (Conv2D, QConv2D)):
                     if model_attributes[layer.name].optimization_attributes.pruning:
@@ -471,11 +494,17 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
 
                     for i in range(vals_norm.shape[0]):
                         if vals_norm[i] <= vals_var[i]:
-                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
-                                                           optimization_type='pruning'))
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                                )
+                            )
                         else:
-                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
-                                                           optimization_type='sharing'))
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                                )
+                            )
 
             if structure_type == SUPPORTED_STRUCTURES.PATTERN:
                 pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
@@ -509,8 +538,10 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                         np.expand_dims(np.expand_dims(reshaped, 2), 0),
                         [1, consecutive_patterns, number_of_patterns, 1],
                         [1, consecutive_patterns, number_of_patterns, 1],
-                        [1, 1, 1, 1], 'SAME'
-                    ).numpy(), (total_blocks, -1)
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, -1),
                 )
 
                 # If pruning enabled, find cost associated with pruning each neuron
@@ -528,11 +559,17 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                 # Choose min(pruning, weight sharing)
                 for i in range(vals_norm.shape[0]):
                     if vals_norm[i] <= vals_var[i]:
-                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
-                                                       optimization_type='pruning'))
+                        groups.append(
+                            __WeightGroups__(
+                                vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                            )
+                        )
                     else:
-                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
-                                                       optimization_type='sharing'))
+                        groups.append(
+                            __WeightGroups__(
+                                vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                            )
+                        )
 
             if structure_type == SUPPORTED_STRUCTURES.BLOCK:
                 if len(value.shape) != 2:
@@ -548,9 +585,10 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                         np.expand_dims(np.expand_dims(value, 2), 0),
                         [1, block_shape[0], block_shape[1], 1],
                         [1, block_shape[0], block_shape[1], 1],
-                        [1, 1, 1, 1], 'SAME'
+                        [1, 1, 1, 1],
+                        'SAME',
                     ).numpy(),
-                    (total_blocks, block_shape[0] * block_shape[1])
+                    (total_blocks, block_shape[0] * block_shape[1]),
                 )
 
                 if model_attributes[layer.name].optimization_attributes.pruning:
@@ -566,11 +604,17 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
 
                 for i in range(vals_norm.shape[0]):
                     if vals_norm[i] <= vals_var[i]:
-                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, structure_type, layer.name,
-                                                       optimization_type='pruning'))
+                        groups.append(
+                            __WeightGroups__(
+                                vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                            )
+                        )
                     else:
-                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, structure_type, layer.name,
-                                                       optimization_type='sharing'))
+                        groups.append(
+                            __WeightGroups__(
+                                vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                            )
+                        )
 
     # The goal is to maximize network accuracy (values) subject to resorces (objective) staying under some threshold
     # This is a Knapsack problem; several implementations are provided in the helper functions
@@ -581,7 +625,7 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
         np.array([s.value for s in groups]),
         np.array([s.resources for s in groups]).T,
         target_resources,
-        implementation=knapsack_solver
+        implementation=knapsack_solver,
     )
     # Update masks and offsets
     masks = {}
@@ -591,8 +635,9 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
         if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
             structure_type = model_attributes[layer.name].optimization_attributes.structure_type
             selected_layer = [i for i in selected if groups[i].layer_name == layer.name]
-            not_selected_layer = [i for i in range(len(groups)) if
-                                  groups[i].layer_name == layer.name and i not in selected_layer]
+            not_selected_layer = [
+                i for i in range(len(groups)) if groups[i].layer_name == layer.name and i not in selected_layer
+            ]
 
             if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
                 mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
@@ -615,7 +660,12 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                         mask[:, :, :, groups[i].layer_position] = 1
                     for i in not_selected_layer:
                         if groups[i].optimization_type == 'sharing':
-                            offset[:, :, :, groups[i].layer_position, ] = np.mean(layer.get_weights()[0][:, :, :, i])
+                            offset[
+                                :,
+                                :,
+                                :,
+                                groups[i].layer_position,
+                            ] = np.mean(layer.get_weights()[0][:, :, :, i])
                 masks[layer.name] = mask
                 offsets[layer.name] = offset
 
@@ -654,8 +704,10 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                         np.expand_dims(np.expand_dims(reshaped, 2), 0),
                         [1, consecutive_patterns, number_of_patterns, 1],
                         [1, consecutive_patterns, number_of_patterns, 1],
-                        [1, 1, 1, 1], 'SAME'
-                    ).numpy(), (total_blocks, -1)
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, -1),
                 )
 
                 for i in not_selected_layer:
@@ -684,9 +736,10 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
                         np.expand_dims(np.expand_dims(layer.get_weights()[0], 2), 0),
                         [1, block_shape[0], block_shape[1], 1],
                         [1, block_shape[0], block_shape[1], 1],
-                        [1, 1, 1, 1], 'SAME'
+                        [1, 1, 1, 1],
+                        'SAME',
                     ).numpy(),
-                    (total_blocks, block_shape[0] * block_shape[1])
+                    (total_blocks, block_shape[0] * block_shape[1]),
                 )
 
                 mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
@@ -726,4 +779,4 @@ def __init__(self, value, resources, layer_position, structure_type=None, layer_
         self.layer_position = layer_position
         self.structure_type = structure_type
         self.layer_name = layer_name
-        self.optimization_type = optimization_type
\ No newline at end of file
+        self.optimization_type = optimization_type
diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/keras/reduction.py
index 3ccf691959..17821ef13e 100644
--- a/hls4ml/optimization/keras/reduction.py
+++ b/hls4ml/optimization/keras/reduction.py
@@ -1,30 +1,35 @@
 import numpy as np
+from tensorflow.keras.layers import Conv2D, Dense
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Conv2D
+
 from hls4ml.optimization.keras.utils import get_last_layer_with_weights
 
 '''
 Function for removing zero neurons & filters from a model and rewiring the model graph
-This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon 
+This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon
 Keras Surgeon is no longer under active development and does not work for TensorFlow 2.3+ and QKeras
 The baseline version was forked and updated, available at: https://github.com/bo3z/keras-surgeon
 
 Args:
     - model (keras.model): Input model
-    
+
 Return:
     - reduced (keras.model): Modified model, with redundant structures removed
 
     '''
+
+
 def reduce_model(model):
     # TODO - Should we make Keras Surgeon a hard requirement in setup.cfg? If so, needs to be installed from git, @bo3z fork
     try:
         from kerassurgeon import Surgeon
     except ModuleNotFoundError:
-        raise Exception('Keras Surgeon not installed. Unable to reduce model footprint '\
-                         'Please install up-to-date Keras Surgeon compatible wit TensorFlow 2.3+ and QKeras '\
-                         'Installation from git: https://github.com/bo3z/keras-surgeon')
-    
+        raise Exception(
+            'Keras Surgeon not installed. Unable to reduce model footprint '
+            'Please install up-to-date Keras Surgeon compatible wit TensorFlow 2.3+ and QKeras '
+            'Installation from git: https://github.com/bo3z/keras-surgeon'
+        )
+
     # Initiate surgeon
     surgeon = Surgeon(model)
 
@@ -32,7 +37,7 @@ def reduce_model(model):
     last_idx = get_last_layer_with_weights(model)
     for idx, layer in enumerate(model.layers):
         # Last layer with weights cannot be removed, as it maps to data set labels
-        if (idx == last_idx):
+        if idx == last_idx:
             break
 
         # Currently supported Dense and Conv2D; these two can be combined in a single if-statement
@@ -46,13 +51,13 @@ def reduce_model(model):
             weights = layer.get_weights()[0]
             zeros = np.where(~weights.reshape(-1, weights.shape[-1]).any(axis=0))[0].tolist()
             surgeon.add_job('delete_channels', layer, channels=zeros)
-    
+
     # Reduce model
     reduced = surgeon.operate()
 
     # By default, Keras surgeon returns a Functional model
     # If the original was a Sequential, convert back
-    is_sequential =  model.__class__.__name__ == 'Sequential'
+    is_sequential = model.__class__.__name__ == 'Sequential'
     if is_sequential:
         return Sequential(layers=reduced.layers)
     else:
diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/keras/regularizers.py
index 18c72a1a9c..255036f4f3 100644
--- a/hls4ml/optimization/keras/regularizers.py
+++ b/hls4ml/optimization/keras/regularizers.py
@@ -1,7 +1,9 @@
 import numpy as np
 import tensorflow as tf
+
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 
+
 @tf.keras.utils.register_keras_serializable(name='DenseRegularizer')
 class DenseRegularizer(tf.keras.regularizers.Regularizer):
     '''
@@ -31,8 +33,16 @@ class DenseRegularizer(tf.keras.regularizers.Regularizer):
           In that case, we need to group several patterns together, so the entire block of patterns can be removed, thus saving DSP and BRAM
     '''
 
-    def __init__(self, alpha, beta=0, norm=1, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, block_shape=(1, 1),
-                 pattern_offset=1, consecutive_patterns=1):
+    def __init__(
+        self,
+        alpha,
+        beta=0,
+        norm=1,
+        structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        block_shape=(1, 1),
+        pattern_offset=1,
+        consecutive_patterns=1,
+    ):
         if norm != 1 and norm != 2:
             raise Exception(f'{self.__class__.__name__} currently supports l1- and l2-based regularization')
 
@@ -71,8 +81,7 @@ def __call__(self, weights):
                 raise Exception(f'{self.__class__.__name__}: pattern offset needs to be a factor of matrix size')
 
             if self.pattern_offset % self.consecutive_patterns != 0:
-                raise Exception(
-                    f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
+                raise Exception(f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
 
             # Reshape weight matrix into [number_of_patterns, pattern_offset]
             number_of_patterns = np.prod(weights.shape) // self.pattern_offset
@@ -86,9 +95,10 @@ def __call__(self, weights):
                     tf.expand_dims(tf.expand_dims(reshaped, 2), 0),
                     [1, self.consecutive_patterns, number_of_patterns, 1],
                     [1, self.consecutive_patterns, number_of_patterns, 1],
-                    [1, 1, 1, 1], 'SAME'
+                    [1, 1, 1, 1],
+                    'SAME',
                 ),
-                (total_blocks, -1)
+                (total_blocks, -1),
             )
 
             # Calculate penalty
@@ -108,9 +118,10 @@ def __call__(self, weights):
                     tf.expand_dims(tf.expand_dims(weights, 2), 0),
                     [1, self.block_shape[0], self.block_shape[1], 1],
                     [1, self.block_shape[0], self.block_shape[1], 1],
-                    [1, 1, 1, 1], 'SAME'
+                    [1, 1, 1, 1],
+                    'SAME',
                 ),
-                (total_blocks, self.block_shape[0] * self.block_shape[1])
+                (total_blocks, self.block_shape[0] * self.block_shape[1]),
             )
 
             sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
@@ -125,7 +136,7 @@ def get_config(self):
             'structure_type': self.structure_type,
             'block_shape': self.block_shape,
             'pattern_offset': self.pattern_offset,
-            'consecutive_patterns': self.consecutive_patterns
+            'consecutive_patterns': self.consecutive_patterns,
         }
 
 
@@ -151,8 +162,15 @@ class Conv2DRegularizer(tf.keras.regularizers.Regularizer):
         - structure_type = pattern: regularization on groups of every n-th weight in flattened array (e.g. grouping by reuse factor in hls4ml)
     '''
 
-    def __init__(self, alpha, beta=0, norm=1, structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED, pattern_offset=1,
-                 consecutive_patterns=1):
+    def __init__(
+        self,
+        alpha,
+        beta=0,
+        norm=1,
+        structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        pattern_offset=1,
+        consecutive_patterns=1,
+    ):
         if norm != 1 and norm != 2:
             raise Exception(f'{self.__class__.__name__} currently supports l1- and l2-based regularization')
 
@@ -184,8 +202,9 @@ def __call__(self, weights):
             return sparse_penalty + variance_penalty
 
         if self.structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
-            sparse_penalty = self.alpha * tf.norm(tf.reduce_sum(tf.norm(weights, axis=(0, 1), ord='fro'), axis=0),
-                                                  ord=self.norm)
+            sparse_penalty = self.alpha * tf.norm(
+                tf.reduce_sum(tf.norm(weights, axis=(0, 1), ord='fro'), axis=0), ord=self.norm
+            )
             variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(weights, axis=(0, 1, 2)), ord=self.norm)
             return sparse_penalty + variance_penalty
 
@@ -194,8 +213,7 @@ def __call__(self, weights):
                 raise Exception(f'{self.__class__.__name__}: pattern offset needs to be a factor of matrix size')
 
             if self.pattern_offset % self.consecutive_patterns != 0:
-                raise Exception(
-                    f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
+                raise Exception(f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
 
             number_of_patterns = np.prod(weights.shape) // self.pattern_offset
             target_shape = (self.pattern_offset, number_of_patterns)
@@ -207,9 +225,10 @@ def __call__(self, weights):
                     tf.expand_dims(tf.expand_dims(reshaped, 2), 0),
                     [1, self.consecutive_patterns, number_of_patterns, 1],
                     [1, self.consecutive_patterns, number_of_patterns, 1],
-                    [1, 1, 1, 1], 'SAME'
+                    [1, 1, 1, 1],
+                    'SAME',
                 ),
-                (total_blocks, -1)
+                (total_blocks, -1),
             )
 
             sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
@@ -223,5 +242,5 @@ def get_config(self):
             'norm': self.norm,
             'structure_type': self.structure_type,
             'pattern_offset': self.pattern_offset,
-            'consecutive_patterns': self.consecutive_patterns
-        }
\ No newline at end of file
+            'consecutive_patterns': self.consecutive_patterns,
+        }
diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/keras/utils.py
index 4d9111cc96..92ed1e12ab 100644
--- a/hls4ml/optimization/keras/utils.py
+++ b/hls4ml/optimization/keras/utils.py
@@ -1,6 +1,7 @@
 import numpy as np
 import tensorflow as tf
 
+
 @tf.function
 def get_model_gradients(model, loss_fn, X, y):
     '''
@@ -11,7 +12,7 @@ def get_model_gradients(model, loss_fn, X, y):
         - loss_fn (keras.losses.Loss): Model loss function
         - X (np.array): Input data
         - y (np.array): Output data
-    
+
     Return:
         - grads (dict): Per-layer gradients of loss with respect to weights
     '''
@@ -20,25 +21,26 @@ def get_model_gradients(model, loss_fn, X, y):
     with tf.GradientTape(persistent=True) as tape:
         output = model(X, training=True)
         loss_value = loss_fn(y, output)
-        
+
         for layer in model.layers:
             if len(layer.trainable_weights) > 0:
                 grads[layer.name] = tape.gradient(loss_value, layer.kernel)
 
     return grads
 
+
 @tf.function
 def get_model_hessians(model, loss_fn, X, y):
     '''
     Calculate the second derivatives of the loss with repsect to model weights
     Note, only diagonal elements of the Hessian are computed
-    
+
     Args:
         - model (keras.model): Input model
         - loss_fn (keras.losses.Loss): Model loss function
         - X (np.array): Input data
         - y (np.array): Output data
-    
+
     Return:
         - grads (dict): Per-layer second derivatives of loss with respect to weights
     '''
@@ -46,43 +48,45 @@ def get_model_hessians(model, loss_fn, X, y):
     with tf.GradientTape(persistent=True) as tape:
         output = model(X, training=False)
         loss_value = loss_fn(y, output)
-        
+
         for layer in model.layers:
             if hasattr(layer, 'kernel'):
                 grads[layer.name] = tape.gradient(tape.gradient(loss_value, layer.kernel), layer.kernel)
 
     return grads
 
+
 def get_model_sparsity(model):
     '''
     Calculate total and per-layer model sparsity
 
     Args:
         - model (keras.model): Model to be evaluated
-    
+
     Return:
-        - sparsity (float): Model sparsity, as a percentage of zero weights w.r.t to total number of model weights 
-        - layers (dict): Key-value dictionary; each key is a layer name and the associated value is the layer's sparsity 
-    
+        - sparsity (float): Model sparsity, as a percentage of zero weights w.r.t to total number of model weights
+        - layers (dict): Key-value dictionary; each key is a layer name and the associated value is the layer's sparsity
+
     TODO - Extend support for recurrent layers (reccurent_kernel)
     '''
-    
+
     total_weights = 0
     zero_weights = 0
     layer_sparsity = {}
-    
+
     for layer in model.layers:
         if hasattr(layer, 'kernel'):
-          weights = layer.get_weights()[0].flatten()
-          total_weights = total_weights + len(weights)
-          zero_weights = zero_weights + len(weights) - np.count_nonzero(weights)
-          layer_sparsity[layer.name] = 1.0 - np.count_nonzero(weights) / len(weights)
-    
+            weights = layer.get_weights()[0].flatten()
+            total_weights = total_weights + len(weights)
+            zero_weights = zero_weights + len(weights) - np.count_nonzero(weights)
+            layer_sparsity[layer.name] = 1.0 - np.count_nonzero(weights) / len(weights)
+
     try:
         return zero_weights / total_weights, layer_sparsity
     except:
         return 0.0, layer_sparsity
 
+
 # TODO - Does this work for non-linear models (e.g. skip connections) ?
 def get_last_layer_with_weights(model):
     '''
@@ -91,11 +95,11 @@ def get_last_layer_with_weights(model):
     As an example, consider a network with 16 - 32 - 5 neurons - the last layer's neuron (5) cannot be removed, as they map to the data labels
     Args:
         - model (keras.model): Input model
-    
+
     Return:
         - idx (int): Index location of last layer with params
     '''
     for idx, layer in reversed(list(enumerate(model.layers))):
         if hasattr(layer, 'kernel'):
             return idx
-    return len(model.layers)
\ No newline at end of file
+    return len(model.layers)
diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/knapsack.py
index 5b31212da5..5487f4347e 100644
--- a/hls4ml/optimization/knapsack.py
+++ b/hls4ml/optimization/knapsack.py
@@ -1,5 +1,6 @@
 import sys
 import time
+
 import numpy as np
 
 
@@ -53,12 +54,16 @@ def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs
         raise Exception('Unknown algorithm for solving Knapsack')
 
     if len(values.shape) != 1:
-        raise Exception('Current implementations of Knapsack optimization support single-objective problems. \
-                        Values must be one-dimensional')
+        raise Exception(
+            'Current implementations of Knapsack optimization support single-objective problems. \
+                        Values must be one-dimensional'
+        )
 
     if len(weights.shape) != 2:
-        raise Exception('Current implementation of Knapsack assumes weight vector is 2-dimensional, to allow for multi-dimensional Knapsack problem. \
-                        If solve a one-dimensional Knapsack problem, extend dimensions of weights to a one-row matrix')
+        raise Exception(
+            'Current implementation of Knapsack assumes weight vector is 2-dimensional, to allow for multi-dimensional Knapsack problem. \
+                        If solve a one-dimensional Knapsack problem, extend dimensions of weights to a one-row matrix'
+        )
 
     if values.shape[0] != weights.shape[1]:
         raise Exception('Uneven number of items and weights')
@@ -108,7 +113,7 @@ def __solve_1d_knapsack_dp(values, weights, capacity):
     Furthermore, it has a high computational complexity and it is not suitable for highly-dimensional arrays
     NOTE: The weights and corresponding weight constraint need to be integers; if not, the they should be scaled and rounded beforehand
     '''
-    assert (len(weights.shape) == 1)
+    assert len(weights.shape) == 1
 
     # Build look-up table in bottom-up approach
     N = values.shape[0]
@@ -188,10 +193,10 @@ def __solve_knapsack_branch_and_bound(values, weights, capacity, time_limit=sys.
         raise Exception('OR-Tools not found. Please insteal Google OR-Tools from pip.')
 
     solver = pywrapknapsack_solver.KnapsackSolver(
-        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER, 'BB')
+        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER, 'BB'
+    )
     solver.set_time_limit(time_limit)
-    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(),
-                capacity.astype(int).tolist())
+    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(), capacity.astype(int).tolist())
     optimal = solver.Solve()
     selected = [i for i in range(values.shape[0]) if solver.BestSolutionContains(i)]
     return optimal / scaling_factor, selected
@@ -211,10 +216,10 @@ def __solve_knapsack_cbc_mip(values, weights, capacity, time_limit=sys.float_inf
         raise Exception('OR-Tools not found. Please insteal Google OR-Tools from pip.')
 
     solver = pywrapknapsack_solver.KnapsackSolver(
-        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_CBC_MIP_SOLVER, 'CBC')
+        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_CBC_MIP_SOLVER, 'CBC'
+    )
     solver.set_time_limit(time_limit)
-    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(),
-                capacity.astype(int).tolist())
+    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(), capacity.astype(int).tolist())
     optimal = solver.Solve()
     selected = [i for i in range(values.shape[0]) if solver.BestSolutionContains(i)]
     return optimal / scaling_factor, selected
@@ -227,7 +232,7 @@ def __solve_knapsack_equal_weights(values, weights, capacity):
     It occurs often in pruning - e.g. in pattern pruning, each DSP block saves one DSP; however, as a counter-example
     In structured pruning, each structure can save a different amount of FLOPs (Conv2D filter vs Dense neuron)
     '''
-    assert (np.all([weights[i, :] == weights[i, 0] for i in range(weights.shape[0])]))
+    assert np.all([weights[i, :] == weights[i, 0] for i in range(weights.shape[0])])
 
     # Find items with the highest value
     indices = np.argsort(values)
diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/objectives/__init__.py
index fc3a4c618e..bc58f56a6b 100644
--- a/hls4ml/optimization/objectives/__init__.py
+++ b/hls4ml/optimization/objectives/__init__.py
@@ -1,14 +1,17 @@
 import logging
-import numpy as np
 from abc import ABC, abstractmethod
-from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+
+import numpy as np
+
 from hls4ml.optimization.attributes import OptimizationAttributes
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 
 '''
 Pruning & weight sharing are formulated as an optimization problem, with the aim of minimising some metric
 Metrics can include: total number of weights, DSP utilization, latency, FLOPs etc.
 '''
 
+
 class ObjectiveEstimator(ABC):
     '''
     Abstract class with methods for estimating the utilization and savings of a certain layer, with respect to some objective
@@ -21,16 +24,16 @@ def is_layer_optimizable(self, layer_attributes):
         '''
         For a given layer, checks whether optimizations make sense, with respect to the given objective(s)
         Furthermore, it returns the type of optimization (structured, unstructured etc.) most suitable for minimising the objective(s).
-        
+
         Args:
             - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
-        
+
         Return:
             - optimizable (boolean) - can optimizations be applied to this layer
             - optimization_attributes (hls4ml.optimiation.attributes.OptimizationAttributes) - most suitable approach for optimization
-        
-        Examples: 
-            - Metric = Total weights, Layer = Dense, shape = (4, 4) -> return True, unstructured 
+
+        Examples:
+            - Metric = Total weights, Layer = Dense, shape = (4, 4) -> return True, unstructured
             - Metric = DSP, Layer = Dense, Precision = ap_fixed<8, 0> -> return False (Vivado doesn't use DSP when precision < 9)
             - Metric = DSP, Layer = Dense, Precision = ap_fixed<16, 6> -> return True, pattern structure, both pruning and weight sharing
         '''
@@ -40,39 +43,42 @@ def is_layer_optimizable(self, layer_attributes):
     def layer_resources(self, layer_attributes):
         '''
         For a given layer, how many units of the metric are used, given a generic weight matrix
-        
+
         Args:
             - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
-        
+
         Return:
             - resources (list, int) - total resources (w.r.t every dimension of the objective) used
-                
+
         Example: Metric = Total weights, Layer = Dense, shape = (4, 4) -> return [16] [regardless of layer sparsity]
         '''
         pass
-    
+
     @abstractmethod
     def layer_savings(self, layer_attributes):
         '''
-        For a given layer, how many units of the metric are saved, when optimizing one structure 
+        For a given layer, how many units of the metric are saved, when optimizing one structure
         The structure type, alongside its parameters (e.g. block shape) are stored in layer attributes
         For best results, OptimizationAttributes in layer_attribtues should be obtained from is_layer_optimizable
-        
+
         Args:
             - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
-        
+
         Return:
             - savings (list, int) - savings achieved (one for every dimenson of objective) with OptimizationAttributes from layer_attributes
-        
+
         Example: Metric = Total weights, Layer = Dense, shape = (4, 4):
             - structure_type == unstructured -> return [1]
-            - structure_type == structured -> return [4] 
+            - structure_type == structured -> return [4]
         '''
         pass
-    
+
+
 '''
 A class containing objective estimation with the goal of minimizing the number of non-zero weights in a layer [corresponds to unstructured pruning]
 '''
+
+
 class ParameterEstimator(ObjectiveEstimator):
     @classmethod
     def is_layer_optimizable(self, layer_attributes):
@@ -80,31 +86,35 @@ def is_layer_optimizable(self, layer_attributes):
             return False, None
         else:
             return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
-    
+
     @classmethod
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
         else:
             return [np.prod(layer_attributes.weight_shape)]
-    
+
     @classmethod
     def layer_savings(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
-        
-        structure_type = layer_attributes.optimization_attributes.structure_type 
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
         pruning = layer_attributes.optimization_attributes.pruning
         weight_sharing = layer_attributes.optimization_attributes.weight_sharing
 
         if weight_sharing:
-            logging.warn('Weight sharing does not decrease the number of parameters. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            logging.warn(
+                'Weight sharing does not decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
             return [0]
-        
+
         if not pruning:
-            logging.warn('Pruning needs to be enabled to decrease the number of parameters. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
             return [0]
 
         # In this case, pruning = True and weight_sharing = False, so calculate savings incurred by removing a group of weights
@@ -116,7 +126,9 @@ def layer_savings(self, layer_attributes):
             else:
                 raise Exception('Unknown layer encountered when estimating parameter savings.')
         elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
-            number_of_patterns = np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+            number_of_patterns = (
+                np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+            )
             return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns]
         elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
             return [np.prod(layer_attributes.optimization_attributes.block_shape)]
diff --git a/hls4ml/optimization/objectives/gpu_objectives.py b/hls4ml/optimization/objectives/gpu_objectives.py
index 0de608865e..1f8014171a 100644
--- a/hls4ml/optimization/objectives/gpu_objectives.py
+++ b/hls4ml/optimization/objectives/gpu_objectives.py
@@ -1,8 +1,11 @@
 import logging
+
 import numpy as np
+
+from hls4ml.optimization.attributes import OptimizationAttributes
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.objectives import ObjectiveEstimator
-from hls4ml.optimization.attributes import OptimizationAttributes
+
 
 class GPUFLOPEstimator(ObjectiveEstimator):
     @classmethod
@@ -10,8 +13,10 @@ def is_layer_optimizable(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return False, None
         else:
-            return True, OptimizationAttributes(structure_type=SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False)
-    
+            return True, OptimizationAttributes(
+                structure_type=SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False
+            )
+
     @classmethod
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
@@ -20,27 +25,37 @@ def layer_resources(self, layer_attributes):
             if 'Dense' in layer_attributes.layer_type.__name__:
                 return [2 * np.prod(layer_attributes.weight_shape) + layer_attributes.weight_shape[1]]
             elif 'Conv2D' in layer_attributes.layer_type.__name__:
-                return [2 * np.prod(layer_attributes.weight_shape) * layer_attributes.output_shape[0] * layer_attributes.output_shape[1] + layer_attributes.weight_shape[3]]
+                return [
+                    2
+                    * np.prod(layer_attributes.weight_shape)
+                    * layer_attributes.output_shape[0]
+                    * layer_attributes.output_shape[1]
+                    + layer_attributes.weight_shape[3]
+                ]
             else:
                 raise Exception('Unknown layer encountered when estimating FLOP utilization.')
-    
+
     @classmethod
     def layer_savings(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
-        
-        structure_type = layer_attributes.optimization_attributes.structure_type 
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
         pruning = layer_attributes.optimization_attributes.pruning
         weight_sharing = layer_attributes.optimization_attributes.weight_sharing
 
         if weight_sharing:
-            logging.warn('Weight sharing does not decrease FLOPs. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            logging.warn(
+                'Weight sharing does not decrease FLOPs. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
             return [0]
-        
+
         if not pruning:
-            logging.warn('Pruning needs to be enabled to decrease FLOPs. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            logging.warn(
+                'Pruning needs to be enabled to decrease FLOPs. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
             return [0]
 
         # TODO - The below formulas underestimate FLOP savings [removing a filter in a lyer removes channels / neurons in subsequent layers]
@@ -48,10 +63,18 @@ def layer_savings(self, layer_attributes):
             if 'Dense' in layer_attributes.layer_type.__name__:
                 return [2 * layer_attributes.weight_shape[0] + 1]
             elif 'Conv2D' in layer_attributes.layer_type.__name__:
-                return [2 * np.prod(layer_attributes.weight_shape[0:3]) * layer_attributes.output_shape[0] * layer_attributes.output_shape[1] + 1]
+                return [
+                    2
+                    * np.prod(layer_attributes.weight_shape[0:3])
+                    * layer_attributes.output_shape[0]
+                    * layer_attributes.output_shape[1]
+                    + 1
+                ]
             else:
                 raise Exception('Unknown layer encountered when estimating FLOP savings.')
         else:
-            logging.warn('FLOP savings occur with structured pruning. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
-            return [0]   
\ No newline at end of file
+            logging.warn(
+                'FLOP savings occur with structured pruning. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/objectives/vivado_objectives.py
index 7589ffed26..bba25dbb84 100644
--- a/hls4ml/optimization/objectives/vivado_objectives.py
+++ b/hls4ml/optimization/objectives/vivado_objectives.py
@@ -1,9 +1,12 @@
+import logging
 import math
-import logging 
+
 import numpy as np
+
+from hls4ml.optimization.attributes import OptimizationAttributes
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.objectives import ObjectiveEstimator
-from hls4ml.optimization.attributes import OptimizationAttributes
+
 
 # Optimizes DSP utilisation for Vivado backend
 class VivadoDSPEstimator(ObjectiveEstimator):
@@ -14,13 +17,16 @@ def is_layer_optimizable(self, layer_attributes):
         else:
             if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
                 return True, OptimizationAttributes(
-                            SUPPORTED_STRUCTURES.PATTERN, pruning = True, weight_sharing = False,
-                            pattern_offset=np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor, 
-                            consecutive_patterns=1
-                        )
+                    SUPPORTED_STRUCTURES.PATTERN,
+                    pruning=True,
+                    weight_sharing=False,
+                    pattern_offset=np.prod(layer_attributes.weight_shape)
+                    // layer_attributes.args['hls4ml_attributes'].reuse_factor,
+                    consecutive_patterns=1,
+                )
             else:
-                return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
-    
+                return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
     @classmethod
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
@@ -28,7 +34,7 @@ def layer_resources(self, layer_attributes):
         else:
             # TOOD - Extend for parallelisation factor
             return [np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor]
-    
+
     @classmethod
     def layer_savings(self, layer_attributes):
         if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
@@ -42,8 +48,8 @@ def layer_savings(self, layer_attributes):
                 It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
             )
             return [0]
-        
-        structure_type = layer_attributes.optimization_attributes.structure_type 
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
         if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
             if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
                 return [1]
@@ -56,43 +62,56 @@ def layer_savings(self, layer_attributes):
                 else:
                     return [0]
             elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
-                if layer_attributes.args['hls4ml_attributes'].reuse_factor == layer_attributes.args['hls4ml_attributes'].n_in:
+                if (
+                    layer_attributes.args['hls4ml_attributes'].reuse_factor
+                    == layer_attributes.args['hls4ml_attributes'].n_in
+                ):
                     return [1]
                 else:
                     return [0]
             elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
                 pattern_offset = layer_attributes.optimization_attributes.pattern_offset
-                number_of_patterns = np.prod(layer_attributes.weight_shape)// pattern_offset
+                number_of_patterns = np.prod(layer_attributes.weight_shape) // pattern_offset
 
                 if number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor:
                     return [layer_attributes.optimization_attributes.consecutive_patterns]
                 else:
-                    return [0]  
+                    return [0]
             elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
                 logging.warn('hls4ml does not support block sparsity patterns...setting layer savings to zero')
                 return [0]
 
-# Optimizes BRAM and DSP for Vivado backend     
+
+# Optimizes BRAM and DSP for Vivado backend
 class VivadoMultiObjectiveEstimator(ObjectiveEstimator):
     @classmethod
     def is_layer_optimizable(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return False, None
-        
-        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource' and layer_attributes.args['hls4ml_attributes'].reuse_factor > 1 :
+
+        if (
+            layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+            and layer_attributes.args['hls4ml_attributes'].reuse_factor > 1
+        ):
             if 36 % layer_attributes.args['hls4ml_attributes'].weight_precision.width == 0:
                 consecutive_patterns = int(36 // layer_attributes.args['hls4ml_attributes'].weight_precision.width)
             else:
-                consecutive_patterns = int(math.ceil(2 * 36 / layer_attributes.args['hls4ml_attributes'].weight_precision.width))
-            
-            return True, OptimizationAttributes(
-                SUPPORTED_STRUCTURES.PATTERN, pruning = True, weight_sharing = False,
-                    pattern_offset=int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor), 
-                    consecutive_patterns=consecutive_patterns
+                consecutive_patterns = int(
+                    math.ceil(2 * 36 / layer_attributes.args['hls4ml_attributes'].weight_precision.width)
                 )
+
+            return True, OptimizationAttributes(
+                SUPPORTED_STRUCTURES.PATTERN,
+                pruning=True,
+                weight_sharing=False,
+                pattern_offset=int(
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor
+                ),
+                consecutive_patterns=consecutive_patterns,
+            )
         else:
-            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
-    
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
     @classmethod
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
@@ -115,14 +134,20 @@ def layer_resources(self, layer_attributes):
                 # For RF > 1, BRAM utilised by weights can be estimated by (bit_width * n_in * n_out) / (RF * 36)
                 return [
                     int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
-                    int(math.ceil(np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width / (layer_attributes.args['hls4ml_attributes'].reuse_factor * 36))),
+                    int(
+                        math.ceil(
+                            np.prod(layer_attributes.weight_shape)
+                            * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                            / (layer_attributes.args['hls4ml_attributes'].reuse_factor * 36)
+                        )
+                    ),
                 ]
-    
+
     @classmethod
     def layer_savings(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
-        
+
         # TODO - Once we know how to implement constant coefficient multiplication via LUT, enable for weight sharing
         pruning = layer_attributes.optimization_attributes.pruning
         if not pruning:
@@ -131,15 +156,18 @@ def layer_savings(self, layer_attributes):
                 It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
             )
             return [0]
-        
-        structure_type = layer_attributes.optimization_attributes.structure_type 
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
         if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
             if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
-                    return [1, 0]
+                return [1, 0]
             else:
-                    return [0, 0]
+                return [0, 0]
         else:
-            if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource' and layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+            if (
+                layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+                and layer_attributes.args['hls4ml_attributes'].reuse_factor == 1
+            ):
                 return [1, 0]
             else:
                 if structure_type == SUPPORTED_STRUCTURES.PATTERN:
@@ -148,121 +176,191 @@ def layer_savings(self, layer_attributes):
                     weight_precision = layer_attributes.args['hls4ml_attributes'].weight_precision.width
 
                     number_of_patterns = np.prod(layer_attributes.weight_shape) // pattern_offset
-                    saved_one_bram_block = \
-                        (36 == consecutive_patterns * weight_precision and 36 % weight_precision == 0) \
-                        or (72 == consecutive_patterns * weight_precision)
+                    saved_one_bram_block = (
+                        36 == consecutive_patterns * weight_precision and 36 % weight_precision == 0
+                    ) or (72 == consecutive_patterns * weight_precision)
 
-                    if number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor and saved_one_bram_block:
+                    if (
+                        number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor
+                        and saved_one_bram_block
+                    ):
                         return [consecutive_patterns, 1]
                     else:
                         logging.warn('Support for multi-objective optimisation is not fully implemented yet....')
-                        return [0, 0]  
+                        return [0, 0]
                 else:
                     logging.warn('Support for multi-objective optimisation is not fully implemented yet....')
                     return [0, 0]
-                
+
+
 class VivadoFFEstimator(ObjectiveEstimator):
     @classmethod
     def is_layer_optimizable(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return False, None
-        
+
         # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; skipping
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
             logging.warn('FFs are at minimum utilization with io_stream and Resource strategy')
             return False, None
 
         # With io_stream in Latency, weight are stored in FFs, so unstructured pruning will benefit the most
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
-            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
 
         # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so structured pruning is the most suitable, it reduces the size out output before compile-time
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
-            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.STRUCTURED, pruning = True, weight_sharing = False)
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False)
 
         # In io_parallel with Latency, weights and activation tensors are all stored in FFs, so it is equivalent to unstructured, high sparsity pruning
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
-            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning = True, weight_sharing = False)
-    
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
     # TODO - This method is inaccurate (accross all cases); in general, estimating FFs is hard, but as long as it is consistent(ly wrong), it should not matter for the pruning
     @classmethod
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
-        
+
         # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; minimal FF utilization
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
             return [0]
 
         # With io_stream in Latency, weight are stored in FFs, so FF ~ number_of_weights x weight_precision
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
-            return [np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width ]
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return [
+                np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+            ]
 
         # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so FF ~ number_of_outputs x weight_precision
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
-            return [np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width ]
-    
-        # In io_parallel with Latency, weights and latency are all stored in FFs, so it is equivalent to the sum of the above two cases
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
             return [
-                np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width +
                 np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width
             ]
-    
+
+        # In io_parallel with Latency, weights and latency are all stored in FFs, so it is equivalent to the sum of the above two cases
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return [
+                np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                + np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width
+            ]
+
     @classmethod
     def layer_savings(self, layer_attributes):
         if not layer_attributes.weight_shape:
             return [0]
-        
-        structure_type = layer_attributes.optimization_attributes.structure_type 
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
         pruning = layer_attributes.optimization_attributes.pruning
         weight_sharing = layer_attributes.optimization_attributes.weight_sharing
-        
+
         if weight_sharing:
-            logging.warn('Weight sharing does not decrease the number of parameters. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            logging.warn(
+                'Weight sharing does not decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
             return [0]
-        
+
         if not pruning:
-            logging.warn('Pruning needs to be enabled to decrease the number of parameters. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)')
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
             return [0]
 
         # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; minimal FF utilization
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
             return [0]
 
         # With io_stream in Latency, weight are stored in FFs, so any type of pruning will help:
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
             if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
                 return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
             elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
-                return [layer_attributes.args['hls4ml_attributes'].n_in * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+                return [
+                    layer_attributes.args['hls4ml_attributes'].n_in
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
             elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
-                number_of_patterns = np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
-                return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+                number_of_patterns = (
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                )
+                return [
+                    number_of_patterns
+                    * layer_attributes.optimization_attributes.consecutive_patterns
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
             elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
-                return [np.prod(layer_attributes.optimization_attributes.block_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+                return [
+                    np.prod(layer_attributes.optimization_attributes.block_shape)
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
 
         # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so only structured pruning helps
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
             if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
                 return [layer_attributes.args['hls4ml_attributes'].output_precision.width]
             else:
                 return [0]
-            
+
         # In io_parallel with Latency, weights and latency are all stored in FFs, so any type of pruning helps
-        if layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel' and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
             # This is a significant under-estimate, as some savings are incurred due to less intermediate results
             if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
                 return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
             elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
                 return [
-                    layer_attributes.args['hls4ml_attributes'].n_in * layer_attributes.args['hls4ml_attributes'].weight_precision.width +
-                    layer_attributes.args['hls4ml_attributes'].output_precision.width
-                    ]
+                    layer_attributes.args['hls4ml_attributes'].n_in
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                    + layer_attributes.args['hls4ml_attributes'].output_precision.width
+                ]
             elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
-                number_of_patterns = np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
-                return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+                number_of_patterns = (
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                )
+                return [
+                    number_of_patterns
+                    * layer_attributes.optimization_attributes.consecutive_patterns
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
             elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
-                return [np.prod(layer_attributes.optimization_attributes.block_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+                return [
+                    np.prod(layer_attributes.optimization_attributes.block_shape)
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/scheduler.py
index fe02688fd7..9b390dc8a4 100644
--- a/hls4ml/optimization/scheduler.py
+++ b/hls4ml/optimization/scheduler.py
@@ -1,29 +1,30 @@
 from abc import ABC, abstractmethod
 
+
 class OptimizationScheduler(ABC):
     '''
     Baseline class handling logic regarding target sparsity and its updates at every step
     '''
-    
+
     def __init__(self, initial_sparsity=0, final_sparsity=1):
         '''
         intial_sparsity and final_sparsity are between 0.0 and 1.0, NOT 0% and 100%
         '''
-        if (initial_sparsity < 0 or initial_sparsity > 1):
+        if initial_sparsity < 0 or initial_sparsity > 1:
             raise Exception('intial_sparsity must be between 0.0 and 1.0')
-        if (final_sparsity < 0 or final_sparsity > 1):
+        if final_sparsity < 0 or final_sparsity > 1:
             raise Exception('final_sparsity must be between 0.0 and 1.0')
         self.sparsity = initial_sparsity
         self.lower_bound = initial_sparsity
         self.upper_bound = final_sparsity
-    
+
     @abstractmethod
     def update_step(self):
         '''
         Increments the current sparsity, according to the rule, examples:
             - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55
             - BinaryScheduler, sparsity = 0.5, target = 1.0 -> sparsity = 0.75
-        
+
         Return:
             - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
             - sparsity (float) - Updated sparsity
@@ -37,7 +38,7 @@ def repair_step(self):
         Then, the target sparsity is decreased according to the rule, examples:
             - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55 [see ConstantScheduler for explanation]
             - BinaryScheduler, sparsity = 0.75, target = 1.0, previous = 0.5 -> sparsity = (0.5 + 0.75) / 2 = 0.625
-        
+
         Return:
             - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
             - sparsity (float) - Updated sparsity
@@ -47,14 +48,16 @@ def repair_step(self):
     def get_sparsity(self):
         return self.sparsity
 
+
 class ConstantScheduler(OptimizationScheduler):
     '''
     Sparsity updated by a constant term, until (i) sparsity target reached (ii) optimization algorithm stops requesting state updates
     '''
+
     def __init__(self, initial_sparsity=0, final_sparsity=1.0, update_step=0.05):
         self.increment = update_step
-        super(ConstantScheduler, self).__init__(initial_sparsity, final_sparsity)
-    
+        super().__init__(initial_sparsity, final_sparsity)
+
     def update_step(self):
         if self.sparsity + self.increment <= self.upper_bound:
             self.sparsity += self.increment
@@ -66,21 +69,24 @@ def update_step(self):
     In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
     In this case, constant sparsity (since it increments by a small amount every time), will simply jump to the next sparsity level
     The model's performance over several sparsity levels optimization is stoped after high loss over several trials (see top level pruning/optimization function)
-    
+
     '''
+
     def repair_step(self):
         return self.update_step()
-    
+
+
 class BinaryScheduler(OptimizationScheduler):
     '''
     Sparsity updated by binary halving the search space; constantly updates lower and upper bounds
     In the update step, sparsity is incremented, as the midpoint between previous sparsity and target sparsity (upper bound)
     In the repair step, sparsity is decrement, as the midpoint between between the lower bound and previous sparsity
     '''
+
     def __init__(self, initial_sparsity=0, final_sparsity=1.0, threshold=0.01):
         self.threshold = threshold
-        super(BinaryScheduler, self).__init__(initial_sparsity, final_sparsity)
-    
+        super().__init__(initial_sparsity, final_sparsity)
+
     def update_step(self):
         if self.upper_bound - self.sparsity >= self.threshold:
             self.lower_bound = self.sparsity
@@ -99,30 +105,35 @@ def repair_step(self):
             self.upper_bound = self.sparsity
             return False, self.sparsity
 
+
 class PolynomialScheduler(OptimizationScheduler):
     '''
     Sparsity updated by at a polynomial decay, until (i) sparsity target reached (ii) optimization algorithm stops requesting state updates
     For more information, see Zhu & Gupta (2016) - 'To prune, or not to prune: exploring the efficacy of pruning for model compression' and TensorFlow Model Optimization library
     Note, the implementation is slightly different, as TensorFlow Prune API depends on the total number of epochs and update frequency
     '''
+
     def __init__(self, maximum_steps, initial_sparsity=0, final_sparsity=1.0, decay_power=3):
         self.decay_power = decay_power
         self.current_step = 0
         self.maximum_steps = maximum_steps
-        super(PolynomialScheduler, self).__init__(initial_sparsity, final_sparsity)
-    
+        super().__init__(initial_sparsity, final_sparsity)
+
     def update_step(self):
         if self.current_step < self.maximum_steps:
             self.current_step += 1
-            self.sparsity = self.upper_bound + (self.lower_bound - self.upper_bound) * ((1 - self.current_step / self.maximum_steps) ** self.decay_power)
+            self.sparsity = self.upper_bound + (self.lower_bound - self.upper_bound) * (
+                (1 - self.current_step / self.maximum_steps) ** self.decay_power
+            )
             return True, self.sparsity
         else:
             return False, self.sparsity
-        
+
     '''
     In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
     In this case, polynomial sparsity, will simply jump to the next sparsity level
     The model's performance over several sparsity levels optimization is stoped after high loss over several trials (see top level pruning/optimization function)
     '''
+
     def repair_step(self):
         return self.update_step()
diff --git a/test/pytest/optimization/keras/test_masking.py b/test/pytest/optimization/keras/test_masking.py
index 6726dcc638..aa9b9be564 100644
--- a/test/pytest/optimization/keras/test_masking.py
+++ b/test/pytest/optimization/keras/test_masking.py
@@ -1,12 +1,13 @@
-import pytest
 import numpy as np
-from qkeras import QDense, QConv2D
+import pytest
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Conv2D, Flatten
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.keras.masking import get_model_masks
 from hls4ml.optimization.objectives import ParameterEstimator
-from hls4ml.optimization.attributes import get_attributes_from_keras_model
 
 '''
 In all the tests, an artifical network with one Dense/Conv2D layer and pre-determined weights is created
@@ -44,18 +45,13 @@ def test_dense_masking_unstructured(local_masking, dense):
     model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.UNSTRUCTURED
 
     # 33% sparsity - zero 2 out of 6 blocks with lowest norm [0.33 * 6 = 1.98 -> next largest int is 2]
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
-    zeros = np.array([
-        [0, 0],
-        [1, 2]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [1, 2]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
 
-    assert (not np.any(offsets['dense']))
-    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
-    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Create a Dense layer with artificial weights, so that the 1st and 3rd column (neuron) are pruned
@@ -78,18 +74,23 @@ def test_dense_masking_structured(local_masking, dense):
     model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
 
     # 33% sparsity - zero 2 out of 6 blocks with lowest norm [0.33 * 6 = 1.98 -> next largest int is 2]
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
-    zeros = np.array([
-        [0, 0], [1, 0], [2, 0],  # First neuron
-        [0, 2], [1, 2], [2, 2],  # Third neuron
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array(
+        [
+            [0, 0],
+            [1, 0],
+            [2, 0],  # First neuron
+            [0, 2],
+            [1, 2],
+            [2, 2],  # Third neuron
+        ],
+        dtype=np.int32,
+    )
     nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
 
-    assert (not np.any(offsets['dense']))
-    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
-    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Create a Dense layer with artificial weights, so that some patterns are pruned
@@ -125,18 +126,13 @@ def test_dense_masking_pattern(local_masking, dense):
     model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
 
     # 33% sparsity - zero 4 out of 12 weights, group by pattern [0.33 * 12 = 3.96] - so will select 2 patterns, 6 weights (>=)
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
-    zeros = np.array([
-        [0, 0], [1, 0], [2, 0],
-        [0, 2], [1, 2], [2, 2]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [1, 0], [2, 0], [0, 2], [1, 2], [2, 2]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
 
-    assert (not np.any(offsets['dense']))
-    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
-    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Create a Dense layer with artificial weights, so that the 1st and 4th block are pruned
@@ -172,18 +168,13 @@ def test_dense_masking_block(local_masking, dense):
 
     # 33% sparsity - zero 2 out of 6 blocks with lowest norm
     # The first block is the smallest, the fourth block is set to zero
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
-    zeros = np.array([
-        [0, 0], [0, 1], [1, 0], [1, 1],
-        [2, 2], [2, 3], [3, 2], [3, 3]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [2, 3], [3, 2], [3, 3]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
 
-    assert (not np.any(offsets['dense']))
-    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
-    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Create a Conv2D layer with artificial weights and mask some small weights
@@ -215,15 +206,13 @@ def test_conv2d_masking_unstructured(local_masking, conv2d):
     model_attributes['conv2d'].optimization_attributes.pruning = True
     model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.UNSTRUCTURED
 
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
-    zeros = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1]],
-                     dtype=np.int32)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
 
-    assert (not np.any(offsets['conv2d']))
-    assert (not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]]))
-    assert ((filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['conv2d'])
+    assert not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]])
+    assert (filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Create a Conv2D layer with artificial weights, so that second and last filter are pruned
@@ -253,20 +242,21 @@ def test_conv2d_masking_structured(local_masking, conv2d):
     # 33% sparsity - zero 2 out of 6 filters with lowest norm
     # Generate all possible combinations of width and height pixels with channel using np.meshgrid()
     # This represents all the positions for a single filter; then append filter position to the last columns
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
     width_pixels = np.array(range(0, filt_width))
     height_pixels = np.array(range(0, filt_height))
     channels = np.array(range(0, n_channels))
     combinations = np.array(np.meshgrid(width_pixels, height_pixels, channels)).T.reshape(-1, 3)
-    zeros = np.array(np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 1), axis=1).tolist() +
-                     np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 5), axis=1).tolist(),
-                     dtype=np.int32)
+    zeros = np.array(
+        np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 1), axis=1).tolist()
+        + np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 5), axis=1).tolist(),
+        dtype=np.int32,
+    )
     nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
 
-    assert (not np.any(offsets['conv2d']))
-    assert (not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]]))
-    assert ((filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['conv2d'])
+    assert not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]])
+    assert (filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Create a Conv2D layer with artificial weights, so that the first and second pattern are pruned
@@ -303,16 +293,15 @@ def test_conv2d_masking_pattern(local_masking, conv2d):
     model_attributes['conv2d'].optimization_attributes.pattern_offset = 4
 
     # 33% sparsity - zero out the two of the lowest groups
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
     print(masks['conv2d'].shape)
     print(weights[0].shape)
     zeros = np.array([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 1]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
 
-    assert (not np.any(offsets['conv2d']))
-    assert (not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]]))
-    assert ((filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['conv2d'])
+    assert not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]])
+    assert (filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0])
 
 
 # Block pruning is only allowed for 2-dimensional matrices, so assert a correct exception is raised when pruning with Conv2D
@@ -332,17 +321,24 @@ def test_conv2d_block_masking_raises_exception(local_masking, conv2d):
     try:
         get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
     except Exception:
-        assert (True)
+        assert True
         return
-    assert (False)
+    assert False
 
 
 # Test edge cases: 0% and 100% sparsity
 # Test 50% sparsity with two layers
 @pytest.mark.parametrize('s', [0, 0.5, 1])
 @pytest.mark.parametrize('local_masking', local_masking)
-@pytest.mark.parametrize('type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED,
-                                  SUPPORTED_STRUCTURES.PATTERN, SUPPORTED_STRUCTURES.BLOCK])
+@pytest.mark.parametrize(
+    'type',
+    [
+        SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        SUPPORTED_STRUCTURES.STRUCTURED,
+        SUPPORTED_STRUCTURES.PATTERN,
+        SUPPORTED_STRUCTURES.BLOCK,
+    ],
+)
 def test_multi_layer_masking(s, local_masking, type):
     dense_units = 16
     conv_filters = 6
@@ -352,8 +348,16 @@ def test_multi_layer_masking(s, local_masking, type):
 
     # Simple model, Conv2D weight shape (2, 2, 4, 6) and Dense weight shape (384, 16)
     model = Sequential()
-    model.add(Conv2D(conv_filters, input_shape=(*input_shape, conv_channels), kernel_size=conv_shape, name='conv2d',
-                     padding='same', kernel_initializer='ones'))
+    model.add(
+        Conv2D(
+            conv_filters,
+            input_shape=(*input_shape, conv_channels),
+            kernel_size=conv_shape,
+            name='conv2d',
+            padding='same',
+            kernel_initializer='ones',
+        )
+    )
     model.add(Flatten())
     model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
 
@@ -366,31 +370,33 @@ def test_multi_layer_masking(s, local_masking, type):
 
     model_attributes['conv2d'].optimizable = True
     model_attributes['conv2d'].optimization_attributes.pruning = True
-    model_attributes[
-        'conv2d'].optimization_attributes.structure_type = type if type != SUPPORTED_STRUCTURES.BLOCK else SUPPORTED_STRUCTURES.UNSTRUCTURED
+    model_attributes['conv2d'].optimization_attributes.structure_type = (
+        type if type != SUPPORTED_STRUCTURES.BLOCK else SUPPORTED_STRUCTURES.UNSTRUCTURED
+    )
     model_attributes['conv2d'].optimization_attributes.pattern_offset = 4  # Equivalent to RF = 4 (2 * 2 * 4 * 6 / 4)
 
     masks, offsets = get_model_masks(model, model_attributes, s, ParameterEstimator, metric='l1', local=local_masking)
     if s == 1:  # 100% sparsity - all masks are zero
         print(np.count_nonzero(masks['dense'].flatten()))
-        assert (not np.any(masks['dense']))
-        assert (not np.any(masks['conv2d']))
+        assert not np.any(masks['dense'])
+        assert not np.any(masks['conv2d'])
     elif s == 0.5:
         conv2d_weights = conv_channels * conv_filters * np.prod(conv_shape)
         dense_weights = dense_units * np.prod(input_shape) * conv_filters
         if local_masking:
-            assert (np.count_nonzero(masks['conv2d']) == int((1 - s) * conv2d_weights))
-            assert (np.count_nonzero(masks['dense']) == int((1 - s) * dense_weights))
+            assert np.count_nonzero(masks['conv2d']) == int((1 - s) * conv2d_weights)
+            assert np.count_nonzero(masks['dense']) == int((1 - s) * dense_weights)
         else:
             # Less than or equal to, since Knapsack problem imposes a hard constrain on the active resources (ones)
-            assert (np.count_nonzero(masks['conv2d']) + np.count_nonzero(masks['dense']) <= int(
-                (1 - s) * (conv2d_weights + dense_weights)))
+            assert np.count_nonzero(masks['conv2d']) + np.count_nonzero(masks['dense']) <= int(
+                (1 - s) * (conv2d_weights + dense_weights)
+            )
     elif s == 0:  # 0% sparsity - all masks are one
-        assert (np.all(masks['dense']))
-        assert (np.all(masks['conv2d']))
+        assert np.all(masks['dense'])
+        assert np.all(masks['conv2d'])
 
-    assert (not np.any(offsets['dense']))
-    assert (not np.any(offsets['conv2d']))
+    assert not np.any(offsets['dense'])
+    assert not np.any(offsets['conv2d'])
 
 
 # Create a Dense layer with artificial weights, so that some consecutive patterns are pruned
@@ -433,15 +439,10 @@ def test_consecutive_pattern_masking(local_masking, dense):
     model_attributes['dense'].optimization_attributes.consecutive_patterns = 2
 
     # 33% sparsity - zero 4 out of 12 weight, group by pattern [0.33 * 12 = 3.96]
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1',
-                                     local=local_masking)
-    zeros = np.array([
-        [0, 0], [1, 0], [2, 0],
-        [0, 1], [1, 1], [2, 1]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [1, 0], [2, 0], [0, 1], [1, 1], [2, 1]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
 
-    assert (not np.any(offsets['dense']))
-    assert (not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]]))
-    assert ((weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0]))
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
diff --git a/test/pytest/optimization/keras/test_reduction.py b/test/pytest/optimization/keras/test_reduction.py
index 3b9e75cd6e..1f3a0084a2 100644
--- a/test/pytest/optimization/keras/test_reduction.py
+++ b/test/pytest/optimization/keras/test_reduction.py
@@ -1,7 +1,7 @@
-from qkeras import quantized_bits
-from qkeras import QDense, QActivation, QConv2D
+from qkeras import QActivation, QConv2D, QDense, quantized_bits
+from tensorflow.keras.layers import AveragePooling2D, BatchNormalization, Conv2D, Dense, Flatten, MaxPooling2D, ReLU, Softmax
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Softmax, BatchNormalization, ReLU, Flatten, AveragePooling2D
+
 from hls4ml.optimization.keras.reduction import reduce_model
 from hls4ml.optimization.keras.utils import get_model_sparsity
 
@@ -11,6 +11,7 @@
 Test verify the above property, by setting some zeros in the last layer and verifying these remain in place
 '''
 
+
 def test_keras_model_reduction():
     model = Sequential()
     model.add(Conv2D(8, (3, 3), input_shape=(64, 64, 1), name='conv2d_1', padding='same'))
@@ -22,11 +23,11 @@ def test_keras_model_reduction():
     model.add(BatchNormalization())
     model.add(ReLU())
     model.add(Flatten())
-    model.add(Dense(32, input_shape=(16, ), name = 'dense_1', activation='relu'))
+    model.add(Dense(32, input_shape=(16,), name='dense_1', activation='relu'))
     model.add(BatchNormalization())
-    model.add(Dense(14, name = 'dense_2', activation='relu'))
+    model.add(Dense(14, name='dense_2', activation='relu'))
     model.add(BatchNormalization())
-    model.add(Dense(5, name = 'dense_3'))
+    model.add(Dense(5, name='dense_3'))
     model.add(Softmax())
 
     indices = {
@@ -45,29 +46,30 @@ def test_keras_model_reduction():
             weights = layer.get_weights()
             weights[0][:, :, :, indices[layer.name]] = 0
             layer.set_weights(weights)
-   
+
     sparsity, _ = get_model_sparsity(model)
-    assert(sparsity > 0)
+    assert sparsity > 0
 
     reduced = reduce_model(model)
-    assert(reduced.get_layer('conv2d_1').get_weights()[0].shape == (3, 3, 1, 5))
-    assert(reduced.get_layer('conv2d_2').get_weights()[0].shape == (5, 5, 5, 26))
-    assert(reduced.get_layer('dense_1').get_weights()[0].shape == (6656, 28))
-    assert(reduced.get_layer('dense_2').get_weights()[0].shape == (28, 11))
-    assert(reduced.get_layer('dense_3').get_weights()[0].shape == (11, 5))
+    assert reduced.get_layer('conv2d_1').get_weights()[0].shape == (3, 3, 1, 5)
+    assert reduced.get_layer('conv2d_2').get_weights()[0].shape == (5, 5, 5, 26)
+    assert reduced.get_layer('dense_1').get_weights()[0].shape == (6656, 28)
+    assert reduced.get_layer('dense_2').get_weights()[0].shape == (28, 11)
+    assert reduced.get_layer('dense_3').get_weights()[0].shape == (11, 5)
 
     _, layer_sparsity = get_model_sparsity(reduced)
-    assert(layer_sparsity['conv2d_1'] == 0)
-    assert(layer_sparsity['conv2d_2'] == 0)
-    assert(layer_sparsity['dense_1'] == 0)
-    assert(layer_sparsity['dense_2'] == 0)
-    assert(layer_sparsity['dense_3'] > 0)
+    assert layer_sparsity['conv2d_1'] == 0
+    assert layer_sparsity['conv2d_2'] == 0
+    assert layer_sparsity['dense_1'] == 0
+    assert layer_sparsity['dense_2'] == 0
+    assert layer_sparsity['dense_3'] > 0
+
 
 def test_qkeras_model_reduction():
     bits = 8
     activation = 'quantized_relu(4)'
     quantizer = quantized_bits(bits, 0)
-    
+
     model = Sequential()
     model.add(QConv2D(8, (3, 3), input_shape=(64, 64, 1), name='qconv2d_1', padding='same', kernel_quantizer=quantizer))
     model.add(MaxPooling2D())
@@ -78,13 +80,13 @@ def test_qkeras_model_reduction():
     model.add(BatchNormalization())
     model.add(QActivation(activation, name='qrelu_2'))
     model.add(Flatten())
-    model.add(QDense(32, input_shape=(16, ), name = 'qdense_1', kernel_quantizer=quantizer))
+    model.add(QDense(32, input_shape=(16,), name='qdense_1', kernel_quantizer=quantizer))
     model.add(QActivation(activation, name='qrelu_3'))
     model.add(BatchNormalization())
-    model.add(QDense(14, name = 'qdense_2', kernel_quantizer=quantizer))
+    model.add(QDense(14, name='qdense_2', kernel_quantizer=quantizer))
     model.add(QActivation(activation, name='qrelu_4'))
     model.add(BatchNormalization())
-    model.add(QDense(5, name = 'qdense_3', kernel_quantizer=quantizer))
+    model.add(QDense(5, name='qdense_3', kernel_quantizer=quantizer))
     model.add(Softmax())
 
     indices = {
@@ -103,32 +105,31 @@ def test_qkeras_model_reduction():
             weights = layer.get_weights()
             weights[0][:, :, :, indices[layer.name]] = 0
             layer.set_weights(weights)
-   
+
     sparsity, _ = get_model_sparsity(model)
-    assert(sparsity > 0)
+    assert sparsity > 0
 
     reduced = reduce_model(model)
-    assert(reduced.get_layer('qconv2d_1').get_weights()[0].shape == (3, 3, 1, 5))
-    assert(reduced.get_layer('qconv2d_2').get_weights()[0].shape == (5, 5, 5, 26))
-    assert(reduced.get_layer('qdense_1').get_weights()[0].shape == (6656, 28))
-    assert(reduced.get_layer('qdense_2').get_weights()[0].shape == (28, 11))
-    assert(reduced.get_layer('qdense_3').get_weights()[0].shape == (11, 5))
+    assert reduced.get_layer('qconv2d_1').get_weights()[0].shape == (3, 3, 1, 5)
+    assert reduced.get_layer('qconv2d_2').get_weights()[0].shape == (5, 5, 5, 26)
+    assert reduced.get_layer('qdense_1').get_weights()[0].shape == (6656, 28)
+    assert reduced.get_layer('qdense_2').get_weights()[0].shape == (28, 11)
+    assert reduced.get_layer('qdense_3').get_weights()[0].shape == (11, 5)
 
     _, layer_sparsity = get_model_sparsity(reduced)
-    assert(layer_sparsity['qconv2d_1'] == 0)
-    assert(layer_sparsity['qconv2d_2'] == 0)
-    assert(layer_sparsity['qdense_1'] == 0)
-    assert(layer_sparsity['qdense_2'] == 0)
-    assert(layer_sparsity['qdense_3'] > 0)
+    assert layer_sparsity['qconv2d_1'] == 0
+    assert layer_sparsity['qconv2d_2'] == 0
+    assert layer_sparsity['qdense_1'] == 0
+    assert layer_sparsity['qdense_2'] == 0
+    assert layer_sparsity['qdense_3'] > 0
 
     # Verify network surgery has no impact on quantization
-    assert(isinstance(reduced.get_layer('qrelu_1'), QActivation))
-    assert(isinstance(reduced.get_layer('qrelu_2'), QActivation))
-    assert(isinstance(reduced.get_layer('qrelu_3'), QActivation))
-    assert(isinstance(reduced.get_layer('qrelu_4'), QActivation))
-    assert(reduced.get_layer('qconv2d_1').kernel_quantizer['config']['bits'] == bits)
-    assert(reduced.get_layer('qconv2d_2').kernel_quantizer['config']['bits'] == bits)
-    assert(reduced.get_layer('qdense_1').kernel_quantizer['config']['bits'] == bits)
-    assert(reduced.get_layer('qdense_2').kernel_quantizer['config']['bits'] == bits)
-    assert(reduced.get_layer('qdense_3').kernel_quantizer['config']['bits'] == bits)
-    
\ No newline at end of file
+    assert isinstance(reduced.get_layer('qrelu_1'), QActivation)
+    assert isinstance(reduced.get_layer('qrelu_2'), QActivation)
+    assert isinstance(reduced.get_layer('qrelu_3'), QActivation)
+    assert isinstance(reduced.get_layer('qrelu_4'), QActivation)
+    assert reduced.get_layer('qconv2d_1').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qconv2d_2').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qdense_1').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qdense_2').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qdense_3').kernel_quantizer['config']['bits'] == bits
diff --git a/test/pytest/optimization/keras/test_regularizers.py b/test/pytest/optimization/keras/test_regularizers.py
index 64d199a37e..9fe518caae 100644
--- a/test/pytest/optimization/keras/test_regularizers.py
+++ b/test/pytest/optimization/keras/test_regularizers.py
@@ -1,13 +1,14 @@
-import pytest
 import numpy as np
+import pytest
 import tensorflow as tf
-from qkeras import QDense, QConv2D
-from tensorflow.keras.optimizers import Adam
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Conv2D, Flatten
+from tensorflow.keras.optimizers import Adam
+
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.keras.builder import remove_custom_regularizers
-from hls4ml.optimization.keras.regularizers import DenseRegularizer, Conv2DRegularizer
+from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer
 
 # Constants
 pattern_offset = 4
@@ -17,9 +18,11 @@
 dense_layers = [Dense, QDense]
 conv2d_layers = [Conv2D, QConv2D]
 
+
 # Sets the loss due to data to zero; train model only on regularization loss
 def zero_loss(y_true, y_pred):
-    return tf.reduce_mean(0 * tf.square(y_true - y_pred), axis=-1) 
+    return tf.reduce_mean(0 * tf.square(y_true - y_pred), axis=-1)
+
 
 # Helper function, calculates the group norm and variance for a single layer
 def get_norm_and_variance(weights, structure_type, layer='dense'):
@@ -27,7 +30,7 @@ def get_norm_and_variance(weights, structure_type, layer='dense'):
         norm = np.linalg.norm(weights.flatten(), ord=1)
         var = np.var(weights)
         return norm, var
-    
+
     if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
         if layer == 'conv2d':
             norm = np.linalg.norm(np.sum(np.linalg.norm(weights, axis=(0, 1), ord='fro'), axis=0), ord=1)
@@ -35,54 +38,65 @@ def get_norm_and_variance(weights, structure_type, layer='dense'):
         else:
             norm = np.linalg.norm(np.linalg.norm(weights, axis=0, ord=2), ord=1)
             var = np.linalg.norm(np.var(weights, axis=0), ord=1)
-        
+
         return norm, var
-    
+
     if structure_type == SUPPORTED_STRUCTURES.PATTERN:
         number_of_patterns = np.prod(weights.shape) // pattern_offset
         target_shape = (number_of_patterns, pattern_offset)
         reshaped = np.reshape(weights, target_shape)
         total_blocks = pattern_offset // consecutive_patterns
         blocks = np.reshape(
-                            tf.image.extract_patches(
-                                np.expand_dims(np.expand_dims(reshaped, 2), 0),
-                                [1, number_of_patterns, consecutive_patterns, 1],
-                                [1, number_of_patterns, consecutive_patterns, 1], 
-                                [1, 1, 1, 1], 'SAME'
-                            ).numpy(),
-                            (total_blocks, number_of_patterns * consecutive_patterns)
-                        )
+            tf.image.extract_patches(
+                np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                [1, number_of_patterns, consecutive_patterns, 1],
+                [1, number_of_patterns, consecutive_patterns, 1],
+                [1, 1, 1, 1],
+                'SAME',
+            ).numpy(),
+            (total_blocks, number_of_patterns * consecutive_patterns),
+        )
         norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
         var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
         return norm, var
 
-    if structure_type == SUPPORTED_STRUCTURES.BLOCK: 
+    if structure_type == SUPPORTED_STRUCTURES.BLOCK:
         total_blocks = (weights.shape[0] * weights.shape[1]) // (block_shape[0] * block_shape[1])
         blocks = np.reshape(
-                            tf.image.extract_patches(
-                                np.expand_dims(np.expand_dims(weights, 2), 0),
-                                [1, block_shape[0], block_shape[1], 1],
-                                [1, block_shape[0], block_shape[1], 1], 
-                                [1, 1, 1, 1], 'SAME'
-                            ).numpy(),
-                            (total_blocks, block_shape[0] * block_shape[1])
-                        )
-        
+            tf.image.extract_patches(
+                np.expand_dims(np.expand_dims(weights, 2), 0),
+                [1, block_shape[0], block_shape[1], 1],
+                [1, block_shape[0], block_shape[1], 1],
+                [1, 1, 1, 1],
+                'SAME',
+            ).numpy(),
+            (total_blocks, block_shape[0] * block_shape[1]),
+        )
+
         norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
         var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
         return norm, var
 
+
 @pytest.mark.parametrize('dense', dense_layers)
-@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED, SUPPORTED_STRUCTURES.PATTERN, SUPPORTED_STRUCTURES.BLOCK])
+@pytest.mark.parametrize(
+    'structure_type',
+    [
+        SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        SUPPORTED_STRUCTURES.STRUCTURED,
+        SUPPORTED_STRUCTURES.PATTERN,
+        SUPPORTED_STRUCTURES.BLOCK,
+    ],
+)
 def test_dense_regularizer(structure_type, dense):
     epochs = 10
     data_points = 10
-    input_shape = (32, )
-    output_shape = (16, )
+    input_shape = (32,)
+    output_shape = (16,)
     X = np.random.rand(data_points, *input_shape)
     y = np.random.rand(data_points, *output_shape)
     w = np.random.rand(input_shape[0], output_shape[0])
-    
+
     # First, fit a model without regularization
     model = Sequential()
     model.add(dense(output_shape[0], input_shape=input_shape))
@@ -97,18 +111,23 @@ def test_dense_regularizer(structure_type, dense):
     dense_weights = model.layers[0].get_weights()
     dense_weights[0] = w
     model.layers[0].set_weights(dense_weights)
-    regularizer = DenseRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset, block_shape=block_shape)
-    model.layers[0].add_loss(lambda layer = model.layers[0] : regularizer(layer.kernel))
+    regularizer = DenseRegularizer(
+        alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset, block_shape=block_shape
+    )
+    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
     model.compile(loss=zero_loss, optimizer=Adam(1.0))
     model.fit(X, y, epochs=epochs)
     reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type)
 
     # Verify regularization decreased weight magnitude and variance
-    assert(reg_norm < norm)
-    assert(reg_var < var)
+    assert reg_norm < norm
+    assert reg_var < var
+
 
 @pytest.mark.parametrize('conv2d', conv2d_layers)
-@pytest.mark.parametrize('structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED, SUPPORTED_STRUCTURES.PATTERN])
+@pytest.mark.parametrize(
+    'structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED, SUPPORTED_STRUCTURES.PATTERN]
+)
 def test_conv2d_regularizer(structure_type, conv2d):
     epochs = 10
     data_points = 10
@@ -116,7 +135,7 @@ def test_conv2d_regularizer(structure_type, conv2d):
     num_filters = 4
     X = np.random.rand(data_points, *input_shape)
     y = np.random.rand(data_points, 1)
-    
+
     # First, fit a model without regularization
     model = Sequential()
     model.add(conv2d(num_filters, (3, 3), input_shape=input_shape))
@@ -126,18 +145,19 @@ def test_conv2d_regularizer(structure_type, conv2d):
     model.compile(loss=zero_loss, optimizer=Adam())
     model.fit(X, y, epochs=epochs, verbose=True)
     norm, var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
-    
+
     # Now, fit a model with strong regularization, starting with the same initial weights
     model.layers[0].set_weights(conv_weights)
     regularizer = Conv2DRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset)
-    model.layers[0].add_loss(lambda layer = model.layers[0] : regularizer(layer.kernel))
+    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
     model.compile(loss=zero_loss, optimizer=Adam())
     model.fit(X, y, epochs=epochs, verbose=True)
     reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
 
     # Verify regularization decreased weight magnitude and variance
-    assert(reg_norm < norm)
-    assert(reg_var < var)
+    assert reg_norm < norm
+    assert reg_var < var
+
 
 def test_removal_of_custom_regularizer():
     model = Sequential()
@@ -146,11 +166,11 @@ def test_removal_of_custom_regularizer():
     model.add(Dense(1, kernel_regularizer=DenseRegularizer(1e-3)))
     weights = model.get_weights()
 
-    assert(isinstance(model.layers[0].kernel_regularizer, Conv2DRegularizer))
-    assert(isinstance(model.layers[2].kernel_regularizer, DenseRegularizer))
+    assert isinstance(model.layers[0].kernel_regularizer, Conv2DRegularizer)
+    assert isinstance(model.layers[2].kernel_regularizer, DenseRegularizer)
 
     model = remove_custom_regularizers(model)
 
-    assert(model.layers[0].kernel_regularizer is None)
+    assert model.layers[0].kernel_regularizer is None
     for i in range(len(weights)):
-        assert(np.all(weights[i] == model.get_weights()[i]))
+        assert np.all(weights[i] == model.get_weights()[i])
diff --git a/test/pytest/optimization/keras/test_weight_sharing.py b/test/pytest/optimization/keras/test_weight_sharing.py
index 63718ce08f..c274a84da8 100644
--- a/test/pytest/optimization/keras/test_weight_sharing.py
+++ b/test/pytest/optimization/keras/test_weight_sharing.py
@@ -1,12 +1,13 @@
-import pytest
 import numpy as np
+import pytest
 from qkeras import QDense
 from tensorflow.keras.layers import Dense
 from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 from hls4ml.optimization.keras.masking import get_model_masks
 from hls4ml.optimization.objectives import ObjectiveEstimator
-from hls4ml.optimization.attributes import get_attributes_from_keras_model
 
 # Similar tests in test_masking.py, weight sharing instead of pruning
 sparsity = 0.33
@@ -20,6 +21,7 @@
 No savings are incurred with unstructured type (unstructured weight sharing doesn't make sense)
 '''
 
+
 class MockWeightSharingEstimator(ObjectiveEstimator):
     @classmethod
     def layer_resources(self, layer_attributes):
@@ -42,13 +44,15 @@ def layer_savings(self, layer_attributes):
                 if 'Dense' in layer_attributes.layer_type.__name__:
                     return [layer_attributes.weight_shape[1]]
             elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
-                number_of_patterns = np.prod(
-                    layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                number_of_patterns = (
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                )
                 return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns]
             elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
                 return [np.prod(layer_attributes.optimization_attributes.block_shape)]
         return [0]
 
+
 @pytest.mark.parametrize('local_masking', local_masking)
 @pytest.mark.parametrize('dense', dense_layers)
 def test_weight_sharing_structured(local_masking, dense):
@@ -70,15 +74,13 @@ def test_weight_sharing_structured(local_masking, dense):
     model_attributes['dense'].optimization_attributes.weight_sharing = True
     model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
 
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1',
-                                     local=local_masking)
-    frozen = np.array([
-        [0, 1], [1, 1], [2, 1]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(
+        model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1', local=local_masking
+    )
+    frozen = np.array([[0, 1], [1, 1], [2, 1]], dtype=np.int32)
 
-    assert (not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]]))
-    assert (np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5))
+    assert not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]])
+    assert np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5)
 
 
 @pytest.mark.parametrize('local_masking', local_masking)
@@ -107,16 +109,13 @@ def test_weight_sharing_pattern(local_masking, dense):
     model_attributes['dense'].optimization_attributes.pattern_offset = 4
     model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
 
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1',
-                                     local=local_masking)
-    frozen = np.array([
-        [0, 0], [1, 0], [2, 0],
-        [0, 2], [1, 2], [2, 2]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(
+        model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1', local=local_masking
+    )
+    frozen = np.array([[0, 0], [1, 0], [2, 0], [0, 2], [1, 2], [2, 2]], dtype=np.int32)
 
-    assert (not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]]))
-    assert (np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5))
+    assert not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]])
+    assert np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5)
 
 
 @pytest.mark.parametrize('local_masking', local_masking)
@@ -147,13 +146,10 @@ def test_weight_sharing_block(local_masking, dense):
     model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
     model_attributes['dense'].optimization_attributes.block_shape = (2, 2)
 
-    masks, offsets = get_model_masks(model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1',
-                                     local=local_masking)
-    frozen = np.array([
-        [0, 0], [0, 1], [1, 0], [1, 1],
-        [2, 2], [2, 3], [3, 2], [3, 3]
-    ],
-        dtype=np.int32)
+    masks, offsets = get_model_masks(
+        model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1', local=local_masking
+    )
+    frozen = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [2, 3], [3, 2], [3, 3]], dtype=np.int32)
 
-    assert (not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]]))
-    assert (np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5))
\ No newline at end of file
+    assert not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]])
+    assert np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5)
diff --git a/test/pytest/optimization/test_attributes.py b/test/pytest/optimization/test_attributes.py
index c6991be7e4..760f307133 100644
--- a/test/pytest/optimization/test_attributes.py
+++ b/test/pytest/optimization/test_attributes.py
@@ -1,7 +1,9 @@
+from tensorflow.keras.layers import Conv2D, Dense, Flatten, ReLU
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Conv2D, Flatten, ReLU
-from hls4ml.utils.config import config_from_keras_model
+
 from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
+from hls4ml.utils.config import config_from_keras_model
+
 
 def test_attributes():
     dense_units = 16
@@ -11,73 +13,84 @@ def test_attributes():
     input_shape = (8, 8)
     io_type = 'io_parallel'
     strategy = 'Resource'
-    
+
     model = Sequential()
-    model.add(Conv2D(conv_filters, input_shape=(*input_shape, conv_channels), kernel_size=conv_shape, name='conv2d', padding='same', kernel_initializer='ones'))
+    model.add(
+        Conv2D(
+            conv_filters,
+            input_shape=(*input_shape, conv_channels),
+            kernel_size=conv_shape,
+            name='conv2d',
+            padding='same',
+            kernel_initializer='ones',
+        )
+    )
     model.add(Flatten(name='flatten'))
     model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
     model.add(ReLU(name='relu'))
 
-    default_reuse_factor = 2 
+    default_reuse_factor = 2
     default_precision = 'ap_fixed<8, 0>'
-    cfg = config_from_keras_model(model, granularity='name', default_precision=default_precision, default_reuse_factor=default_reuse_factor)    
+    cfg = config_from_keras_model(
+        model, granularity='name', default_precision=default_precision, default_reuse_factor=default_reuse_factor
+    )
     cfg['IOType'] = io_type
     cfg['Model']['Strategy'] = strategy
     cfg['LayerName']['dense']['ReuseFactor'] = 1
 
     # Verify correct information for every layer
     model_attributes = get_attributes_from_keras_model_and_hls4ml_config(model, cfg)
-    assert(len(model_attributes) == 4)
+    assert len(model_attributes) == 4
 
     # conv2d
-    assert(model_attributes['conv2d'].name == 'conv2d')
-    assert(model_attributes['conv2d'].layer_type.__name__ == 'Conv2D')
-    assert(model_attributes['conv2d'].inbound_layers == [])
-    assert(model_attributes['conv2d'].weight_shape == (3, 3, 3, 6))
-    assert(model_attributes['conv2d'].input_shape == (8, 8, 3))
-    assert(model_attributes['conv2d'].output_shape == (8, 8, 6))
-    assert(model_attributes['conv2d'].optimizable == False)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].n_in == 9)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].n_out == 6)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].io_type == io_type)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].strategy == strategy)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].reuse_factor == default_reuse_factor)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].weight_precision.width == 8)
-    assert(model_attributes['conv2d'].args['hls4ml_attributes'].parallelization_factor == 1)
+    assert model_attributes['conv2d'].name == 'conv2d'
+    assert model_attributes['conv2d'].layer_type.__name__ == 'Conv2D'
+    assert model_attributes['conv2d'].inbound_layers == []
+    assert model_attributes['conv2d'].weight_shape == (3, 3, 3, 6)
+    assert model_attributes['conv2d'].input_shape == (8, 8, 3)
+    assert model_attributes['conv2d'].output_shape == (8, 8, 6)
+    assert model_attributes['conv2d'].optimizable == False
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].n_in == 9
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].n_out == 6
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].io_type == io_type
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].strategy == strategy
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].reuse_factor == default_reuse_factor
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].weight_precision.width == 8
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].parallelization_factor == 1
 
     # flatten
-    assert(model_attributes['flatten'].name == 'flatten')
-    assert(model_attributes['flatten'].layer_type.__name__ == 'Flatten')
-    assert(model_attributes['flatten'].weight_shape == ())
-    assert(model_attributes['flatten'].input_shape == (8, 8, 6))
-    assert(model_attributes['flatten'].output_shape == (384, ))
-    assert(model_attributes['flatten'].optimizable == False)
-    
+    assert model_attributes['flatten'].name == 'flatten'
+    assert model_attributes['flatten'].layer_type.__name__ == 'Flatten'
+    assert model_attributes['flatten'].weight_shape == ()
+    assert model_attributes['flatten'].input_shape == (8, 8, 6)
+    assert model_attributes['flatten'].output_shape == (384,)
+    assert model_attributes['flatten'].optimizable == False
+
     # Flatten is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
-    assert('hls4ml_attributes' not in model_attributes['flatten'].args) 
+    assert 'hls4ml_attributes' not in model_attributes['flatten'].args
 
     # dense
-    assert(model_attributes['dense'].name == 'dense')
-    assert(model_attributes['dense'].layer_type.__name__ == 'Dense')
-    assert(model_attributes['dense'].weight_shape == (384, 16))
-    assert(model_attributes['dense'].input_shape == (384, ))
-    assert(model_attributes['dense'].output_shape == (16, ))
-    assert(model_attributes['dense'].optimizable == False)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].n_in == 384)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].n_out == 16)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].io_type == io_type)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].strategy == strategy)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].reuse_factor == 1)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].output_precision.width == 8)
-    assert(model_attributes['dense'].args['hls4ml_attributes'].parallelization_factor == 1)
+    assert model_attributes['dense'].name == 'dense'
+    assert model_attributes['dense'].layer_type.__name__ == 'Dense'
+    assert model_attributes['dense'].weight_shape == (384, 16)
+    assert model_attributes['dense'].input_shape == (384,)
+    assert model_attributes['dense'].output_shape == (16,)
+    assert model_attributes['dense'].optimizable == False
+    assert model_attributes['dense'].args['hls4ml_attributes'].n_in == 384
+    assert model_attributes['dense'].args['hls4ml_attributes'].n_out == 16
+    assert model_attributes['dense'].args['hls4ml_attributes'].io_type == io_type
+    assert model_attributes['dense'].args['hls4ml_attributes'].strategy == strategy
+    assert model_attributes['dense'].args['hls4ml_attributes'].reuse_factor == 1
+    assert model_attributes['dense'].args['hls4ml_attributes'].output_precision.width == 8
+    assert model_attributes['dense'].args['hls4ml_attributes'].parallelization_factor == 1
 
     # relu
-    assert(model_attributes['relu'].name == 'relu')
-    assert(model_attributes['relu'].layer_type.__name__ == 'ReLU')
-    assert(model_attributes['relu'].weight_shape == ())
-    assert(model_attributes['relu'].input_shape == (16, ))
-    assert(model_attributes['relu'].output_shape == (16, ))
-    assert(model_attributes['relu'].optimizable == False)
+    assert model_attributes['relu'].name == 'relu'
+    assert model_attributes['relu'].layer_type.__name__ == 'ReLU'
+    assert model_attributes['relu'].weight_shape == ()
+    assert model_attributes['relu'].input_shape == (16,)
+    assert model_attributes['relu'].output_shape == (16,)
+    assert model_attributes['relu'].optimizable == False
 
     # ReLU is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
-    assert('hls4ml_attributes' not in model_attributes['relu'].args) 
+    assert 'hls4ml_attributes' not in model_attributes['relu'].args
diff --git a/test/pytest/optimization/test_knapsack.py b/test/pytest/optimization/test_knapsack.py
index 6c985688c4..a4145c00d0 100644
--- a/test/pytest/optimization/test_knapsack.py
+++ b/test/pytest/optimization/test_knapsack.py
@@ -1,7 +1,9 @@
-import pytest
 import numpy as np
+import pytest
+
 from hls4ml.optimization.knapsack import solve_knapsack
 
+
 # In the simple case below, both implementations give the optimal answer
 # In general, the greedy algorithm will not give the optimal solution
 @pytest.mark.parametrize('implementation', ['dynamic', 'greedy', 'branch_bound', 'CBC_MIP'])
@@ -11,10 +13,11 @@ def test_knapsack_1d(implementation):
     capacity = np.array([8])
 
     optimal, selected = solve_knapsack(values, weights, capacity, implementation=implementation)
-    assert(optimal == 18)
-    assert(0 in selected)
-    assert(2 in selected)
-    assert(3 in selected)
+    assert optimal == 18
+    assert 0 in selected
+    assert 2 in selected
+    assert 3 in selected
+
 
 @pytest.mark.parametrize('implementation', ['greedy', 'branch_bound', 'CBC_MIP'])
 def test_multidimensional_knapsack(implementation):
@@ -23,9 +26,10 @@ def test_multidimensional_knapsack(implementation):
     capacity = np.array([8, 7])
 
     optimal, selected = solve_knapsack(values, weights, capacity, implementation=implementation)
-    assert(optimal == 22)
-    assert(0 in selected)
-    assert(3 in selected)
+    assert optimal == 22
+    assert 0 in selected
+    assert 3 in selected
+
 
 def test_knapsack_equal_weights():
     values = np.array([10, 2, 6, 8, 3])
@@ -33,9 +37,10 @@ def test_knapsack_equal_weights():
     capacity = np.array([7, 7])
 
     optimal, selected = solve_knapsack(values, weights, capacity)
-    assert(optimal == 18)
-    assert(0 in selected)
-    assert(3 in selected)
+    assert optimal == 18
+    assert 0 in selected
+    assert 3 in selected
+
 
 def test_knapsack_all_elements_fit():
     values = np.array([10, 2, 6, 12, 3])
@@ -43,5 +48,5 @@ def test_knapsack_all_elements_fit():
     capacity = np.array([19, 12])
 
     optimal, selected = solve_knapsack(values, weights, capacity)
-    assert(optimal == 33)
-    assert(selected == list(range(0, values.shape[0])))
+    assert optimal == 33
+    assert selected == list(range(0, values.shape[0]))
diff --git a/test/pytest/optimization/test_objectives.py b/test/pytest/optimization/test_objectives.py
index 2991b1c3fc..a7d81befe6 100644
--- a/test/pytest/optimization/test_objectives.py
+++ b/test/pytest/optimization/test_objectives.py
@@ -1,10 +1,12 @@
 import numpy as np
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Conv2D, Flatten
-from hls4ml.optimization.objectives import ParameterEstimator
+
 from hls4ml.optimization.attributes import get_attributes_from_keras_model
+from hls4ml.optimization.objectives import ParameterEstimator
+
 
-# Test attempts to verify one of the estimators (parameter) is correctly declared, the functions are static etc. 
+# Test attempts to verify one of the estimators (parameter) is correctly declared, the functions are static etc.
 def test_parameter_objective():
     # Model parameters
     dense_units = 16
@@ -14,7 +16,16 @@ def test_parameter_objective():
     input_shape = (8, 8)
 
     model = Sequential()
-    model.add(Conv2D(conv_filters, input_shape=(*input_shape, conv_channels), kernel_size=conv_shape, name='conv2d', padding='same', kernel_initializer='ones'))
+    model.add(
+        Conv2D(
+            conv_filters,
+            input_shape=(*input_shape, conv_channels),
+            kernel_size=conv_shape,
+            name='conv2d',
+            padding='same',
+            kernel_initializer='ones',
+        )
+    )
     model.add(Flatten(name='flatten'))
     model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
     model_attributes = get_attributes_from_keras_model(model)
@@ -26,17 +37,21 @@ def test_parameter_objective():
         model_attributes[layer.name].optimization_attributes = optimization_attributes
 
     # Verify conv2d and dense are optimizable, flatten is not
-    assert(model_attributes['conv2d'].optimizable)
-    assert(not model_attributes['flatten'].optimizable)
-    assert(model_attributes['dense'].optimizable)
+    assert model_attributes['conv2d'].optimizable
+    assert not model_attributes['flatten'].optimizable
+    assert model_attributes['dense'].optimizable
 
     # Verify layer resources (number of parameters) are correct
-    assert([conv_filters * conv_channels * np.prod(conv_shape)] == ParameterEstimator.layer_resources(model_attributes['conv2d']))
-    assert([0] == ParameterEstimator.layer_resources(model_attributes['flatten']))
-    assert([conv_filters * np.prod(input_shape) * dense_units] == ParameterEstimator.layer_resources(model_attributes['dense']))
+    assert [conv_filters * conv_channels * np.prod(conv_shape)] == ParameterEstimator.layer_resources(
+        model_attributes['conv2d']
+    )
+    assert [0] == ParameterEstimator.layer_resources(model_attributes['flatten'])
+    assert [conv_filters * np.prod(input_shape) * dense_units] == ParameterEstimator.layer_resources(
+        model_attributes['dense']
+    )
 
     # Verify layer savings are correct - is_layer_optimizable should have returned UNSTRUCTURED as the pruning type
     # Since it wasn't overwritten, each pruning step saves one parameter
-    assert([1] == ParameterEstimator.layer_savings(model_attributes['conv2d']))
-    assert([0] == ParameterEstimator.layer_savings(model_attributes['flatten']))
-    assert([1] == ParameterEstimator.layer_savings(model_attributes['dense']))
+    assert [1] == ParameterEstimator.layer_savings(model_attributes['conv2d'])
+    assert [0] == ParameterEstimator.layer_savings(model_attributes['flatten'])
+    assert [1] == ParameterEstimator.layer_savings(model_attributes['dense'])
diff --git a/test/pytest/optimization/test_scheduler.py b/test/pytest/optimization/test_scheduler.py
index d16d15da73..2dc7642bf6 100644
--- a/test/pytest/optimization/test_scheduler.py
+++ b/test/pytest/optimization/test_scheduler.py
@@ -1,8 +1,10 @@
 import numpy as np  # Use np.testing.assert_allclose due to floating point rounding errors
-from hls4ml.optimization.scheduler import ConstantScheduler, BinaryScheduler, PolynomialScheduler
+
+from hls4ml.optimization.scheduler import BinaryScheduler, ConstantScheduler, PolynomialScheduler
+
 
 def test_constant_scheduler():
-    initial_sparsity =  0.25
+    initial_sparsity = 0.25
     update_step = 0.10
     target_sparsity = initial_sparsity + 2.5 * update_step
 
@@ -22,8 +24,9 @@ def test_constant_scheduler():
     # Assert final (achievable) sparsity is correct
     np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity + 2 * update_step)
 
+
 def test_binary_scheduler():
-    initial_sparsity =  0.25
+    initial_sparsity = 0.25
     target_sparsity = 0.5
     threshold = 0.05
 
@@ -49,22 +52,25 @@ def test_binary_scheduler():
     # Assert final (achievable) sparsity is correct
     np.testing.assert_allclose(scheduler.get_sparsity(), s3)
 
+
 def test_polynomial_scheduler():
     decay_power = 2
     maximum_steps = 2
-    initial_sparsity =  0.25
+    initial_sparsity = 0.25
     target_sparsity = 0.5
-  
+
     # Assert initial sparsity correct
-    scheduler = PolynomialScheduler(maximum_steps, initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, decay_power=decay_power)
+    scheduler = PolynomialScheduler(
+        maximum_steps, initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, decay_power=decay_power
+    )
     np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
 
     # Assert 1st update step is correct
-    s1 =  target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 1 / maximum_steps) ** decay_power)
+    s1 = target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 1 / maximum_steps) ** decay_power)
     np.testing.assert_allclose(scheduler.update_step(), (True, s1))
 
     # Assert 1st repair step is correct
-    s2 =  target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 2 / maximum_steps) ** decay_power)
+    s2 = target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 2 / maximum_steps) ** decay_power)
     np.testing.assert_allclose(scheduler.repair_step(), (True, s2))
 
     # Assert 2nd update step does not occur, since current_step = maximum_steps

From aac3f1fe06420a2e3fa00496c58d6d4b185a379d Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 15 Sep 2023 19:03:03 +0200
Subject: [PATCH 13/19] Fix missing packages; rename PyTests & pre-commit pt 2.

---
 docs/advanced/model_optimization.rst          |  4 +-
 hls4ml/optimization/__init__.py               | 30 ++++++----
 hls4ml/optimization/attributes.py             | 10 ++--
 hls4ml/optimization/config.py                 |  8 ++-
 hls4ml/optimization/keras/__init__.py         | 59 +++++++++++--------
 hls4ml/optimization/keras/builder.py          | 16 +++--
 hls4ml/optimization/keras/config.py           |  3 +-
 hls4ml/optimization/keras/masking.py          | 36 +++++++----
 hls4ml/optimization/keras/reduction.py        |  1 -
 hls4ml/optimization/keras/regularizers.py     | 31 ++++++----
 hls4ml/optimization/keras/utils.py            |  8 ++-
 hls4ml/optimization/knapsack.py               | 46 +++++++++------
 hls4ml/optimization/objectives/__init__.py    | 26 +++++---
 .../optimization/objectives/gpu_objectives.py |  3 +-
 .../objectives/vivado_objectives.py           | 18 ++++--
 hls4ml/optimization/scheduler.py              | 26 +++++---
 setup.cfg                                     |  4 ++
 test/pytest/ci-template.yml                   |  2 +-
 .../test_attributes.py                        |  8 +--
 .../test_keras}/test_masking.py               |  7 ++-
 .../test_keras}/test_reduction.py             |  9 +++
 .../test_keras}/test_regularizers.py          |  0
 .../test_keras}/test_weight_sharing.py        |  0
 .../test_knapsack.py                          |  0
 .../test_objectives.py                        |  0
 .../test_scheduler.py                         |  0
 26 files changed, 231 insertions(+), 124 deletions(-)
 rename test/pytest/{optimization => test_optimization}/test_attributes.py (94%)
 rename test/pytest/{optimization/keras => test_optimization/test_keras}/test_masking.py (98%)
 rename test/pytest/{optimization/keras => test_optimization/test_keras}/test_reduction.py (94%)
 rename test/pytest/{optimization/keras => test_optimization/test_keras}/test_regularizers.py (100%)
 rename test/pytest/{optimization/keras => test_optimization/test_keras}/test_weight_sharing.py (100%)
 rename test/pytest/{optimization => test_optimization}/test_knapsack.py (100%)
 rename test/pytest/{optimization => test_optimization}/test_objectives.py (100%)
 rename test/pytest/{optimization => test_optimization}/test_scheduler.py (100%)

diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
index 75367c3088..07afcfa48f 100644
--- a/docs/advanced/model_optimization.rst
+++ b/docs/advanced/model_optimization.rst
@@ -3,7 +3,7 @@ hls4ml Optimization API
 ========================
 
 Pruning and weight sharing are effective techniques to reduce model footprint and computational requirements. The hls4ml Optimization API introduces hardware-aware pruning and weight sharing.
-By defining custom objectives, the algorithm solves a Knapsack optimization problem aimed at maximizing model performance, while keeping the target resource(s) at a minimum. Out-of-the box objectives include network sparsity, GPU FLOPs, Vivado DSPs, memory utilization etc. 
+By defining custom objectives, the algorithm solves a Knapsack optimization problem aimed at maximizing model performance, while keeping the target resource(s) at a minimum. Out-of-the box objectives include network sparsity, GPU FLOPs, Vivado DSPs, memory utilization etc.
 
 The code block below showcases three use cases of the hls4ml Optimization API - network sparsity (unstructured pruning), GPU FLOPs (structured pruning) and Vivado DSP utilization (pattern pruning). First, we start with unstructured pruning:
 
@@ -115,6 +115,6 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
 There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation.
 Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config:
 .. code-block:: Python
-    hls_config = config_from_keras_model(optimized_model)     
+    hls_config = config_from_keras_model(optimized_model)
     hls_config['Model']['DenseResourceImplementation'] = 'Unrolled'
     # Any addition hls4ml config, such as strategy, reuse factor etc...
diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
index 4f18998c1c..d8301f5892 100644
--- a/hls4ml/optimization/__init__.py
+++ b/hls4ml/optimization/__init__.py
@@ -3,6 +3,8 @@
 from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
 from hls4ml.optimization.keras import optimize_model
 
+default_regularization_range = np.logspace(-6, -2, num=16).tolist()
+
 
 def optimize_keras_for_hls4ml(
     keras_model,
@@ -20,7 +22,7 @@ def optimize_keras_for_hls4ml(
     validation_metric,
     increasing,
     rtol,
-    callbacks=[],
+    callbacks=None,
     ranking_metric='l1',
     local=False,
     verbose=False,
@@ -29,7 +31,7 @@ def optimize_keras_for_hls4ml(
     directory='hls4ml-optimization',
     tuner='Bayesian',
     knapsack_solver='CBC_MIP',
-    regularization_range=np.logspace(-6, -2, num=16).tolist(),
+    regularization_range=default_regularization_range,
 ):
     '''
     Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s)
@@ -37,8 +39,10 @@ def optimize_keras_for_hls4ml(
     Args:
     - keras_model (keras.Model): Model to be optimized
     - hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...)
-    - objective (hls4ml.optimization.objectives.ObjectiveEstimator): Parameter, hardware or user-defined objective of optimization
-    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler): Sparsity scheduler, choose between constant, polynomial and binary
+    - objective (hls4ml.optimization.objectives.ObjectiveEstimator):
+        Parameter, hardware or user-defined objective of optimization
+    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler):
+        Sparsity scheduler, choose between constant, polynomial and binary
     - X_train (np.array): Training inputs
     - y_train (np.array): Training labels
     - X_val (np.array): Validation inputs
@@ -48,19 +52,25 @@ def optimize_keras_for_hls4ml(
     - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
     - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
     - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
-    - increasing (boolean): If the metric improves with increased values; e.g. accuracy -> increasing = True, MSE -> increasing = False
-    - rtol (float): Relative tolerance; pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+    - increasing (boolean): If the metric improves with increased values;
+        e.g. accuracy -> increasing = True, MSE -> increasing = False
+    - rtol (float): Relative tolerance;
+        pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
 
     Kwargs:
     - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
-    - ranking_metric (string): Metric used for rannking weights and structures; currently supported l1, l2, saliency and Oracle
+    - ranking_metric (string): Metric used for rannking weights and structures;
+        currently supported l1, l2, saliency and Oracle
     - local (boolean): Layer-wise or global pruning
     - verbose (boolean): Display debug logs during model optimization
-    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing, allows regrowth of previously pruned weights
-    - cutoff_bad_trials (int): After how many bad trials (performance below threshold), should model pruning / weight sharing stop
+    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
+        allows regrowth of previously pruned weights
+    - cutoff_bad_trials (int): After how many bad trials (performance below threshold),
+        should model pruning / weight sharing stop
     - directory (string): Directory to store temporary results
     - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
-    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; default usually works well; for very large networks, greedy algorithm might be more suitable
+    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
+        default usually works well; for very large networks, greedy algorithm might be more suitable
     - regularization_range (list): List of suitable hyperparameters for weight decay
     '''
 
diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/attributes.py
index 02154931a5..672e2706a0 100644
--- a/hls4ml/optimization/attributes.py
+++ b/hls4ml/optimization/attributes.py
@@ -30,10 +30,10 @@ def __init__(
         if not isinstance(output_precision, (FixedPrecisionType, IntegerPrecisionType)):
             raise Exception('Layer weight precision is not in valid format')
 
-        if not strategy in ('Latency', 'latency', 'Resource', 'resource'):
+        if strategy not in ('Latency', 'latency', 'Resource', 'resource'):
             raise Exception('Unknown layer strategy')
 
-        if not io_type in ('io_parallel', 'io_stream'):
+        if io_type not in ('io_parallel', 'io_stream'):
             raise Exception('Unknown IO type')
 
         self.n_in = n_in
@@ -95,8 +95,10 @@ class LayerAttributes:
         - input_shape (tuple): Layer input shape
         - output_shape (tuple): Layer output shape
         - optimizable (bool): Should optimizations (pruning, weight sharing) be applied to this layer
-        - optimization_attributes (OptimizationAttributes): Type of optimization, optimization vs weight sharing, block shape and pattern offset
-        - args (dict): Additional information, e.g. hls4mlAttributes; dictionary so it can be generic enough for different platforms
+        - optimization_attributes (OptimizationAttributes): Type of optimization,
+            pruning or weight sharing, block shape and pattern offset
+        - args (dict): Additional information,
+            e.g. hls4mlAttributes; dictionary so it can be generic enough for different platforms
     '''
 
     def __init__(
diff --git a/hls4ml/optimization/config.py b/hls4ml/optimization/config.py
index 90fb4acceb..0879c47f62 100644
--- a/hls4ml/optimization/config.py
+++ b/hls4ml/optimization/config.py
@@ -17,15 +17,17 @@
         - Dense: Neurons, determined by their outgoing connections (columns in Keras weight tensors)
         - Conv2D: Filters (structures of size filt_width x filt_height x n_chan)
         - Notes:
-            - For Dense, it was also possible optimize by incoming connections (rows); however, removing zero neurons becomes harder
+            - For Dense, it was also possible optimize by incoming connections (rows);
+                However, removing zero neurons becomes harder because of Keras Surgeon
             - For Conv2D, significant literature explored pruning channels; currently not supported
     - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
 
 3. Pattern:
     - Pruning: Y
     - Weight sharing: Y
-    - Description: Zeroes out or quantizes all the weights in a group.
-       Groups are determined by a variable, n, and every n-th weight in the flattened, transposed (Resource) weight tensor is collected and stored in the same group
+    - Description: Zeroes out or quantizes all the weights in a group
+       Groups are determined by a variable, n, and every n-th weight in the flattened,
+       Transposed (Resource) weight tensor is collected and stored in the same group
        Equivalent to pruning/quantizing weight processed by the same DSP in hls4ml
     - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
 
diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/keras/__init__.py
index c9088d50b5..a07f0adac7 100644
--- a/hls4ml/optimization/keras/__init__.py
+++ b/hls4ml/optimization/keras/__init__.py
@@ -16,6 +16,7 @@
 from hls4ml.optimization.scheduler import OptimizationScheduler
 
 np_config.enable_numpy_behavior()
+default_regularization_range = np.logspace(-6, -2, num=16).tolist()
 
 
 def optimize_model(
@@ -34,7 +35,7 @@ def optimize_model(
     validation_metric,
     increasing,
     rtol,
-    callbacks=[],
+    callbacks=None,
     ranking_metric='l1',
     local=False,
     verbose=False,
@@ -43,46 +44,59 @@ def optimize_model(
     directory=TMP_DIRECTORY,
     tuner='Bayesian',
     knapsack_solver='CBC_MIP',
-    regularization_range=np.logspace(-6, -2, num=16).tolist(),
+    regularization_range=default_regularization_range,
 ):
     '''
     Top-level function for optimizing a Keras model, given objectives
 
     Args:
     - model (keras.Model): Model to be optimized
-    - model_attributes (dict): Layer-wise model attributes, obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
-    - objective (hls4ml.optimization.objectives.ObjectiveEstimator): Parameter, hardware or user-defined objective of optimization
-    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler): Sparsity scheduler, choose between constant, polynomial and binary
+    - model_attributes (dict): Layer-wise model attributes,
+        obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
+    - objective (hls4ml.optimization.objectives.ObjectiveEstimator):
+        Parameter, hardware or user-defined objective of optimization
+    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler):
+        Sparsity scheduler, choose between constant, polynomial and binary
     - X_train (np.array): Training inputs
     - y_train (np.array): Training labels
     - X_val (np.array): Validation inputs
     - y_val (np.array): Validation labels
     - batch_size (int): Batch size during training
     - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
-    - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
-    - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
-    - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
-    - increasing (boolean): If the metric improves with increased values; e.g. accuracy -> increasing = True, MSE -> increasing = False
-    - rtol (float): Relative tolerance; pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+    - optimizer (keras.optimizers.Optimizer or equivalent-string description):
+        Optimizer used during training
+    - loss_fn (keras.losses.Loss or equivalent loss description):
+        Loss function used during training
+    - validation_metric (keras.metrics.Metric or equivalent loss description):
+        Validation metric, used as a baseline
+    - increasing (boolean): If the metric improves with increased values;
+        e.g. accuracy -> increasing = True, MSE -> increasing = False
+    - rtol (float): Relative tolerance;
+        pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
 
     Kwargs:
     - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
-    - ranking_metric (string): Metric used for rannking weights and structures; currently supported l1, l2, saliency and Oracle
+    - ranking_metric (string): Metric used for rannking weights and structures;
+        currently supported l1, l2, saliency and Oracle
     - local (boolean): Layer-wise or global pruning
     - verbose (boolean): Display debug logs during model optimization
-    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing, allows regrowth of previously pruned weights
-    - cutoff_bad_trials (int): After how many bad trials (performance below threshold), should model pruning / weight sharing stop
+    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
+        allows regrowth of previously pruned weights
+    - cutoff_bad_trials (int): After how many bad trials (performance below threshold),
+        should model pruning / weight sharing stop
     - directory (string): Directory to store temporary results
     - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
-    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; default usually works well; for very large networks, greedy algorithm might be more suitable
+    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
+        default usually works well; for very large networks, greedy algorithm might be more suitable
     - regularization_range (list): List of suitable hyperparameters for weight decay
     '''
 
     if not isinstance(scheduler, OptimizationScheduler):
         raise Exception(
             'Scheduler must be an instance of from hls4ml.optimization.scheduler.OptimizationScheduler'
-            + 'If you provided string description (e.g. \'constant\'), please use an object instance (i.e. ConstantScheduler())'
-            'For a full list of supported schedulers and their description, refer to hls4ml.optimization.scheduler.'
+            'If you provided string description (e.g. \'constant\')'
+            'Please use an object instance (i.e. ConstantScheduler()).'
+            'For a full list of supported schedulers, refer to hls4ml.optimization.scheduler.'
         )
 
     if epochs <= rewinding_epochs:
@@ -164,7 +178,8 @@ def optimize_model(
     masked_backprop = MaskedBackprop(optimizable_model, loss_fn, model_attributes)
 
     # In certain cases, the model might underperform at the current sparsity level, but perform better at a higher sparsity
-    # Therefore, monitor the models performance over several sparsity levels and only stop pruning after high loss over several trials
+    # Therefore, monitor the models performance over several sparsity levels and
+    # Only stop pruning after high loss over several trials
     bad_trials = 0
     sparsity_conditions = True
     target_sparsity = scheduler.get_sparsity()
@@ -222,13 +237,11 @@ def optimize_model(
             # Evaluate on validation set and print epoch summary
             if verbose:
                 val_res = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)
-                print(
-                    f'Epoch: {epoch + 1} - Time: {time.time() - start_time}s - Average training loss: {round(epoch_loss_avg.result(), 3)}'
-                )
+                t = time.time() - start_time
+                avg_loss = round(epoch_loss_avg.result(), 3)
+                print(f'Epoch: {epoch + 1} - Time: {t}s - Average training loss: {avg_loss}')
                 print(f'Epoch: {epoch + 1} - learning_rate: {optimizable_model.optimizer.learning_rate.numpy()}')
-                print(
-                    f'Epoch: {epoch + 1} - Loss on validation set: {val_res[0]} - Performance on validation set: {val_res[1]}'
-                )
+                print(f'Epoch: {epoch + 1} - Validation loss: {val_res[0]} - Performance on validation set: {val_res[1]}')
 
         # Check if model works after pruning
         pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/keras/builder.py
index 9bb4db6626..9a3bec8d01 100644
--- a/hls4ml/optimization/keras/builder.py
+++ b/hls4ml/optimization/keras/builder.py
@@ -90,6 +90,9 @@ def build(self, hp):
         return model_to_prune
 
 
+default_regularization_range = np.logspace(-6, -2, num=16).tolist()
+
+
 def build_optimizable_model(
     model,
     attributes,
@@ -104,7 +107,7 @@ def build_optimizable_model(
     verbose=False,
     directory=TMP_DIRECTORY,
     tuner='Bayesian',
-    regularization_range=np.logspace(-6, -2, num=15).tolist(),
+    regularization_range=default_regularization_range,
 ):
     '''
     Function identifying optimizable layers and adding a regularization loss
@@ -128,10 +131,13 @@ def build_optimizable_model(
     - learning_rate_range (list): List of suitable hyperparameters for learning rate
 
     Notes:
-    - In general, the regularization and learning rate ranges do not need to be provided, as the implementation sets a generic enough range.
-      However, if the user has an idea on the possible range on hyperparameter ranges (eg. VGG-16 weight decay ~10^-5), the tuning will complete faster
-    - The default tuner is Bayesian & when coupled with the correct ranges of hyperparameters, it performs quite well, fast. However, older version of Keras Tuner had a crashing bug with Bayesian Tuner
-    - In general, the directory does not need to be specified. However, if pruning several models simultaneously, to avoid conflicting intermediate results, it is useful to specify directory
+    - In general, the regularization and learning rate ranges do not need to be provided,
+        as the implementation sets a generic enough range. if the user has an idea on the
+        possible range on hyperparameter ranges, the tuning will complete faster.
+    - The default tuner is Bayesian & when coupled with the correct ranges of hyperparameters,
+        it performs quite well, fast. However, older version of Keras Tuner had a crashing bug with it.
+    - In general, the directory does not need to be specified. However, if pruning several models simultaneously,
+        to avoid conflicting intermediate results, it is useful to specify directory.
     '''
     # User provided manual hyper-parameters for regularisation loss
     # TODO - Maybe we could extend this to be hyper-parameters per layer? or layer-type?
diff --git a/hls4ml/optimization/keras/config.py b/hls4ml/optimization/keras/config.py
index 08bf67aa7b..d3ade5933d 100644
--- a/hls4ml/optimization/keras/config.py
+++ b/hls4ml/optimization/keras/config.py
@@ -14,7 +14,8 @@
 
 1. l1 - groups of weights are ranked by their l1 norm
 2. l2 - groups of weights are ranked by their l2 norm
-3. oracle - abs(dL / dw * w), introduced by Molchanov et al. (2016) Pruning Convolutional Neural Networks for Resource Efficient Inference
+3. oracle - abs(dL / dw * w), introduced by Molchanov et al. (2016)
+    Pruning Convolutional Neural Networks for Resource Efficient Inference
 4. saliency - (d^2L / dw^2 * w)^2, introduced by Lecun et al. (1989) Optimal Brain Damage
 '''
 SUPPORTED_METRICS = ('l1', 'l2', 'oracle', 'saliency')
diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/keras/masking.py
index 76d84dbea2..bf673b9154 100644
--- a/hls4ml/optimization/keras/masking.py
+++ b/hls4ml/optimization/keras/masking.py
@@ -38,7 +38,8 @@ def get_model_masks(
 
     If a layer supports both weight sharing and pruning, both the norm and variance of the group are calculated
     And the smaller one is considered; so if the norm is smaller, the group will be considered for pruning
-    Otherise, the group will be considered for weight sharing. Both the norm and variance are normalized, to avoid magnitude biases
+    Otherise, the group will be considered for weight sharing.
+    Both the norm and variance are normalized, to avoid magnitude biases.
 
     Args:
         - keras_model (keras.model) - Model to be masked
@@ -47,9 +48,12 @@ def get_model_masks(
         - objective (ObjectiveEstimator) - Objective to be minimized (e.g. DSP, FLOPs etc.)
         - metric (string) - Weight ranking metric - l1, l2, Oracle, saliency
         - local (boolean) - Equal layer-wise sparsity
-        - gradients (dict) - A layer-wise dictionary of weight gradients, needed for Oracle ranking
-        - hessians (dict) - A layer-wse dictionary of second gradients, needed for saliency ranking
-        - knapsack_solver (str) - Algorithm for solving Knapsack problem; recommended is to use default, unless dealing with highly dimensional problems, in which case greedy is better
+        - gradients (dict) - A layer-wise dictionary of weight gradients
+            (needed for Oracle ranking)
+        - hessians (dict) - A layer-wise dictionary of second gradients
+            (needed for saliency ranking)
+        - knapsack_solver (str) - Algorithm for solving Knapsack problem; recommended is to use default.
+            Unless dealing with highly dimensional problems, in which case greedy is better.
 
     Return:
         - masks (dict) - Layer-wise dictionary of binary tensors
@@ -86,10 +90,14 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
     for layer in keras_model.layers:
         # Care needs to be taken if layer_savings = 0
         # As long as the default attributes are used (from is_layer_optimizable(...)), this should never happen
-        # However, if the optimization attributes are manually changed it would be possible to select the structure type such that savings = 0
-        # In this case, the goal is to keep resource utilization under a certain threshold; and savings are equivalent to resources per single group
-        # Therefore, in the knapsack solver, zero-saving would remain unmasked; as it has a "weight" of zero, it is always stored in the knapsack
-        # So not masking a group without saving is as expected; however, if solved through greedy knaspack an exception will be thrown (division by zero)
+        # However, if the optimization attributes are manually changed,
+        # It would be possible to select the structure type such that savings = 0
+        # In this case, the goal is to keep resource utilization under a certain threshold;
+        # Savings are equivalent to resources per single group
+        # Therefore, in the knapsack solver, zero-saving would remain unmasked;
+        # As it has a "weight" of zero, it is always stored in the knapsack
+        # So not masking a group without saving is as expected;
+        # However, if solved through greedy knaspack an exception will be thrown (division by zero)
         if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
             layer_savings = objective.layer_savings(model_attributes[layer.name])
             layer_resources = objective.layer_resources(model_attributes[layer.name])
@@ -330,7 +338,8 @@ def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric
                     raise Exception('Block sizes need to be fators of weight matrix dimensions')
 
                 # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
-                # This method is commonly used to perform im2col, docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
+                # This method is commonly used to perform im2col,
+                # Docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
                 total_blocks = (value.shape[0] * value.shape[1]) // (block_shape[0] * block_shape[1])
                 blocks_in_row = value.shape[1] // block_shape[1]
                 blocks = np.reshape(
@@ -411,13 +420,16 @@ def __get_masks_global(keras_model, model_attributes, sparsity, objective, metri
     total_resources = []
 
     # Iterate through all layers and create a list of all the optimizable groups (single weight, structure, pattern, block)
-    # Each entry contains the value associated with the group, alongside the layer it belongs to and its location in the layer
+    # Each entry contains the value associated with the group,
+    # Alongside the layer it belongs to and its location in the layer
     # The values is normalised w.r.t to the to largest element in the group, to avoid bias towards large layers
     # We also keep track of total model resources, with respect to the objective
-    # A detailed comment in the local masking function is given for considerations on exception of layer savings and how to address them
+    # A detailed comment in the local masking function is given for
+    # Considerations on exception of layer savings and how to address them
     for layer in keras_model.layers:
         # Optimizable should be always enabled if either pruning or weight sharing are enabled
-        # However, if the objectives are implemented incorrectly, it is possible to have optimizatons enabled without any types of optimization (pruning, weight sharing etc.) enabled
+        # However, if the objectives are implemented incorrectly,
+        # It is possible to have optimizatons enabled without any types of optimization (pruning, weight sharing) enabled
         layer_optimizable = model_attributes[layer.name].optimizable and (
             model_attributes[layer.name].optimization_attributes.weight_sharing
             or model_attributes[layer.name].optimization_attributes.pruning
diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/keras/reduction.py
index 17821ef13e..bd4621ed76 100644
--- a/hls4ml/optimization/keras/reduction.py
+++ b/hls4ml/optimization/keras/reduction.py
@@ -20,7 +20,6 @@
 
 
 def reduce_model(model):
-    # TODO - Should we make Keras Surgeon a hard requirement in setup.cfg? If so, needs to be installed from git, @bo3z fork
     try:
         from kerassurgeon import Surgeon
     except ModuleNotFoundError:
diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/keras/regularizers.py
index 255036f4f3..fd8a2f3656 100644
--- a/hls4ml/optimization/keras/regularizers.py
+++ b/hls4ml/optimization/keras/regularizers.py
@@ -24,13 +24,18 @@ class DenseRegularizer(tf.keras.regularizers.Regularizer):
 
     Examples:
         - structure_type = unstructured: unstructured weight regularisation
-        - structure_type = structured: neuron regularization (group weights by row)
-        - structure_type = pattern: regularization on groups of every n-th weight (e.g. grouping by reuse factor in hls4ml)
-        - structure_type = block: regularisation on blocks within weight matrix (e.g. 4x4, 8x1 for certain SIMD processors)
-
-        - consecutive_patterns is commonly encountered with optimization of BRAM utilization - e.g. while it is true that each
-          DSP pattern consumes one DSP, they likely use less than one BRAM block (e.g. if the BRAM width is 36 bit and weight width is 16)
-          In that case, we need to group several patterns together, so the entire block of patterns can be removed, thus saving DSP and BRAM
+        - structure_type = structured: neuron regularization
+            (group weights by row)
+        - structure_type = pattern: regularization on groups of every n-th weight
+            (e.g. grouping by reuse factor in hls4ml)
+        - structure_type = block: regularisation on blocks within weight matrix
+            (e.g. 4x4, 8x1 for certain SIMD processors)
+
+        - consecutive_patterns is commonly encountered with optimization of BRAM utilization -
+            e.g. while it is true that each DSP pattern consumes one DSP,
+            They likely use less than one BRAM block (e.g. if the BRAM width is 36 bit and weight width is 16)
+            In that case, we need to group several patterns together,
+            So the entire block of patterns can be removed, thus saving DSP and BRAM
     '''
 
     def __init__(
@@ -111,7 +116,8 @@ def __call__(self, weights):
                 raise Exception(f'{self.__class__.__name__}: block sizes need to be fators of weight matrix dimensions')
 
             # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
-            # This method is commonly used to perform im2col, docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
+            # This method is commonly used to perform im2col,
+            # Docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
             total_blocks = (weights.shape[0] * weights.shape[1]) // (self.block_shape[0] * self.block_shape[1])
             blocks = tf.reshape(
                 tf.image.extract_patches(
@@ -151,15 +157,18 @@ class Conv2DRegularizer(tf.keras.regularizers.Regularizer):
         - norm (int): Norm type (l1 or l2)
         - structure_type (string): Type of regularisation - unstructured, structured, pattern
         - pattern_offset (int): Length of each pattern if structure_type == pattern
-        - weights (tf.Variable): Four-dimensional layer weight tensor, dimensionality (filter_width x filter_height x n_chan x n_filt)
+        - weights (tf.Variable): Four-dimensional layer weight tensor, dimensionality
+        (filter_width x filter_height x n_chan x n_filt)
 
     Return:
         - Regularizer penalty (tf.Variable): Penalty associated with layer weights
 
     Example use cases:
         - structure_type = unstructured: unstructured weight regularisation
-        - structure_type = structured: filter regularization (group weights of dimensionality filt_width x filt_height x n_chan)
-        - structure_type = pattern: regularization on groups of every n-th weight in flattened array (e.g. grouping by reuse factor in hls4ml)
+        - structure_type = structured: filter regularization
+            (group weights of dimensionality filt_width x filt_height x n_chan)
+        - structure_type = pattern: regularization on groups of every n-th weight in flattened array
+            (e.g. grouping by reuse factor in hls4ml)
     '''
 
     def __init__(
diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/keras/utils.py
index 92ed1e12ab..eaf34f41fc 100644
--- a/hls4ml/optimization/keras/utils.py
+++ b/hls4ml/optimization/keras/utils.py
@@ -17,7 +17,8 @@ def get_model_gradients(model, loss_fn, X, y):
         - grads (dict): Per-layer gradients of loss with respect to weights
     '''
     grads = {}
-    # While persistent GradientTape slows down execution, it is faster than performing forward pass and non-persisten GradientTape for every layer
+    # While persistent GradientTape slows down execution,
+    # Is faster than performing forward pass and non-persistent for every layer
     with tf.GradientTape(persistent=True) as tape:
         output = model(X, training=True)
         loss_value = loss_fn(y, output)
@@ -83,7 +84,7 @@ def get_model_sparsity(model):
 
     try:
         return zero_weights / total_weights, layer_sparsity
-    except:
+    except ZeroDivisionError:
         return 0.0, layer_sparsity
 
 
@@ -92,7 +93,8 @@ def get_last_layer_with_weights(model):
     '''
     Finds the last layer with weights
     The last layer with weights determined the output shape, so, pruning is sometimes not applicable to it
-    As an example, consider a network with 16 - 32 - 5 neurons - the last layer's neuron (5) cannot be removed, as they map to the data labels
+    As an example, consider a network with 16 - 32 - 5 neurons - the last layer's neuron (5) cannot be removed
+    Since they map to the data labels
     Args:
         - model (keras.model): Input model
 
diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/knapsack.py
index 5487f4347e..e198945b1b 100644
--- a/hls4ml/optimization/knapsack.py
+++ b/hls4ml/optimization/knapsack.py
@@ -15,8 +15,9 @@ def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs
         - implementation (string): Algorithm to solve Knapsack problem - dynamic programming, greedy, branch and bound
 
     Kwargs:
-        - time_limit (float): Limit (in seconds) after which the CBC or Branch & Bound should stop looking for a solution and return optimal so far
-        - scaling_factor (float): Scaling factor for floating points values in CBC or Branch & Bound (see function for details)
+        - time_limit (float): Limit (in seconds) after which the CBC or Branch & Bound should
+            stop looking for a solution and return optimal so far
+        - scaling_factor (float): Scaling factor for floating points values in CBC or B&B
 
     Return:
         - optimal_value (float): The optimal values of elements in the knapsack
@@ -47,8 +48,10 @@ def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs
                 - Time complexity: O(mn)
                 - Suitable for highly dimensional constraints or a very high number of items
 
-        - Most implementations require integer values of weights and capacities; for pruning & weight sharing this is never a problem
-            In case non-integer weights and capacities are requires, all of the values should be scaled by an appropriate scaling factor
+        - Most implementations require integer values of weights and capacities;
+            For pruning & weight sharing this is never a problem
+            In case non-integer weights and capacities are requires,
+            All of the values should be scaled by an appropriate scaling factor
     '''
     if implementation not in ('dynamic', 'greedy', 'branch_bound', 'CBC_MIP'):
         raise Exception('Unknown algorithm for solving Knapsack')
@@ -61,8 +64,9 @@ def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs
 
     if len(weights.shape) != 2:
         raise Exception(
-            'Current implementation of Knapsack assumes weight vector is 2-dimensional, to allow for multi-dimensional Knapsack problem. \
-                        If solve a one-dimensional Knapsack problem, extend dimensions of weights to a one-row matrix'
+            'Current implementation of Knapsack assumes weight vector is 2-dimensional,'
+            'to allow for multi-dimensional Knapsack problem. \
+            If solving a one-dimensional Knapsack problem, extend dimensions of weights to a one-row matrix'
         )
 
     if values.shape[0] != weights.shape[1]:
@@ -111,7 +115,8 @@ def __solve_1d_knapsack_dp(values, weights, capacity):
     Helper function to solve the 1-dimensional Knapsack problem exactly through dynamic programming
     The dynamic programming approach is only suitable for one-dimensional weight constraints
     Furthermore, it has a high computational complexity and it is not suitable for highly-dimensional arrays
-    NOTE: The weights and corresponding weight constraint need to be integers; if not, the they should be scaled and rounded beforehand
+    NOTE: The weights and corresponding weight constraint need to be integers;
+    If not, the they should be scaled and rounded beforehand
     '''
     assert len(weights.shape) == 1
 
@@ -167,8 +172,10 @@ def __solve_knapsack_greedy(values, weights, capacity):
         else:
             break
 
-    # The greedy algorithm can be sub-optimal; however, selecting the above elements or the next element that could not fit into the knapsack
-    # Will lead to solution that is at most (1/2) of the optimal solution; therefore, take whichever is higher and satisfies the constraints
+    # The greedy algorithm can be sub-optimal;
+    # However, selecting the above elements or the next element that could not fit into the knapsack
+    # Will lead to solution that is at most (1/2) of the optimal solution;
+    # Therefore, take whichever is higher and satisfies the constraints
     if values[i] > optimal and np.all(weights[:, i]) <= capacity:
         return values[i], [i]
     else:
@@ -177,16 +184,17 @@ def __solve_knapsack_greedy(values, weights, capacity):
 
 def __solve_knapsack_branch_and_bound(values, weights, capacity, time_limit=sys.float_info.max, scaling_factor=10e4):
     '''
-    Helper function to solve Knapsack problem using Branch and Bound; implemented using Google OR-Tools [weights & capacities need to be integers]
-    The algorithm explores the search space (a tree of all the posible combinations, 2^N nodes), but discards infeasible & sub-optimal solutions
+    Helper function to solve Knapsack problem using Branch and Bound;
+    Implemented using Google OR-Tools [weights & capacities need to be integers]
+    The algorithm explores the search space (a tree of all the posible combinations, 2^N nodes),
+    But discards infeasible & sub-optimal solutions
 
     Additional args:
-        - time_limit - Time limit (seconds) after which Branch & Bound search should stop and return a sub-optimal solution
-        - scaling_factor - Factor to scale floats in values arrays; OR-Tools requires all values & weights to be integers; so all of the values are scaled by a large number
+        - time_limit - Time limit in seconds
+            After which B&B search should stop and return a sub-optimal solution
+        - scaling_factor - Factor to scale floats in values arrays;
+            OR-Tools requires all values & weights to be integers;
     '''
-    # TODO - Should we make ortools a hard requirement in setup.cfg?
-    # If so, need to see if there are dependencies issues, when I installed a hard requirement was a protobuf version incompatible with tensorboard (MacOS X)
-    # Installed newer version of protobuf and then downgraded (to work with tensorboard) and ortools worked correctly
     try:
         from ortools.algorithms import pywrapknapsack_solver
     except ModuleNotFoundError:
@@ -204,11 +212,13 @@ def __solve_knapsack_branch_and_bound(values, weights, capacity, time_limit=sys.
 
 def __solve_knapsack_cbc_mip(values, weights, capacity, time_limit=sys.float_info.max, scaling_factor=10e4):
     '''
-    Helper function to solve Knapsack problem using the CBC MIP solver using Google OR-Tools [weights & capacities need to be integers]
+    Helper function to solve Knapsack problem using the CBC MIP solver using Google OR-Tools
 
     Additional args:
         - time_limit - Time limit (seconds) after which CBC solver should stop and return a sub-optimal solution
-        - scaling_factor - Factor to scale floats in values arrays; OR-Tools requires all values & weights to be integers; so all of the values are scaled by a large number
+        - scaling_factor - Factor to scale floats in values arrays;
+            OR-Tools requires all values & weights to be integers;
+            So all of the values are scaled by a large number
     '''
     try:
         from ortools.algorithms import pywrapknapsack_solver
diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/objectives/__init__.py
index bc58f56a6b..05d916d85f 100644
--- a/hls4ml/optimization/objectives/__init__.py
+++ b/hls4ml/optimization/objectives/__init__.py
@@ -16,26 +16,31 @@ class ObjectiveEstimator(ABC):
     '''
     Abstract class with methods for estimating the utilization and savings of a certain layer, with respect to some objective
     For each objective, an inherited class is written with the correct implementaton of the below methods
-    The objectives can be multi-dimensional, e.g. DSPs and BRAM [care needs to be taken when optimizing several objectives, especially if conflicting]
+    The objectives can be multi-dimensional, e.g. DSPs and BRAM
+    Care needs to be taken when optimizing several objectives, especially if conflicting
     '''
 
     @abstractmethod
     def is_layer_optimizable(self, layer_attributes):
         '''
         For a given layer, checks whether optimizations make sense, with respect to the given objective(s)
-        Furthermore, it returns the type of optimization (structured, unstructured etc.) most suitable for minimising the objective(s).
+        Furthermore, it returns the type of optimization (structured, unstructured etc.)
+        Most suitable for minimising the objective(s).
 
         Args:
             - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
 
         Return:
             - optimizable (boolean) - can optimizations be applied to this layer
-            - optimization_attributes (hls4ml.optimiation.attributes.OptimizationAttributes) - most suitable approach for optimization
+            - optimization_attributes (hls4ml.optimiation.attributes.OptimizationAttributes) -
+                Most suitable approach for optimization
 
         Examples:
             - Metric = Total weights, Layer = Dense, shape = (4, 4) -> return True, unstructured
-            - Metric = DSP, Layer = Dense, Precision = ap_fixed<8, 0> -> return False (Vivado doesn't use DSP when precision < 9)
-            - Metric = DSP, Layer = Dense, Precision = ap_fixed<16, 6> -> return True, pattern structure, both pruning and weight sharing
+            - Metric = DSP, Layer = Dense, Precision = ap_fixed<8, 0> -> return False
+                (Vivado doesn't use DSP when precision < 9)
+            - Metric = DSP, Layer = Dense, Precision = ap_fixed<16, 6> ->
+                return True, pattern structure, both pruning and weight sharing
         '''
         pass
 
@@ -65,7 +70,8 @@ def layer_savings(self, layer_attributes):
             - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
 
         Return:
-            - savings (list, int) - savings achieved (one for every dimenson of objective) with OptimizationAttributes from layer_attributes
+            - savings (list, int) - savings achieved (one for every dimenson of objective)
+                                    With OptimizationAttributes from layer_attributes
 
         Example: Metric = Total weights, Layer = Dense, shape = (4, 4):
             - structure_type == unstructured -> return [1]
@@ -75,7 +81,8 @@ def layer_savings(self, layer_attributes):
 
 
 '''
-A class containing objective estimation with the goal of minimizing the number of non-zero weights in a layer [corresponds to unstructured pruning]
+A class containing objective estimation with the goal of minimizing
+The number of non-zero weights in a layer [corresponds to unstructured pruning]
 '''
 
 
@@ -113,11 +120,12 @@ def layer_savings(self, layer_attributes):
         if not pruning:
             logging.warn(
                 'Pruning needs to be enabled to decrease the number of parameters. \
-                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+                It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
             )
             return [0]
 
-        # In this case, pruning = True and weight_sharing = False, so calculate savings incurred by removing a group of weights
+        # In this case, pruning = True and weight_sharing = False,
+        # So calculate savings incurred by removing a group of weights
         if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
             return [1]
         elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
diff --git a/hls4ml/optimization/objectives/gpu_objectives.py b/hls4ml/optimization/objectives/gpu_objectives.py
index 1f8014171a..8528a31839 100644
--- a/hls4ml/optimization/objectives/gpu_objectives.py
+++ b/hls4ml/optimization/objectives/gpu_objectives.py
@@ -58,7 +58,8 @@ def layer_savings(self, layer_attributes):
             )
             return [0]
 
-        # TODO - The below formulas underestimate FLOP savings [removing a filter in a lyer removes channels / neurons in subsequent layers]
+        # TODO - The below formulas underestimate FLOP savings
+        # Removing a filter in a layer removes channels / neurons in subsequent layers
         if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
             if 'Dense' in layer_attributes.layer_type.__name__:
                 return [2 * layer_attributes.weight_shape[0] + 1]
diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/objectives/vivado_objectives.py
index bba25dbb84..c0c0c33e09 100644
--- a/hls4ml/optimization/objectives/vivado_objectives.py
+++ b/hls4ml/optimization/objectives/vivado_objectives.py
@@ -214,21 +214,24 @@ def is_layer_optimizable(self, layer_attributes):
         ):
             return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
 
-        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so structured pruning is the most suitable, it reduces the size out output before compile-time
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs,
+        # So structured pruning is the most suitable, it reduces the size out output before compile-time
         if (
             layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
             and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
         ):
             return True, OptimizationAttributes(SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False)
 
-        # In io_parallel with Latency, weights and activation tensors are all stored in FFs, so it is equivalent to unstructured, high sparsity pruning
+        # In io_parallel with Latency, weights and activation tensors are all stored in FFs,
+        # So it is equivalent to unstructured, high sparsity pruning
         if (
             layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
             and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
         ):
             return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
 
-    # TODO - This method is inaccurate (accross all cases); in general, estimating FFs is hard, but as long as it is consistent(ly wrong), it should not matter for the pruning
+    # TODO - This method is inaccurate (accross all cases); in general, estimating FFs is hard,
+    # But as long as it is consistent(ly wrong), it should not matter for the pruning
     @classmethod
     def layer_resources(self, layer_attributes):
         if not layer_attributes.weight_shape:
@@ -250,7 +253,8 @@ def layer_resources(self, layer_attributes):
                 np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width
             ]
 
-        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so FF ~ number_of_outputs x weight_precision
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs,
+        # So FF ~ number_of_outputs x weight_precision
         if (
             layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
             and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
@@ -259,7 +263,8 @@ def layer_resources(self, layer_attributes):
                 np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width
             ]
 
-        # In io_parallel with Latency, weights and latency are all stored in FFs, so it is equivalent to the sum of the above two cases
+        # In io_parallel with Latency, weights and latency are all stored in FFs,
+        # So it is equivalent to the sum of the above two cases
         if (
             layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
             and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
@@ -326,7 +331,8 @@ def layer_savings(self, layer_attributes):
                     * layer_attributes.args['hls4ml_attributes'].weight_precision.width
                 ]
 
-        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs, so only structured pruning helps
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs,
+        # So only structured pruning helps
         if (
             layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
             and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/scheduler.py
index 9b390dc8a4..347427d5dd 100644
--- a/hls4ml/optimization/scheduler.py
+++ b/hls4ml/optimization/scheduler.py
@@ -51,7 +51,9 @@ def get_sparsity(self):
 
 class ConstantScheduler(OptimizationScheduler):
     '''
-    Sparsity updated by a constant term, until (i) sparsity target reached (ii) optimization algorithm stops requesting state updates
+    Sparsity updated by a constant term, until
+        (i) sparsity target reached OR
+        (ii) optimization algorithm stops requesting state updates
     '''
 
     def __init__(self, initial_sparsity=0, final_sparsity=1.0, update_step=0.05):
@@ -66,9 +68,12 @@ def update_step(self):
             return False, self.sparsity
 
     '''
-    In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
-    In this case, constant sparsity (since it increments by a small amount every time), will simply jump to the next sparsity level
-    The model's performance over several sparsity levels optimization is stoped after high loss over several trials (see top level pruning/optimization function)
+    In certain cases, a model might underperform at the current sparsity level,
+    But perform better at a higher sparsity
+    In this case, constant sparsity (since it increments by a small amount every time),
+    Will simply jump to the next sparsity level
+    The model's performance over several sparsity levels optimization is tracked and S
+    Stopped after high loss over several trials (see top level pruning/optimization function)
 
     '''
 
@@ -108,9 +113,13 @@ def repair_step(self):
 
 class PolynomialScheduler(OptimizationScheduler):
     '''
-    Sparsity updated by at a polynomial decay, until (i) sparsity target reached (ii) optimization algorithm stops requesting state updates
-    For more information, see Zhu & Gupta (2016) - 'To prune, or not to prune: exploring the efficacy of pruning for model compression' and TensorFlow Model Optimization library
-    Note, the implementation is slightly different, as TensorFlow Prune API depends on the total number of epochs and update frequency
+    Sparsity updated by at a polynomial decay, until
+        (i) sparsity target reached OR
+        (ii) optimization algorithm stops requesting state updates
+    For more information, see Zhu & Gupta (2016) -
+        'To prune, or not to prune: exploring the efficacy of pruning for model compression'
+    Note, the implementation is slightly different,
+    Since TensorFlow Prune API depends on the total number of epochs and update frequency
     '''
 
     def __init__(self, maximum_steps, initial_sparsity=0, final_sparsity=1.0, decay_power=3):
@@ -132,7 +141,8 @@ def update_step(self):
     '''
     In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
     In this case, polynomial sparsity, will simply jump to the next sparsity level
-    The model's performance over several sparsity levels optimization is stoped after high loss over several trials (see top level pruning/optimization function)
+    The model's performance over several sparsity levels optimization is tracked and
+    toped after high loss over several trials (see top level pruning/optimization function)
     '''
 
     def repair_step(self):
diff --git a/setup.cfg b/setup.cfg
index 58ba20f99d..61f9a3dddc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -40,6 +40,10 @@ pytest_randomly.random_seeder =
     hls4ml = hls4ml:reseed
 
 [options.extras_require]
+optimization =
+    keras-surgeon@https://github.com/fastmachinelearning/keras-surgeon
+    ortools
+    packaging
 profiling =
     matplotlib
     pandas
diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml
index 934cd68d3e..36b2b012f4 100644
--- a/test/pytest/ci-template.yml
+++ b/test/pytest/ci-template.yml
@@ -7,7 +7,7 @@
     - source ~/.bashrc
     - if [ $EXAMPLEMODEL == 1 ]; then git submodule init; git submodule update; fi
     - conda activate hls4ml-testing
-    - pip install .[profiling]
+    - pip install .[profiling,optimization]
   script:
     - cd test/pytest
     - pytest $PYTESTFILE -rA --cov-report xml --cov-report term --cov=hls4ml --junitxml=report.xml --randomly-seed=42 --randomly-dont-reorganize --randomly-dont-reset-seed
diff --git a/test/pytest/optimization/test_attributes.py b/test/pytest/test_optimization/test_attributes.py
similarity index 94%
rename from test/pytest/optimization/test_attributes.py
rename to test/pytest/test_optimization/test_attributes.py
index 760f307133..2669321e09 100644
--- a/test/pytest/optimization/test_attributes.py
+++ b/test/pytest/test_optimization/test_attributes.py
@@ -49,7 +49,7 @@ def test_attributes():
     assert model_attributes['conv2d'].weight_shape == (3, 3, 3, 6)
     assert model_attributes['conv2d'].input_shape == (8, 8, 3)
     assert model_attributes['conv2d'].output_shape == (8, 8, 6)
-    assert model_attributes['conv2d'].optimizable == False
+    assert not model_attributes['conv2d'].optimizable
     assert model_attributes['conv2d'].args['hls4ml_attributes'].n_in == 9
     assert model_attributes['conv2d'].args['hls4ml_attributes'].n_out == 6
     assert model_attributes['conv2d'].args['hls4ml_attributes'].io_type == io_type
@@ -64,7 +64,7 @@ def test_attributes():
     assert model_attributes['flatten'].weight_shape == ()
     assert model_attributes['flatten'].input_shape == (8, 8, 6)
     assert model_attributes['flatten'].output_shape == (384,)
-    assert model_attributes['flatten'].optimizable == False
+    assert not model_attributes['flatten'].optimizable
 
     # Flatten is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
     assert 'hls4ml_attributes' not in model_attributes['flatten'].args
@@ -75,7 +75,7 @@ def test_attributes():
     assert model_attributes['dense'].weight_shape == (384, 16)
     assert model_attributes['dense'].input_shape == (384,)
     assert model_attributes['dense'].output_shape == (16,)
-    assert model_attributes['dense'].optimizable == False
+    assert not model_attributes['dense'].optimizable
     assert model_attributes['dense'].args['hls4ml_attributes'].n_in == 384
     assert model_attributes['dense'].args['hls4ml_attributes'].n_out == 16
     assert model_attributes['dense'].args['hls4ml_attributes'].io_type == io_type
@@ -90,7 +90,7 @@ def test_attributes():
     assert model_attributes['relu'].weight_shape == ()
     assert model_attributes['relu'].input_shape == (16,)
     assert model_attributes['relu'].output_shape == (16,)
-    assert model_attributes['relu'].optimizable == False
+    assert not model_attributes['relu'].optimizable
 
     # ReLU is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
     assert 'hls4ml_attributes' not in model_attributes['relu'].args
diff --git a/test/pytest/optimization/keras/test_masking.py b/test/pytest/test_optimization/test_keras/test_masking.py
similarity index 98%
rename from test/pytest/optimization/keras/test_masking.py
rename to test/pytest/test_optimization/test_keras/test_masking.py
index aa9b9be564..5c5e60aca7 100644
--- a/test/pytest/optimization/keras/test_masking.py
+++ b/test/pytest/test_optimization/test_keras/test_masking.py
@@ -125,7 +125,7 @@ def test_dense_masking_pattern(local_masking, dense):
     model_attributes['dense'].optimization_attributes.pattern_offset = 4
     model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
 
-    # 33% sparsity - zero 4 out of 12 weights, group by pattern [0.33 * 12 = 3.96] - so will select 2 patterns, 6 weights (>=)
+    # 33% sparsity - zero 4 from 12 weights, group by pattern [0.33 * 12 = 3.96] - so will select 2 patterns, 6 weights (>=)
     masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
     zeros = np.array([[0, 0], [1, 0], [2, 0], [0, 2], [1, 2], [2, 2]], dtype=np.int32)
     nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
@@ -138,6 +138,9 @@ def test_dense_masking_pattern(local_masking, dense):
 # Create a Dense layer with artificial weights, so that the 1st and 4th block are pruned
 @pytest.mark.parametrize('local_masking', local_masking)
 @pytest.mark.parametrize('dense', dense_layers)
+@pytest.mark.skip(
+    reason='Currently disabled as no benefits from block pruning are achieved for hls4ml.'
+)  # TODO - Enable when fully tested
 def test_dense_masking_block(local_masking, dense):
     weight_shape = (4, 6)
     model = Sequential()
@@ -323,7 +326,7 @@ def test_conv2d_block_masking_raises_exception(local_masking, conv2d):
     except Exception:
         assert True
         return
-    assert False
+    assert not True
 
 
 # Test edge cases: 0% and 100% sparsity
diff --git a/test/pytest/optimization/keras/test_reduction.py b/test/pytest/test_optimization/test_keras/test_reduction.py
similarity index 94%
rename from test/pytest/optimization/keras/test_reduction.py
rename to test/pytest/test_optimization/test_keras/test_reduction.py
index 1f3a0084a2..e07cbab656 100644
--- a/test/pytest/optimization/keras/test_reduction.py
+++ b/test/pytest/test_optimization/test_keras/test_reduction.py
@@ -1,3 +1,6 @@
+import keras
+import pytest
+from packaging import version
 from qkeras import QActivation, QConv2D, QDense, quantized_bits
 from tensorflow.keras.layers import AveragePooling2D, BatchNormalization, Conv2D, Dense, Flatten, MaxPooling2D, ReLU, Softmax
 from tensorflow.keras.models import Sequential
@@ -12,6 +15,9 @@
 '''
 
 
+@pytest.mark.skipif(
+    version.parse(keras.__version__) > version.parse('2.12.0'), reason='Keras Surgeon only works until Keras 2.12'
+)
 def test_keras_model_reduction():
     model = Sequential()
     model.add(Conv2D(8, (3, 3), input_shape=(64, 64, 1), name='conv2d_1', padding='same'))
@@ -65,6 +71,9 @@ def test_keras_model_reduction():
     assert layer_sparsity['dense_3'] > 0
 
 
+@pytest.mark.skipif(
+    version.parse(keras.__version__) > version.parse('2.12.0'), reason='Keras Surgeon only works until Keras 2.12'
+)
 def test_qkeras_model_reduction():
     bits = 8
     activation = 'quantized_relu(4)'
diff --git a/test/pytest/optimization/keras/test_regularizers.py b/test/pytest/test_optimization/test_keras/test_regularizers.py
similarity index 100%
rename from test/pytest/optimization/keras/test_regularizers.py
rename to test/pytest/test_optimization/test_keras/test_regularizers.py
diff --git a/test/pytest/optimization/keras/test_weight_sharing.py b/test/pytest/test_optimization/test_keras/test_weight_sharing.py
similarity index 100%
rename from test/pytest/optimization/keras/test_weight_sharing.py
rename to test/pytest/test_optimization/test_keras/test_weight_sharing.py
diff --git a/test/pytest/optimization/test_knapsack.py b/test/pytest/test_optimization/test_knapsack.py
similarity index 100%
rename from test/pytest/optimization/test_knapsack.py
rename to test/pytest/test_optimization/test_knapsack.py
diff --git a/test/pytest/optimization/test_objectives.py b/test/pytest/test_optimization/test_objectives.py
similarity index 100%
rename from test/pytest/optimization/test_objectives.py
rename to test/pytest/test_optimization/test_objectives.py
diff --git a/test/pytest/optimization/test_scheduler.py b/test/pytest/test_optimization/test_scheduler.py
similarity index 100%
rename from test/pytest/optimization/test_scheduler.py
rename to test/pytest/test_optimization/test_scheduler.py

From 7c2d128b6b974fa48f1618e2971ed8daca1a2d10 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Fri, 15 Sep 2023 19:32:44 +0200
Subject: [PATCH 14/19] Fix failing tests & GitHub warnings

---
 docs/advanced/model_optimization.rst |  9 ++++++++-
 setup.cfg                            |  4 ++--
 test/pytest/generate_ci_yaml.py      | 21 +++++++++++++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
index 07afcfa48f..9465353c2e 100644
--- a/docs/advanced/model_optimization.rst
+++ b/docs/advanced/model_optimization.rst
@@ -80,6 +80,7 @@ To optimize GPU FLOPs, the code is similar to above:
         baseline_model, model_attributes, GPUFLOPEstimator, scheduler,
         X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
     )
+
     # Evaluate optimized model
     y_optimized = optimized_model.predict(X_test)
     acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
@@ -109,9 +110,15 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
     # Note the change from ParameterEstimator to VivadoDSPEstimator
     optimized_model = optimize_keras_for_hls4ml(
         baseline_model, model_attributes, VivadoDSPEstimator, scheduler,
-        X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
+        X_train, y_train, X_val, y_val, batch_size, epochs,
+        optimizer, loss_fn, metric, increasing, rtol
     )
 
+    # Evaluate optimized model
+    y_optimized = optimized_model.predict(X_test)
+    acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
+    print(f'Optimized Keras accuracy: {acc_optimized}')
+
 There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation.
 Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config:
 .. code-block:: Python
diff --git a/setup.cfg b/setup.cfg
index 8e0fdd4ba1..24e78e4bd6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,8 +41,8 @@ pytest_randomly.random_seeder =
 
 [options.extras_require]
 optimization =
-    keras-surgeon@https://github.com/fastmachinelearning/keras-surgeon
-    ortools
+    keras-surgeon@git+https://github.com/fastmachinelearning/keras-surgeon.git
+    ortools==9.4.1874
     packaging
 profiling =
     matplotlib
diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py
index 6d816a7abb..46c1fa9e5b 100644
--- a/test/pytest/generate_ci_yaml.py
+++ b/test/pytest/generate_ci_yaml.py
@@ -32,5 +32,26 @@ def uses_example_model(test_filename):
     else:
         yml.update(new_yml)
 
+# hls4ml Optimization API
+tests = glob.glob('test_optimization/test_*.py')
+for test in tests:
+    name = test.replace('test_optimization/', '').replace('test_', '').replace('.py', '')
+    new_yml = yaml.safe_load(template.format(name, f'test_optimization/test_{name}.py', int(uses_example_model(test))))
+    if yml is None:
+        yml = new_yml
+    else:
+        yml.update(new_yml)
+
+tests = glob.glob('test_optimization/test_keras/test_*.py')
+for test in tests:
+    name = test.replace('test_optimization/test_keras/', '').replace('test_', '').replace('.py', '')
+    new_yml = yaml.safe_load(
+        template.format(name, f'test_optimization/test_keras/test_{name}.py', int(uses_example_model(test)))
+    )
+    if yml is None:
+        yml = new_yml
+    else:
+        yml.update(new_yml)
+
 yamlfile = open('pytests.yml', 'w')
 yaml.safe_dump(yml, yamlfile)

From c4a5a0f1ded24ca12346ff0c50f30254225843f9 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst <bramhorst27@gmail.com>
Date: Mon, 4 Dec 2023 14:03:05 +0100
Subject: [PATCH 15/19] Fix optimization failing PyTests

---
 hls4ml/optimization/keras/reduction.py         |  6 ++++--
 setup.cfg                                      |  2 +-
 test/pytest/generate_ci_yaml.py                | 18 ++++++++++--------
 .../test_keras/test_reduction.py               |  4 ++++
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/keras/reduction.py
index bd4621ed76..899f7f350f 100644
--- a/hls4ml/optimization/keras/reduction.py
+++ b/hls4ml/optimization/keras/reduction.py
@@ -8,7 +8,9 @@
 Function for removing zero neurons & filters from a model and rewiring the model graph
 This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon
 Keras Surgeon is no longer under active development and does not work for TensorFlow 2.3+ and QKeras
-The baseline version was forked and updated, available at: https://github.com/bo3z/keras-surgeon
+The baseline version was forked and updated, available at: https://github.com/fastmachinelearning/keras-surgeon
+
+IMPORTANT: To use this funcionality please install separately from the above GitHub.
 
 Args:
     - model (keras.model): Input model
@@ -26,7 +28,7 @@ def reduce_model(model):
         raise Exception(
             'Keras Surgeon not installed. Unable to reduce model footprint '
             'Please install up-to-date Keras Surgeon compatible wit TensorFlow 2.3+ and QKeras '
-            'Installation from git: https://github.com/bo3z/keras-surgeon'
+            'Installation from git: https://github.com/fastmachinelearning/keras-surgeon'
         )
 
     # Initiate surgeon
diff --git a/setup.cfg b/setup.cfg
index 24e78e4bd6..f1cab47f7f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ pytest_randomly.random_seeder =
 
 [options.extras_require]
 optimization =
-    keras-surgeon@git+https://github.com/fastmachinelearning/keras-surgeon.git
+    keras-tuner==1.1.3
     ortools==9.4.1874
     packaging
 profiling =
diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py
index 46c1fa9e5b..107cabdbbb 100644
--- a/test/pytest/generate_ci_yaml.py
+++ b/test/pytest/generate_ci_yaml.py
@@ -44,14 +44,16 @@ def uses_example_model(test_filename):
 
 tests = glob.glob('test_optimization/test_keras/test_*.py')
 for test in tests:
-    name = test.replace('test_optimization/test_keras/', '').replace('test_', '').replace('.py', '')
-    new_yml = yaml.safe_load(
-        template.format(name, f'test_optimization/test_keras/test_{name}.py', int(uses_example_model(test)))
-    )
-    if yml is None:
-        yml = new_yml
-    else:
-        yml.update(new_yml)
+    # For now, skip Keras Surgeon [conflicting versions]
+    if 'test_reduction' not in test:
+        name = test.replace('test_optimization/test_keras/', '').replace('test_', '').replace('.py', '')
+        new_yml = yaml.safe_load(
+            template.format(name, f'test_optimization/test_keras/test_{name}.py', int(uses_example_model(test)))
+        )
+        if yml is None:
+            yml = new_yml
+        else:
+            yml.update(new_yml)
 
 yamlfile = open('pytests.yml', 'w')
 yaml.safe_dump(yml, yamlfile)
diff --git a/test/pytest/test_optimization/test_keras/test_reduction.py b/test/pytest/test_optimization/test_keras/test_reduction.py
index e07cbab656..7243a9123f 100644
--- a/test/pytest/test_optimization/test_keras/test_reduction.py
+++ b/test/pytest/test_optimization/test_keras/test_reduction.py
@@ -1,3 +1,4 @@
+# Skip Keras Surgeon tests for now, due to conflicting PyTest versions
 import keras
 import pytest
 from packaging import version
@@ -8,6 +9,9 @@
 from hls4ml.optimization.keras.reduction import reduce_model
 from hls4ml.optimization.keras.utils import get_model_sparsity
 
+pytest.skip(allow_module_level=True)
+
+
 '''
 Set some neurons / filters to zero and verify that these are removed
 Even is some neurons (columns) in the output layer are zero, these should not be removed (to match data set labels)

From 7a26a9afc0f144055c5175341688abf7dac42cc5 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar <vloncar@users.noreply.github.com>
Date: Thu, 25 Jan 2024 19:52:57 +0100
Subject: [PATCH 16/19] Cleanup docstrings

---
 docs/advanced/model_optimization.rst       | 13 +++-
 docs/index.rst                             |  2 +
 hls4ml/optimization/__init__.py            | 65 +++++++++---------
 hls4ml/optimization/attributes.py          | 60 ++++++++--------
 hls4ml/optimization/keras/__init__.py      | 80 +++++++++++-----------
 hls4ml/optimization/keras/builder.py       | 68 +++++++++---------
 hls4ml/optimization/keras/masking.py       | 26 +++----
 hls4ml/optimization/keras/reduction.py     | 25 ++++---
 hls4ml/optimization/keras/regularizers.py  | 52 +++++++-------
 hls4ml/optimization/keras/utils.py         | 45 ++++++------
 hls4ml/optimization/knapsack.py            |  6 +-
 hls4ml/optimization/objectives/__init__.py | 45 ++++++------
 hls4ml/optimization/scheduler.py           | 38 ++++++----
 13 files changed, 278 insertions(+), 247 deletions(-)

diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
index 9465353c2e..14813f7217 100644
--- a/docs/advanced/model_optimization.rst
+++ b/docs/advanced/model_optimization.rst
@@ -1,6 +1,6 @@
-========================
-hls4ml Optimization API
-========================
+=================================
+Hardware-aware Optimization API
+=================================
 
 Pruning and weight sharing are effective techniques to reduce model footprint and computational requirements. The hls4ml Optimization API introduces hardware-aware pruning and weight sharing.
 By defining custom objectives, the algorithm solves a Knapsack optimization problem aimed at maximizing model performance, while keeping the target resource(s) at a minimum. Out-of-the box objectives include network sparsity, GPU FLOPs, Vivado DSPs, memory utilization etc.
@@ -8,6 +8,7 @@ By defining custom objectives, the algorithm solves a Knapsack optimization prob
 The code block below showcases three use cases of the hls4ml Optimization API - network sparsity (unstructured pruning), GPU FLOPs (structured pruning) and Vivado DSP utilization (pattern pruning). First, we start with unstructured pruning:
 
 .. code-block:: Python
+
     from sklearn.metrics import accuracy_score
     from tensorflow.keras.optimizers import Adam
     from tensorflow.keras.metrics import CategoricalAccuracy
@@ -71,7 +72,9 @@ In a similar manner, it is possible to target GPU FLOPs or Vivado DSPs. However,
 Instead, it is the sparsity of the target resource. As an example: Starting with a network utilizing 512 DSPs and a final sparsity of 50%; the optimized network will use 256 DSPs.
 
 To optimize GPU FLOPs, the code is similar to above:
+
 .. code-block:: Python
+
     from hls4ml.optimization.objectives.gpu_objectives import GPUFLOPEstimator
 
     # Optimize model
@@ -91,7 +94,9 @@ To optimize GPU FLOPs, the code is similar to above:
     print(optimized_model.summary())
 
 Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
+
 .. code-block:: Python
+
     from hls4ml.utils.config import config_from_keras_model
     from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator
 
@@ -121,7 +126,9 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
 
 There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation.
 Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config:
+
 .. code-block:: Python
+
     hls_config = config_from_keras_model(optimized_model)
     hls_config['Model']['DenseResourceImplementation'] = 'Unrolled'
     # Any addition hls4ml config, such as strategy, reuse factor etc...
diff --git a/docs/index.rst b/docs/index.rst
index 805a0a06c5..c21b90aebc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -25,6 +25,7 @@
     advanced/fifo_depth
     advanced/extension
     advanced/accelerator
+    advanced/model_optimization
 
 .. toctree::
     :hidden:
@@ -34,6 +35,7 @@
     autodoc/hls4ml.backends
     autodoc/hls4ml.converters
     autodoc/hls4ml.model
+    autodoc/hls4ml.optimization
     autodoc/hls4ml.report
     autodoc/hls4ml.utils
     autodoc/hls4ml.writer
diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
index d8301f5892..da913f7dcb 100644
--- a/hls4ml/optimization/__init__.py
+++ b/hls4ml/optimization/__init__.py
@@ -37,41 +37,42 @@ def optimize_keras_for_hls4ml(
     Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s)
 
     Args:
-    - keras_model (keras.Model): Model to be optimized
-    - hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...)
-    - objective (hls4ml.optimization.objectives.ObjectiveEstimator):
+        keras_model (keras.Model): Model to be optimized
+        hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...)
+        objective (hls4ml.optimization.objectives.ObjectiveEstimator):
         Parameter, hardware or user-defined objective of optimization
-    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler):
+        scheduler (hls4ml.optimization.scheduler.OptimizationScheduler):
         Sparsity scheduler, choose between constant, polynomial and binary
-    - X_train (np.array): Training inputs
-    - y_train (np.array): Training labels
-    - X_val (np.array): Validation inputs
-    - y_val (np.array): Validation labels
-    - batch_size (int): Batch size during training
-    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
-    - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
-    - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
-    - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
-    - increasing (boolean): If the metric improves with increased values;
-        e.g. accuracy -> increasing = True, MSE -> increasing = False
-    - rtol (float): Relative tolerance;
-        pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+        X_train (np.array): Training inputs
+        y_train (np.array): Training labels
+        X_val (np.array): Validation inputs
+        y_val (np.array): Validation labels
+        batch_size (int): Batch size during training
+        epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+        optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
+        loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
+        validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
+        increasing (boolean): If the metric improves with increased values;
+            e.g. accuracy -> increasing = True, MSE -> increasing = False
+        rtol (float): Relative tolerance;
+            pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+        callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
+        ranking_metric (string): Metric used for ranking weights and structures;
+            currently supported l1, l2, saliency and Oracle
+        local (boolean): Layer-wise or global pruning
+        verbose (boolean): Display debug logs during model optimization
+        rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
+            allows regrowth of previously pruned weights
+        cutoff_bad_trials (int): After how many bad trials (performance below threshold),
+            should model pruning / weight sharing stop
+        directory (string): Directory to store temporary results
+        tuner (str): Tuning algorithm, choose between Bayesian, Hyperband and None
+        knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
+            default usually works well; for very large networks, greedy algorithm might be more suitable
+        regularization_range (list): List of suitable hyperparameters for weight decay
 
-    Kwargs:
-    - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
-    - ranking_metric (string): Metric used for rannking weights and structures;
-        currently supported l1, l2, saliency and Oracle
-    - local (boolean): Layer-wise or global pruning
-    - verbose (boolean): Display debug logs during model optimization
-    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
-        allows regrowth of previously pruned weights
-    - cutoff_bad_trials (int): After how many bad trials (performance below threshold),
-        should model pruning / weight sharing stop
-    - directory (string): Directory to store temporary results
-    - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
-    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
-        default usually works well; for very large networks, greedy algorithm might be more suitable
-    - regularization_range (list): List of suitable hyperparameters for weight decay
+    Returns:
+        keras.Model: Optimized model
     '''
 
     # Extract model attributes
diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/attributes.py
index 672e2706a0..a7b6d74135 100644
--- a/hls4ml/optimization/attributes.py
+++ b/hls4ml/optimization/attributes.py
@@ -11,14 +11,14 @@ class hls4mlAttributes:
     A class for storing hls4ml information of a single layer
 
     Args:
-    - n_in (int): Number of inputs (rows) for Dense matrix multiplication
-    - n_out (int): Number of outputs (cols) for Dense matrix multiplication
-    - io_type (string): io_parallel or io_stream
-    - strategy (string): Resource or Latency
-    - weight_precision (FixedPrecisionType): Layer weight precision
-    - output_precision (FixedPrecisionType): Layer output precision
-    - reuse_factor (int): Layer reuse factor
-    - parallelization_factor (int): Layer parallelization factor - [applicable to io_parallel Conv2D]
+        n_in (int): Number of inputs (rows) for Dense matrix multiplication
+        n_out (int): Number of outputs (cols) for Dense matrix multiplication
+        io_type (string): io_parallel or io_stream
+        strategy (string): Resource or Latency
+        weight_precision (FixedPrecisionType): Layer weight precision
+        output_precision (FixedPrecisionType): Layer output precision
+        reuse_factor (int): Layer reuse factor
+        parallelization_factor (int): Layer parallelization factor - [applicable to io_parallel Conv2D]
     '''
 
     def __init__(
@@ -51,12 +51,12 @@ class OptimizationAttributes:
     A class for storing layer optimization attributes
 
     Args:
-        - structure_type (enum): Targeted structure - unstructured, structured, pattern, block
-        - pruning (boolean): Should pruning be applied to the layer
-        - weight_sharing (boolean): Should weight sharing be applied to the layer
-        - block_shape (tuple): Block shape if structure_type == block
-        - pattern_offset (int): Length of each pattern if structure_type == pattern
-        - consecutive_patterns (int): How many consecutive patterns are grouped together if structure_type == pattern
+        structure_type (enum): Targeted structure - unstructured, structured, pattern, block
+        pruning (boolean): Should pruning be applied to the layer
+        weight_sharing (boolean): Should weight sharing be applied to the layer
+        block_shape (tuple): Block shape if structure_type == block
+        pattern_offset (int): Length of each pattern if structure_type == pattern
+        consecutive_patterns (int): How many consecutive patterns are grouped together if structure_type == pattern
 
     Notes:
         - In the case of hls4ml, pattern_offset is equivalent to the number of weights processed in parallel
@@ -88,16 +88,16 @@ class LayerAttributes:
     A class for storing layer information
 
     Args:
-        - name (string): Layer name
-        - layer_type (keras.Layer): Layer type (e.g. Dense, Conv2D etc.)
-        - inbound_layers (list): List of parent nodes, identified by name
-        - weight_shape (tuple): Layer weight shape
-        - input_shape (tuple): Layer input shape
-        - output_shape (tuple): Layer output shape
-        - optimizable (bool): Should optimizations (pruning, weight sharing) be applied to this layer
-        - optimization_attributes (OptimizationAttributes): Type of optimization,
+        name (string): Layer name
+        layer_type (keras.Layer): Layer type (e.g. Dense, Conv2D etc.)
+        inbound_layers (list): List of parent nodes, identified by name
+        weight_shape (tuple): Layer weight shape
+        input_shape (tuple): Layer input shape
+        output_shape (tuple): Layer output shape
+        optimizable (bool): Should optimizations (pruning, weight sharing) be applied to this layer
+        optimization_attributes (OptimizationAttributes): Type of optimization,
             pruning or weight sharing, block shape and pattern offset
-        - args (dict): Additional information,
+        args (dict): Additional information,
             e.g. hls4mlAttributes; dictionary so it can be generic enough for different platforms
     '''
 
@@ -147,10 +147,10 @@ def get_attributes_from_keras_model(model):
     Per-layer pruning sype (structured, pattern etc.), depend on the pruning objective and are inserted later
 
     Args:
-        - model (keras.model): Model to extract attributes from
+        model (keras.model): Model to extract attributes from
 
-    Return:
-        - model_attributes (dict): Each key corresponds to a layer name, values are instances of LayerAttribute
+    Returns:
+        model_attributes (dict): Each key corresponds to a layer name, values are instances of LayerAttribute
     '''
     is_sequential = model.__class__.__name__ == 'Sequential'
     model_attributes = {}
@@ -188,11 +188,11 @@ def get_attributes_from_keras_model_and_hls4ml_config(model, config):
     Per-layer pruning sype (structured, pruning etc.), depend on the pruning objective and are inserted later
 
     Args:
-        - model (keras.model): Model to extract attributes from
-        - config (dict): hls4ml dictionary
+        model (keras.model): Model to extract attributes from
+        config (dict): hls4ml dictionary
 
-    Return:
-        - model_attributes (dict): Each key corresponds to a layer name, values are LayerAttribute instances
+    Returns:
+        model_attributes (dict): Each key corresponds to a layer name, values are LayerAttribute instances
     '''
 
     # Extract Keras attributes
diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/keras/__init__.py
index a07f0adac7..d67ddd5d26 100644
--- a/hls4ml/optimization/keras/__init__.py
+++ b/hls4ml/optimization/keras/__init__.py
@@ -50,45 +50,43 @@ def optimize_model(
     Top-level function for optimizing a Keras model, given objectives
 
     Args:
-    - model (keras.Model): Model to be optimized
-    - model_attributes (dict): Layer-wise model attributes,
-        obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
-    - objective (hls4ml.optimization.objectives.ObjectiveEstimator):
-        Parameter, hardware or user-defined objective of optimization
-    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler):
-        Sparsity scheduler, choose between constant, polynomial and binary
-    - X_train (np.array): Training inputs
-    - y_train (np.array): Training labels
-    - X_val (np.array): Validation inputs
-    - y_val (np.array): Validation labels
-    - batch_size (int): Batch size during training
-    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
-    - optimizer (keras.optimizers.Optimizer or equivalent-string description):
-        Optimizer used during training
-    - loss_fn (keras.losses.Loss or equivalent loss description):
-        Loss function used during training
-    - validation_metric (keras.metrics.Metric or equivalent loss description):
-        Validation metric, used as a baseline
-    - increasing (boolean): If the metric improves with increased values;
-        e.g. accuracy -> increasing = True, MSE -> increasing = False
-    - rtol (float): Relative tolerance;
-        pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
-
-    Kwargs:
-    - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
-    - ranking_metric (string): Metric used for rannking weights and structures;
-        currently supported l1, l2, saliency and Oracle
-    - local (boolean): Layer-wise or global pruning
-    - verbose (boolean): Display debug logs during model optimization
-    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
-        allows regrowth of previously pruned weights
-    - cutoff_bad_trials (int): After how many bad trials (performance below threshold),
-        should model pruning / weight sharing stop
-    - directory (string): Directory to store temporary results
-    - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
-    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
-        default usually works well; for very large networks, greedy algorithm might be more suitable
-    - regularization_range (list): List of suitable hyperparameters for weight decay
+        model (keras.Model): Model to be optimized
+        model_attributes (dict): Layer-wise model attributes,
+            obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
+        objective (hls4ml.optimization.objectives.ObjectiveEstimator):
+            Parameter, hardware or user-defined objective of optimization
+        scheduler (hls4ml.optimization.scheduler.OptimizationScheduler):
+            Sparsity scheduler, choose between constant, polynomial and binary
+        X_train (np.array): Training inputs
+        y_train (np.array): Training labels
+        X_val (np.array): Validation inputs
+        y_val (np.array): Validation labels
+        batch_size (int): Batch size during training
+        epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+        optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
+        loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
+        validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
+        increasing (boolean): If the metric improves with increased values;
+            e.g. accuracy -> increasing = True, MSE -> increasing = False
+        rtol (float): Relative tolerance;
+            pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+        callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
+        ranking_metric (string): Metric used for ranking weights and structures;
+            currently supported l1, l2, saliency and Oracle
+        local (boolean): Layer-wise or global pruning
+        verbose (boolean): Display debug logs during model optimization
+        rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
+            allows regrowth of previously pruned weights
+        cutoff_bad_trials (int): After how many bad trials (performance below threshold),
+            should model pruning / weight sharing stop
+        directory (string): Directory to store temporary results
+        tuner (str): Tuning algorithm, choose between Bayesian, Hyperband and None
+        knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
+            default usually works well; for very large networks, greedy algorithm might be more suitable
+        regularization_range (list): List of suitable hyperparameters for weight decay
+
+    Returns:
+        keras.Model: Optimized model
     '''
 
     if not isinstance(scheduler, OptimizationScheduler):
@@ -213,7 +211,7 @@ def optimize_model(
 
         # Mask gradients
         # Before training the model at the next sparsity level, reset internal states
-        # Furthemore, modern optimizers (e.g. Adam) accumulate gradients during backprop
+        # Furthermore, modern optimizers (e.g. Adam) accumulate gradients during backprop
         # Therefore, even if the gradient for a weight is zero, it might be updated, due to previous gradients
         # Avoid this by resetting the internal variables of an optimizer
         optimizable_model.reset_metrics()
@@ -329,7 +327,7 @@ def __call__(self, X, y, s):
             - y (tf.Tensor): Output data
             - s (float): Sparsity
 
-        Return:
+        Returns:
             - loss (tf.Varilable): Model loss with input X and output y
         '''
         grads = []
diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/keras/builder.py
index 9a3bec8d01..f265ccdf48 100644
--- a/hls4ml/optimization/keras/builder.py
+++ b/hls4ml/optimization/keras/builder.py
@@ -18,17 +18,19 @@
 class HyperOptimizationModel(kt.HyperModel):
     '''
     Helper class for Keras Tuner
-
-    Args:
-        - model (keras.Model): Baseline model
-        - attributes (dict): Layer-wise dictionary of attributes
-        - optimizer (keras.optimizers.Optimizer or equvialent string description): Model optimizer
-        - loss_fn (keras.losses.Loss or equivalent string description): Model loss function
-        - validation_metric (keras.metrics.Metric or equivalent string description): Model validation metric
-        - regularization_range (list): List of suitable hyperparameters for weight decay
     '''
 
     def __init__(self, model, attributes, optimizer, loss_fn, validation_metric, regularization_range):
+        """Create new instance of HyperOptimizationModel
+
+        Args:
+            model (keras.Model): Baseline model
+            attributes (dict): Layer-wise dictionary of attributes
+            optimizer (keras.optimizers.Optimizer or equivalent string description): Model optimizer
+            loss_fn (keras.losses.Loss or equivalent string description): Model loss function
+            validation_metric (keras.metrics.Metric or equivalent string description): Model validation metric
+            regularization_range (list): List of suitable hyperparameters for weight decay
+        """
         self.model = model
         self.attributes = attributes
         self.optimizer = optimizer
@@ -112,32 +114,33 @@ def build_optimizable_model(
     '''
     Function identifying optimizable layers and adding a regularization loss
 
-    Args:
-    - model (keras.Model): Model to be optimized
-    - attributes (dict): Layer-wise model attributes, obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
-    - optimizer (keras.optimizers.Optimizer): Optimizer used during training
-    - loss_fn (keras.losses.Loss): Loss function used during training
-    - validation_metric (keras.metrics.Metric): Validation metric, used as a baseline
-    - train_dataset (tf.Dataset): Training inputs and labels, in the form of an iterable TF Dataset
-    - validation_dataset (tf.Dataset): Validation inputs and labels, in the form of an iterable TF Dataset
-    - batch_size (int): Batch size during training
-    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
-
-    Kwargs:
-    - verbose (bool): Whether to log tuner outputs to the console
-    - directory (string): Directory to store tuning results
-    - tuner (str): Tuning alogorithm, choose between Bayesian and Hyperband
-    - regularization_range (list): List of suitable hyperparameters for weight decay
-    - learning_rate_range (list): List of suitable hyperparameters for learning rate
-
     Notes:
     - In general, the regularization and learning rate ranges do not need to be provided,
-        as the implementation sets a generic enough range. if the user has an idea on the
-        possible range on hyperparameter ranges, the tuning will complete faster.
+    as the implementation sets a generic enough range. if the user has an idea on the
+    possible range on hyperparameter ranges, the tuning will complete faster.
     - The default tuner is Bayesian & when coupled with the correct ranges of hyperparameters,
-        it performs quite well, fast. However, older version of Keras Tuner had a crashing bug with it.
+    it performs quite well, fast. However, older version of Keras Tuner had a crashing bug with it.
     - In general, the directory does not need to be specified. However, if pruning several models simultaneously,
-        to avoid conflicting intermediate results, it is useful to specify directory.
+    to avoid conflicting intermediate results, it is useful to specify directory.
+
+    Args:
+        model (keras.Model): Model to be optimized
+        attributes (dict): Layer-wise model attributes, obtained from hls4ml.optimization.get_attributes_from_keras_model()
+        optimizer (keras.optimizers.Optimizer): Optimizer used during training
+        loss_fn (keras.losses.Loss): Loss function used during training
+        validation_metric (keras.metrics.Metric): Validation metric, used as a baseline
+        train_dataset (tf.Dataset): Training inputs and labels, in the form of an iterable TF Dataset
+        validation_dataset (tf.Dataset): Validation inputs and labels, in the form of an iterable TF Dataset
+        batch_size (int): Batch size during training
+        epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+        verbose (bool): Whether to log tuner outputs to the console
+        directory (string): Directory to store tuning results
+        tuner (str): Tuning algorithm, choose between Bayesian and Hyperband
+        regularization_range (list): List of suitable hyperparameters for weight decay
+        learning_rate_range (list): List of suitable hyperparameters for learning rate
+
+    Returns:
+        keras.Model: Model prepared for optimization
     '''
     # User provided manual hyper-parameters for regularisation loss
     # TODO - Maybe we could extend this to be hyper-parameters per layer? or layer-type?
@@ -239,7 +242,10 @@ def remove_custom_regularizers(model):
     This makes it possible to load the model in a different environment without hls4ml installed
 
     Args:
-        - model (keras.Model): Baseline model
+        model (keras.Model): Baseline model
+
+    Returns:
+        keras.Model: Model without custom regularizers
     '''
     weights = model.get_weights()
     for layer in model.layers:
diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/keras/masking.py
index bf673b9154..0e74997be8 100644
--- a/hls4ml/optimization/keras/masking.py
+++ b/hls4ml/optimization/keras/masking.py
@@ -42,22 +42,24 @@ def get_model_masks(
     Both the norm and variance are normalized, to avoid magnitude biases.
 
     Args:
-        - keras_model (keras.model) - Model to be masked
-        - model_attributes (dict) - A layer-wise dictionary of LayerAttributes classes
-        - sparsity (float) - Desired sparsity, with respect to the objective
-        - objective (ObjectiveEstimator) - Objective to be minimized (e.g. DSP, FLOPs etc.)
-        - metric (string) - Weight ranking metric - l1, l2, Oracle, saliency
-        - local (boolean) - Equal layer-wise sparsity
-        - gradients (dict) - A layer-wise dictionary of weight gradients
+        keras_model (keras.model): Model to be masked
+        model_attributes (dict): A layer-wise dictionary of LayerAttributes classes
+        sparsity (float): Desired sparsity, with respect to the objective
+        objective (ObjectiveEstimator): Objective to be minimized (e.g. DSP, FLOPs etc.)
+        metric (string): Weight ranking metric - l1, l2, Oracle, saliency
+        local (boolean): Equal layer-wise sparsity
+        gradients (dict): A layer-wise dictionary of weight gradients
             (needed for Oracle ranking)
-        - hessians (dict) - A layer-wise dictionary of second gradients
+        hessians (dict): A layer-wise dictionary of second gradients
             (needed for saliency ranking)
-        - knapsack_solver (str) - Algorithm for solving Knapsack problem; recommended is to use default.
+        knapsack_solver (str): Algorithm for solving Knapsack problem; recommended is to use default.
             Unless dealing with highly dimensional problems, in which case greedy is better.
 
-    Return:
-        - masks (dict) - Layer-wise dictionary of binary tensors
-        - offsets (dict) - Layer-wise dictionary of offsets for every weight
+    Returns:
+        tuple containing
+
+        - masks (dict): Layer-wise dictionary of binary tensors
+        - offsets (dict): Layer-wise dictionary of offsets for every weight
     '''
 
     if metric not in SUPPORTED_METRICS:
diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/keras/reduction.py
index 899f7f350f..4ea8855aa8 100644
--- a/hls4ml/optimization/keras/reduction.py
+++ b/hls4ml/optimization/keras/reduction.py
@@ -4,24 +4,23 @@
 
 from hls4ml.optimization.keras.utils import get_last_layer_with_weights
 
-'''
-Function for removing zero neurons & filters from a model and rewiring the model graph
-This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon
-Keras Surgeon is no longer under active development and does not work for TensorFlow 2.3+ and QKeras
-The baseline version was forked and updated, available at: https://github.com/fastmachinelearning/keras-surgeon
 
-IMPORTANT: To use this funcionality please install separately from the above GitHub.
-
-Args:
-    - model (keras.model): Input model
+def reduce_model(model):
+    '''
+    Function for removing zero neurons & filters from a model and rewiring the model graph
+    This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon
+    Keras Surgeon is no longer under active development and does not work for TensorFlow 2.3+ and QKeras
+    The baseline version was forked and updated, available at: https://github.com/fastmachinelearning/keras-surgeon
 
-Return:
-    - reduced (keras.model): Modified model, with redundant structures removed
+    IMPORTANT: To use this funcionality please install separately from the above GitHub.
 
-    '''
+    Args:
+        model (keras.model): Input model
 
+    Returns:
+        reduced (keras.model): Modified model, with redundant structures removed
 
-def reduce_model(model):
+    '''
     try:
         from kerassurgeon import Surgeon
     except ModuleNotFoundError:
diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/keras/regularizers.py
index fd8a2f3656..1e885963c2 100644
--- a/hls4ml/optimization/keras/regularizers.py
+++ b/hls4ml/optimization/keras/regularizers.py
@@ -10,25 +10,25 @@ class DenseRegularizer(tf.keras.regularizers.Regularizer):
     A flexible regularizer for Dense layers, simultaneously penalizing high values and variance
 
     Args:
-        - alpha (float): Sparse penalty; a higher value pushes more weights towards zero
-        - beta (float): Variance penalty; a higer value reduces variance between a group of weights
-        - norm (int): Norm type (l1 or l2)
-        - structure_type (string): Type of regularisation - unstructured, structured, pattern, block
-        - block_shape (tuple): Block shape if structure_type == block
-        - pattern_offset (int): Length of each pattern if structure_type == pattern
-        - consecutive_patterns (int): How many consecutive patterns should be considered
-        - weights (tf.Variable): Two-dimensional layer weight tensor, dimensionality (M x N)
-
-    Return:
-        - Regularizer penalty (tf.Variable): Penalty associated with layer weights
+        alpha (float): Sparse penalty; a higher value pushes more weights towards zero
+        beta (float): Variance penalty; a higher value reduces variance between a group of weights
+        norm (int): Norm type (l1 or l2)
+        structure_type (string): Type of regularization - unstructured, structured, pattern, block
+        block_shape (tuple): Block shape if structure_type == block
+        pattern_offset (int): Length of each pattern if structure_type == pattern
+        consecutive_patterns (int): How many consecutive patterns should be considered
+        weights (tf.Variable): Two-dimensional layer weight tensor, dimensionality (M x N)
+
+    Returns:
+        Regularizer penalty (tf.Variable): Penalty associated with layer weights
 
     Examples:
-        - structure_type = unstructured: unstructured weight regularisation
+        - structure_type = unstructured: unstructured weight regularization
         - structure_type = structured: neuron regularization
             (group weights by row)
         - structure_type = pattern: regularization on groups of every n-th weight
             (e.g. grouping by reuse factor in hls4ml)
-        - structure_type = block: regularisation on blocks within weight matrix
+        - structure_type = block: regularization on blocks within weight matrix
             (e.g. 4x4, 8x1 for certain SIMD processors)
 
         - consecutive_patterns is commonly encountered with optimization of BRAM utilization -
@@ -78,7 +78,7 @@ def __call__(self, weights):
             return sparse_penalty + variance_penalty
 
         if self.structure_type == SUPPORTED_STRUCTURES.PATTERN:
-            # This is equivalent to penalising all the weights processed by the same DSP block in hls4ml.
+            # This is equivalent to penalizing all the weights processed by the same DSP block in hls4ml.
             # The matrix is transposed, according to Resource strategy and reshaped into (pattern_offset, pattern_number)
             # Pattern offset corresponds to the number of patterns is equivalent to RF
             if (np.prod(weights.shape)) % self.pattern_offset != 0:
@@ -115,7 +115,7 @@ def __call__(self, weights):
             if (weights.shape[0] % self.block_shape[0]) != 0 or (weights.shape[1] % self.block_shape[1] != 0):
                 raise Exception(f'{self.__class__.__name__}: block sizes need to be fators of weight matrix dimensions')
 
-            # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
+            # TensorFlow has a built-in method for extracting sub-tensors of given shape and stride
             # This method is commonly used to perform im2col,
             # Docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
             total_blocks = (weights.shape[0] * weights.shape[1]) // (self.block_shape[0] * self.block_shape[1])
@@ -149,22 +149,22 @@ def get_config(self):
 @tf.keras.utils.register_keras_serializable(name='Conv2DRegularizer')
 class Conv2DRegularizer(tf.keras.regularizers.Regularizer):
     '''
-    A flexible regulariser for Conv2D layers, simultaneously performing pruning and clustering
+    A flexible regularizer for Conv2D layers, simultaneously performing pruning and clustering
 
     Args:
-        - alpha (float): Sparse penalty; a higher value pushes more weights towards zero
-        - beta (float): Variance penalty; a higer value reduces variance between a group of weights
-        - norm (int): Norm type (l1 or l2)
-        - structure_type (string): Type of regularisation - unstructured, structured, pattern
-        - pattern_offset (int): Length of each pattern if structure_type == pattern
-        - weights (tf.Variable): Four-dimensional layer weight tensor, dimensionality
-        (filter_width x filter_height x n_chan x n_filt)
+        alpha (float): Sparse penalty; a higher value pushes more weights towards zero
+        beta (float): Variance penalty; a higher value reduces variance between a group of weights
+        norm (int): Norm type (l1 or l2)
+        structure_type (string): Type of regularization - unstructured, structured, pattern
+        pattern_offset (int): Length of each pattern if structure_type == pattern
+        weights (tf.Variable): Four-dimensional layer weight tensor, dimensionality
+            (filter_width x filter_height x n_chan x n_filt)
 
-    Return:
-        - Regularizer penalty (tf.Variable): Penalty associated with layer weights
+    Returns:
+        Regularizer penalty (tf.Variable): Penalty associated with layer weights
 
     Example use cases:
-        - structure_type = unstructured: unstructured weight regularisation
+        - structure_type = unstructured: unstructured weight regularization
         - structure_type = structured: filter regularization
             (group weights of dimensionality filt_width x filt_height x n_chan)
         - structure_type = pattern: regularization on groups of every n-th weight in flattened array
diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/keras/utils.py
index eaf34f41fc..bf10f6feb0 100644
--- a/hls4ml/optimization/keras/utils.py
+++ b/hls4ml/optimization/keras/utils.py
@@ -8,13 +8,13 @@ def get_model_gradients(model, loss_fn, X, y):
     Calculate model gradients with respect to weights
 
     Args:
-        - model (keras.model): Input model
-        - loss_fn (keras.losses.Loss): Model loss function
-        - X (np.array): Input data
-        - y (np.array): Output data
+        model (keras.model): Input model
+        loss_fn (keras.losses.Loss): Model loss function
+        X (np.array): Input data
+        y (np.array): Output data
 
-    Return:
-        - grads (dict): Per-layer gradients of loss with respect to weights
+    Returns:
+        grads (dict): Per-layer gradients of loss with respect to weights
     '''
     grads = {}
     # While persistent GradientTape slows down execution,
@@ -33,17 +33,18 @@ def get_model_gradients(model, loss_fn, X, y):
 @tf.function
 def get_model_hessians(model, loss_fn, X, y):
     '''
-    Calculate the second derivatives of the loss with repsect to model weights
-    Note, only diagonal elements of the Hessian are computed
+    Calculate the second derivatives of the loss with repsect to model weights.
+
+    Note, only diagonal elements of the Hessian are computed.
 
     Args:
-        - model (keras.model): Input model
-        - loss_fn (keras.losses.Loss): Model loss function
-        - X (np.array): Input data
-        - y (np.array): Output data
+        model (keras.model): Input model
+        loss_fn (keras.losses.Loss): Model loss function
+        X (np.array): Input data
+        y (np.array): Output data
 
-    Return:
-        - grads (dict): Per-layer second derivatives of loss with respect to weights
+    Returns:
+        grads (dict): Per-layer second derivatives of loss with respect to weights
     '''
     grads = {}
     with tf.GradientTape(persistent=True) as tape:
@@ -64,7 +65,9 @@ def get_model_sparsity(model):
     Args:
         - model (keras.model): Model to be evaluated
 
-    Return:
+    Returns:
+        tuple containing
+
         - sparsity (float): Model sparsity, as a percentage of zero weights w.r.t to total number of model weights
         - layers (dict): Key-value dictionary; each key is a layer name and the associated value is the layer's sparsity
 
@@ -92,14 +95,16 @@ def get_model_sparsity(model):
 def get_last_layer_with_weights(model):
     '''
     Finds the last layer with weights
-    The last layer with weights determined the output shape, so, pruning is sometimes not applicable to it
+
+    The last layer with weights determined the output shape, so, pruning is sometimes not applicable to it.
     As an example, consider a network with 16 - 32 - 5 neurons - the last layer's neuron (5) cannot be removed
-    Since they map to the data labels
+    since they map to the data labels
+
     Args:
-        - model (keras.model): Input model
+        model (keras.model): Input model
 
-    Return:
-        - idx (int): Index location of last layer with params
+    Returns:
+        idx (int): Index location of last layer with params
     '''
     for idx, layer in reversed(list(enumerate(model.layers))):
         if hasattr(layer, 'kernel'):
diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/knapsack.py
index e198945b1b..541222dc63 100644
--- a/hls4ml/optimization/knapsack.py
+++ b/hls4ml/optimization/knapsack.py
@@ -13,13 +13,13 @@ def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs
         - weights (np.array, int): An matrix, each row represents the weights of every item, in a given knapsack
         - capacity (np.array, int): A one-dimensional array, each entry is the maximum weights of a Knapsack
         - implementation (string): Algorithm to solve Knapsack problem - dynamic programming, greedy, branch and bound
-
-    Kwargs:
         - time_limit (float): Limit (in seconds) after which the CBC or Branch & Bound should
             stop looking for a solution and return optimal so far
         - scaling_factor (float): Scaling factor for floating points values in CBC or B&B
 
-    Return:
+    Returns:
+        tuple containing
+
         - optimal_value (float): The optimal values of elements in the knapsack
         - selected_items (list): A list of indices, corresponding to the selected elements
 
diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/objectives/__init__.py
index 05d916d85f..8ab4535060 100644
--- a/hls4ml/optimization/objectives/__init__.py
+++ b/hls4ml/optimization/objectives/__init__.py
@@ -7,7 +7,7 @@
 from hls4ml.optimization.config import SUPPORTED_STRUCTURES
 
 '''
-Pruning & weight sharing are formulated as an optimization problem, with the aim of minimising some metric
+Pruning & weight sharing are formulated as an optimization problem, with the aim of minimizing some metric
 Metrics can include: total number of weights, DSP utilization, latency, FLOPs etc.
 '''
 
@@ -15,7 +15,7 @@
 class ObjectiveEstimator(ABC):
     '''
     Abstract class with methods for estimating the utilization and savings of a certain layer, with respect to some objective
-    For each objective, an inherited class is written with the correct implementaton of the below methods
+    For each objective, an inherited class is written with the correct implementation of the below methods
     The objectives can be multi-dimensional, e.g. DSPs and BRAM
     Care needs to be taken when optimizing several objectives, especially if conflicting
     '''
@@ -25,14 +25,14 @@ def is_layer_optimizable(self, layer_attributes):
         '''
         For a given layer, checks whether optimizations make sense, with respect to the given objective(s)
         Furthermore, it returns the type of optimization (structured, unstructured etc.)
-        Most suitable for minimising the objective(s).
+        Most suitable for minimizing the objective(s).
 
         Args:
-            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+            layer_attributes (hls4ml.optimization.attributes.LayerAttributes): Layer attributes
 
-        Return:
-            - optimizable (boolean) - can optimizations be applied to this layer
-            - optimization_attributes (hls4ml.optimiation.attributes.OptimizationAttributes) -
+        Returns:
+            optimizable (boolean): can optimizations be applied to this layer
+            optimization_attributes (hls4ml.optimization.attributes.OptimizationAttributes):
                 Most suitable approach for optimization
 
         Examples:
@@ -50,12 +50,13 @@ def layer_resources(self, layer_attributes):
         For a given layer, how many units of the metric are used, given a generic weight matrix
 
         Args:
-            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+            layer_attributes (hls4ml.optimization.attributes.LayerAttributes): Layer attributes
 
-        Return:
-            - resources (list, int) - total resources (w.r.t every dimension of the objective) used
+        Returns:
+            resources (list, int): total resources (w.r.t every dimension of the objective) used
 
-        Example: Metric = Total weights, Layer = Dense, shape = (4, 4) -> return [16] [regardless of layer sparsity]
+        Example:
+            Metric = Total weights, Layer = Dense, shape = (4, 4) -> return [16] [regardless of layer sparsity]
         '''
         pass
 
@@ -67,26 +68,26 @@ def layer_savings(self, layer_attributes):
         For best results, OptimizationAttributes in layer_attribtues should be obtained from is_layer_optimizable
 
         Args:
-            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+            layer_attributes (hls4ml.optimization.attributes.LayerAttributes): Layer attributes
 
-        Return:
-            - savings (list, int) - savings achieved (one for every dimenson of objective)
-                                    With OptimizationAttributes from layer_attributes
+        Returns:
+            savings (list, int): savings achieved (one for every dimension of objective)
+                With OptimizationAttributes from layer_attributes
 
-        Example: Metric = Total weights, Layer = Dense, shape = (4, 4):
+        Example:
+            Metric = Total weights, Layer = Dense, shape = (4, 4):
             - structure_type == unstructured -> return [1]
             - structure_type == structured -> return [4]
         '''
         pass
 
 
-'''
-A class containing objective estimation with the goal of minimizing
-The number of non-zero weights in a layer [corresponds to unstructured pruning]
-'''
-
-
 class ParameterEstimator(ObjectiveEstimator):
+    '''
+    A class containing objective estimation with the goal of minimizing
+    The number of non-zero weights in a layer [corresponds to unstructured pruning]
+    '''
+
     @classmethod
     def is_layer_optimizable(self, layer_attributes):
         if not layer_attributes.weight_shape:
diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/scheduler.py
index 347427d5dd..a371ef85fb 100644
--- a/hls4ml/optimization/scheduler.py
+++ b/hls4ml/optimization/scheduler.py
@@ -21,27 +21,37 @@ def __init__(self, initial_sparsity=0, final_sparsity=1):
     @abstractmethod
     def update_step(self):
         '''
-        Increments the current sparsity, according to the rule, examples:
+        Increments the current sparsity, according to the rule.
+
+        Examples:
             - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55
             - BinaryScheduler, sparsity = 0.5, target = 1.0 -> sparsity = 0.75
 
-        Return:
+        Returns:
+            tuple containing
+
             - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
             - sparsity (float) - Updated sparsity
+
         '''
         pass
 
     @abstractmethod
     def repair_step(self):
         '''
-        Method used when the neural architecture does not meet satisfy performance requirement for a given sparsity
-        Then, the target sparsity is decreased according to the rule, examples:
+        Method used when the neural architecture does not meet satisfy performance requirement for a given sparsity.
+        Then, the target sparsity is decreased according to the rule.
+
+        Examples:
             - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55 [see ConstantScheduler for explanation]
             - BinaryScheduler, sparsity = 0.75, target = 1.0, previous = 0.5 -> sparsity = (0.5 + 0.75) / 2 = 0.625
 
-        Return:
+        Returns:
+            tuple containing
+
             - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
             - sparsity (float) - Updated sparsity
+
         '''
         pass
 
@@ -116,10 +126,17 @@ class PolynomialScheduler(OptimizationScheduler):
     Sparsity updated by at a polynomial decay, until
         (i) sparsity target reached OR
         (ii) optimization algorithm stops requesting state updates
+
     For more information, see Zhu & Gupta (2016) -
         'To prune, or not to prune: exploring the efficacy of pruning for model compression'
-    Note, the implementation is slightly different,
-    Since TensorFlow Prune API depends on the total number of epochs and update frequency
+
+    Note, the implementation is slightly different, since TensorFlow Prune API depends on the total number of
+    epochs and update frequency.
+
+    In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity.
+    In this case, polynomial sparsity will simply jump to the next sparsity level
+    The model's performance over several sparsity levels optimization is tracked and
+    toped after high loss over several trials (see top level pruning/optimization function)
     '''
 
     def __init__(self, maximum_steps, initial_sparsity=0, final_sparsity=1.0, decay_power=3):
@@ -138,12 +155,5 @@ def update_step(self):
         else:
             return False, self.sparsity
 
-    '''
-    In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
-    In this case, polynomial sparsity, will simply jump to the next sparsity level
-    The model's performance over several sparsity levels optimization is tracked and
-    toped after high loss over several trials (see top level pruning/optimization function)
-    '''
-
     def repair_step(self):
         return self.update_step()

From 4675607aa82297990923e51128966643ccf019c5 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar <vloncar@users.noreply.github.com>
Date: Mon, 29 Jan 2024 19:04:30 +0100
Subject: [PATCH 17/19] Fix docstring in ObjectiveEstimator

---
 hls4ml/optimization/objectives/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/objectives/__init__.py
index 8ab4535060..fcbef305b6 100644
--- a/hls4ml/optimization/objectives/__init__.py
+++ b/hls4ml/optimization/objectives/__init__.py
@@ -31,8 +31,10 @@ def is_layer_optimizable(self, layer_attributes):
             layer_attributes (hls4ml.optimization.attributes.LayerAttributes): Layer attributes
 
         Returns:
-            optimizable (boolean): can optimizations be applied to this layer
-            optimization_attributes (hls4ml.optimization.attributes.OptimizationAttributes):
+            tuple containing
+
+            - optimizable (boolean): can optimizations be applied to this layer
+            - optimization_attributes (hls4ml.optimization.attributes.OptimizationAttributes):
                 Most suitable approach for optimization
 
         Examples:

From 8503c867c4301a5a29981a3b45689c7046bd3edd Mon Sep 17 00:00:00 2001
From: Vladimir Loncar <vloncar@users.noreply.github.com>
Date: Mon, 29 Jan 2024 19:04:54 +0100
Subject: [PATCH 18/19] Add optimization API paper to reference.rst

---
 docs/reference.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/reference.rst b/docs/reference.rst
index c9f3493437..1b25d12d65 100644
--- a/docs/reference.rst
+++ b/docs/reference.rst
@@ -86,6 +86,19 @@ binary/ternary networks:
         year = "2021"
     }
 
+optimization API:
+
+..  code-block:: bibtex
+
+    @article{Ramhorst:2023fpga,
+      author = "Benjamin Ramhorst and others",
+      title = "{FPGA Resource-aware Structured Pruning for Real-Time Neural Networks}",
+      eprint = "2308.05170",
+      archivePrefix = "arXiv",
+      primaryClass = "cs.AR",
+      year = "2023"
+    }
+
 Acknowledgments
 ===============
 If you benefited from participating in our community, we ask that you please acknowledge the Fast Machine Learning collaboration, and particular individuals who helped you, in any publications.

From 6eb391f3cd8d20f1face2190017c0d47097e9578 Mon Sep 17 00:00:00 2001
From: Vladimir Loncar <vloncar@users.noreply.github.com>
Date: Mon, 29 Jan 2024 19:18:00 +0100
Subject: [PATCH 19/19] Rename optimize_keras_for_hls4ml

---
 docs/advanced/model_optimization.rst | 8 ++++----
 hls4ml/optimization/__init__.py      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
index 14813f7217..a75224b8cc 100644
--- a/docs/advanced/model_optimization.rst
+++ b/docs/advanced/model_optimization.rst
@@ -100,9 +100,9 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
     from hls4ml.utils.config import config_from_keras_model
     from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator
 
-    # Note the change from optimize_model to optimize_keras_for_hls4ml
-    # The function optimize_keras_for_hls4ml acts as a wrapper for the function, parsing hls4ml config to model attributes
-    from hls4ml.optimization import optimize_keras_for_hls4ml
+    # Note the change from optimize_model to optimize_keras_model_for_hls4ml
+    # The function optimize_keras_model_for_hls4ml acts as a wrapper for the function, parsing hls4ml config to model attributes
+    from hls4ml.optimization import optimize_keras_model_for_hls4ml
 
     # Create hls4ml config
     default_reuse_factor = 4
@@ -113,7 +113,7 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
 
     # Optimize model
     # Note the change from ParameterEstimator to VivadoDSPEstimator
-    optimized_model = optimize_keras_for_hls4ml(
+    optimized_model = optimize_keras_model_for_hls4ml(
         baseline_model, model_attributes, VivadoDSPEstimator, scheduler,
         X_train, y_train, X_val, y_val, batch_size, epochs,
         optimizer, loss_fn, metric, increasing, rtol
diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
index da913f7dcb..ab51ce1eb3 100644
--- a/hls4ml/optimization/__init__.py
+++ b/hls4ml/optimization/__init__.py
@@ -6,7 +6,7 @@
 default_regularization_range = np.logspace(-6, -2, num=16).tolist()
 
 
-def optimize_keras_for_hls4ml(
+def optimize_keras_model_for_hls4ml(
     keras_model,
     hls_config,
     objective,