From be1503a0c78fd4c4d903b1ffbf61964659725bb6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 3 Jan 2023 15:37:42 +0000
Subject: [PATCH 001/123] First changes to custom_op for RTL-based MVAU

---
 .../matrixvectoractivation_rtl.py             | 1036 +++++++++++++++++
 1 file changed, 1036 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
new file mode 100644
index 0000000000..c8a0aa675b
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -0,0 +1,1036 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+from . import templates
+
+# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
+# input 0 is the input tensor, shape (..., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (..., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MatrixVectorActivation_rtl(HLSCustomOp):
+    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
+    function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)  # forward the ONNX node to HLSCustomOp
+        self.decoupled_wrapper = templates.decoupled_wrapper
+
+    def get_nodeattr_types(self):
+        """Return this op's attribute definitions, updated with the base class's."""
+        my_attrs = {
+            "PE": ("i", True, 0),
+            "SIMD": ("i", True, 0),
+            "MW": ("i", True, 0),
+            "MH": ("i", True, 0),
+            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
+            "ActVal": ("i", False, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # FINN DataType for accumulator -- auto-computed and updated
+            "accDataType": ("s", False, "INT32"),
+            # use xnor-popcount for binary weights/inputs, treating them as bipolar
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
+            # no-activation mode (produce accumulators); calc_tmem() asserts it is 1
+            "noActivation": ("i", False, 0, {0, 1}),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+            # memory mode for the weights; note that generate_params() of this
+            # class only accepts "decoupled"
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # FPGA resource type for threshold memories (if noActivation is False)
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            "ram_style_thresholds": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed"},
+            ),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime (1 = enabled, 0 = disabled).
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_wmem(self):
+        """Return WMEM = MW * MH // (PE * SIMD).
+
+        Asserts that PE divides MH and that SIMD divides MW."""
+        mw, mh = self.get_nodeattr("MW"), self.get_nodeattr("MH")
+        pe, simd = self.get_nodeattr("PE"), self.get_nodeattr("SIMD")
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        folding = pe * simd
+        return mw * mh // folding
+
+    def calc_tmem(self):
+        """Return TMEM, always 0 here; asserts that noActivation is 1."""
+        assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer"
+        return 0
+
+    def make_shape_compatible_op(self, model):
+        """Delegate to make_const_shape_op with this node's normal output shape."""
+        return super().make_const_shape_op(self.get_normal_output_shape())
+
+    def infer_node_datatype(self, model):
+        """Copy the model's input datatype into inputDataType and annotate the output."""
+        node = self.onnx_node
+        model_idt = model.get_tensor_datatype(node.input[0])
+        attr_idt = self.get_input_datatype()
+        if model_idt != attr_idt:
+            # the model annotation overrides the stored attribute; warn about it
+            warnings.warn(
+                "inputDataType changing for %s: %s -> %s "
+                % (node.name, str(attr_idt), str(model_idt))
+            )
+        self.set_nodeattr("inputDataType", model_idt.name)
+        # the output datatype is taken from the outputDataType attribute
+        model.set_tensor_datatype(node.output[0], self.get_output_datatype())
+
+    def verify_node(self):
+        """Return a list of messages describing which node checks passed or failed."""
+        info_messages = []
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("resType")
+            self.get_nodeattr("MW")
+            self.get_nodeattr("MH")
+            self.get_nodeattr("SIMD")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            self.get_nodeattr("weightDataType")
+            self.get_nodeattr("outputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required MatrixVectorActivation attributes do not exist."""
+            )
+
+        # the noActivation value determines which input-count check applies
+        no_act = self.get_nodeattr("noActivation")
+
+        if no_act == 1:
+            if len(self.onnx_node.input) == 2:
+                info_messages.append("The number of inputs is correct")
+            else:
+                info_messages.append(
+                    """RTL-based MatrixVectorActivation needs in no
+                            activation mode 2 inputs (data input and weights)"""
+                )
+        elif no_act == 0:
+            info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer")
+        else:
+            info_messages.append(
+                """noActivation attribute contains {} should
+                be 1 for RTL-based MatrixVectorActivation""".format(
+                    no_act
+                )
+            )
+
+        mem_mode = self.get_nodeattr("mem_mode")
+
+        if mem_mode != "decoupled":
+            info_messages.append("RTL-based MVAU supports only decoupled weights currently")
+
+        return info_messages
+
+    def uram_estimation(self):
+        """Estimate the URAM count for the weights (72-bit wide, 4096-deep units)."""
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        rows = self.get_nodeattr("MW")
+        cols = self.get_nodeattr("MH")
+        # weight memory geometry: depth in words, word width in bits
+        depth = (rows * cols) / (simd * pe)
+        width = simd * weight_bits * pe
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        # the cases below are counted as using no URAM
+        if mem_mode == "decoupled" and ram_style != "ultra":
+            return 0
+        if mem_mode == "const" and self.calc_wmem() <= 128:
+            return 0
+        if mem_mode == "external":
+            return 0
+        return math.ceil(width / 72) * math.ceil(depth / 4096)
+
+    def bram_estimation(self):
+        """Estimate the number of BRAMs needed for the weights.
+
+        The model follows "FINN-R: An End-to-End Deep-Learning Framework for
+        Fast Exploration of Quantized Neural Networks" (M. Blott, T. B. Preusser,
+        N. J. Fraser, G. Gambardella, K. O'Brien, Y. Umuroglu, M. Leeser and
+        K. Vissers, 12. Sep 2018).
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        rows = self.get_nodeattr("MW")
+        cols = self.get_nodeattr("MH")
+        depth = (rows * cols) / (simd * pe)
+        width = simd * weight_bits * pe
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        if mem_mode == "decoupled" and ram_style in ["distributed", "ultra"]:
+            return 0
+        if mem_mode == "const" and self.calc_wmem() <= 128:
+            return 0
+        if mem_mode == "external":
+            return 0
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        if width == 1:
+            words, bits = 16384, 1
+        elif width == 2:
+            words, bits = 8192, 2
+        elif width <= 4:
+            words, bits = 4096, 4
+        elif width <= 9:
+            words, bits = 2048, 9
+        elif width <= 18 or depth > 512:
+            words, bits = 1024, 18
+        else:
+            words, bits = 512, 36
+        return math.ceil(depth / words) * math.ceil(width / bits)
+
+    def bram_efficiency_estimation(self):
+        """Return the weight bits divided by the capacity of the estimated BRAMs."""
+        weight_bits = self.get_weight_datatype().bitwidth()
+        rows = self.get_nodeattr("MW")
+        cols = self.get_nodeattr("MH")
+        n_bram = self.bram_estimation()
+        if n_bram == 0:
+            # no BRAM estimated: reported as efficiency 1
+            return 1
+        # each estimated BRAM is counted as 36 bits x 512 words
+        return (weight_bits * rows * cols) / (n_bram * 36 * 512)
+
+    def uram_efficiency_estimation(self):
+        """Return the URAM efficiency: the weight storage actually needed,
+        divided by the capacity of the URAMs counted by uram_estimation()."""
+        weight_bits = self.get_weight_datatype().bitwidth()
+        rows = self.get_nodeattr("MW")
+        cols = self.get_nodeattr("MH")
+        n_uram = self.uram_estimation()
+        if n_uram == 0:
+            # no URAM estimated: reported as efficiency 1
+            return 1
+        # each estimated URAM is counted as 72 bits x 4096 words
+        needed_bits = weight_bits * rows * cols
+        return needed_bits / (n_uram * 72 * 4096)
+
+#TODO: FIX
+    def lut_estimation(self):
+        """Estimate the LUT count of this layer.
+
+        The cost model follows "FINN-R: An End-to-End Deep-Learning Framework for
+        Fast Exploration of Quantized Neural Networks" (M. Blott, T. B. Preusser,
+        N. J. Fraser, G. Gambardella, K. O'Brien, Y. Umuroglu, M. Leeser and
+        K. Vissers, 12. Sep 2018).
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        mw = self.get_nodeattr("MW")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        act_bits = self.get_input_datatype().bitwidth()
+        operand_bits = weight_bits + act_bits
+        # parameters from experiments in paper mentioned above
+        base_luts = 300
+        scale = 1.1
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        # LUT cost of the weight memory; only counted for distributed RAM in
+        # decoupled mode or for small (WMEM <= 128) memories in const mode
+        in_luts = (mem_mode == "decoupled" and ram_style == "distributed") or (
+            mem_mode == "const" and self.calc_wmem() <= 128
+        )
+        if in_luts:
+            mem_luts = (pe * simd * weight_bits) * math.ceil(self.calc_wmem() / 64)
+        else:
+            mem_luts = 0
+
+        # multiplication: counted as zero LUTs when resType is "dsp"
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = simd * (2 * math.ceil(operand_bits / 6) - 1) * operand_bits
+        # adder tree
+        addertree_luts = operand_bits * (2 * simd - 1)
+        # accumulator
+        acc_bits = operand_bits + np.ceil(math.log(mw, 2))
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = comp_luts = 0
+        if self.get_nodeattr("noActivation") == 0:
+            levels = 2 ** self.get_output_datatype().bitwidth() - 1
+            thr_luts = levels * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = levels * acc_bits
+
+        per_pe_luts = mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts
+        return int(
+            base_luts
+            + scale * (pe * per_pe_luts)
+            + mem_luts
+        )
+
+#TODO: FIX
+    def dsp_estimation(self):
+        """Estimate the DSP count: PE * SIMD * ceil((W + A) / 48) for resType
+        "dsp" (W and A being the weight and input bit widths), otherwise 0."""
+        pe = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        act_bits = self.get_input_datatype().bitwidth()
+        if res_type != "dsp":
+            # multiplication is not mapped to DSPs
+            return 0
+        # TODO: more accurate modelling
+        dsps_per_mult = np.ceil((weight_bits + act_bits) / 48)
+        return int(pe * simd * dsps_per_mult)
+
+#TODO: FIX
+    def get_exp_cycles(self):
+        """Return the expected cycles: (MH / PE) * (MW / SIMD) per input vector."""
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        n_vecs = np.prod(self.get_nodeattr("numInputVectors"))
+        neuron_fold = self.get_nodeattr("MH") / pe
+        synapse_fold = self.get_nodeattr("MW") / simd
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        return int(neuron_fold * synapse_fold * n_vecs / mmv)
+
+    def get_input_datatype(self, ind=0):
+        """Return the FINN DataType of input `ind` (0: data input, 1: weights)."""
+        # ind can be 1 (weights) during FIFO insertion on a layer with ext weights
+        if ind == 0:
+            attr_name = "inputDataType"
+        elif ind == 1:
+            attr_name = "weightDataType"
+        else:
+            raise Exception("Undefined input ind for this layer type")
+        return DataType[self.get_nodeattr(attr_name)]
+
+    def get_weight_datatype(self):
+        """Return the FINN DataType named by the weightDataType attribute."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Return the FINN DataType named by outputDataType (`ind` is not used)."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        """Return the input stream width in bits: input bit width times SIMD."""
+        act_bits = self.get_input_datatype().bitwidth()
+        assert act_bits <= 9, "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        return act_bits * self.get_nodeattr("SIMD")
+
+    def get_outstream_width(self, ind=0):
+        """Return the output stream width in bits: output bit width times PE."""
+        elem_bits = self.get_output_datatype().bitwidth()
+        return elem_bits * self.get_nodeattr("PE")
+
+    def get_weightstream_width(self):
+        """Return the weight stream width in bits (PE * SIMD * weight bit width).
+
+        Only the decoupled and external mem_mode have one; otherwise returns 0."""
+        if self.get_nodeattr("mem_mode") not in ("decoupled", "external"):
+            # no weight stream in this memory mode
+            return 0
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        stream_bits = pe * simd * weight_bits
+        # weights wider than 8 bits are rejected
+        assert weight_bits <= 8, "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+        return stream_bits
+
+    def get_weightstream_width_padded(self):
+        """Return the weight stream width rounded up to a multiple of 8 bits.
+
+        This is required by the AXI Stream spec. Used in decoupled mode."""
+        return roundup_to_integer_multiple(self.get_weightstream_width(), 8)
+
+    def get_ap_int_max_w(self):
+        """Return the largest bit width among the base-class value, the weight
+        stream and a single PE's weight entry (SIMD * weight bit width)."""
+        # base class impl (max of inp/out stream widths)
+        io_bits = super().get_ap_int_max_w()
+        # decoupled mode weight stream
+        stream_bits = self.get_weightstream_width()
+        # single PE weight entry: SIMD weights
+        entry_bits = self.get_nodeattr("SIMD") * self.get_weight_datatype().bitwidth()
+        return max([stream_bits, io_bits, entry_bits])
+
+    def get_folded_input_shape(self, ind=0):
+        """Return the folded shape of input `ind` as a tuple.
+
+        Input 0 folds to (*numInputVectors, MW // SIMD, SIMD). Input 1 (weights)
+        is only defined for external mem_mode and folds to
+        (*numInputVectors, (MW // SIMD) * (MH // PE), SIMD * PE)."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        synapse_fold = mw // simd
+        neuron_fold = mh // pe
+        batch_dims = tuple(self.get_nodeattr("numInputVectors"))
+        if ind == 0:
+            return batch_dims + (synapse_fold, simd)
+        if ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # input 1 holds the weights
+            return batch_dims + (synapse_fold * neuron_fold, simd * pe)
+        raise Exception("Undefined input shape for requested input")
+
+    def get_folded_output_shape(self, ind=0):
+        """Return the folded output shape (*numInputVectors, MH // PE, PE)."""
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        batch_dims = tuple(self.get_nodeattr("numInputVectors"))
+        # MH is split into MH // PE groups of PE elements
+        return batch_dims + (mh // pe, pe)
+
+    def get_normal_input_shape(self, ind=0):
+        """Return the unfolded input shape (*numInputVectors, MW)."""
+        mw = self.get_nodeattr("MW")
+        batch_dims = tuple(self.get_nodeattr("numInputVectors"))
+        return batch_dims + (mw,)
+
+    def get_normal_output_shape(self, ind=0):
+        """Return the unfolded output shape (*numInputVectors, MH)."""
+        mh = self.get_nodeattr("MH")
+        batch_dims = tuple(self.get_nodeattr("numInputVectors"))
+        return batch_dims + (mh,)
+
+    def get_number_output_values(self):
+        """Return the product of all folded output dimensions except the last."""
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0 and MW % SIMD == 0
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD)
+        * reverse the SIMD dimension and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
+    def minimize_accumulator_width(self, model):
+        """Derive the accumulator DataType from the weights and input datatype,
+        store it as accDataType and outputDataType, and return it."""
+        w_init = model.get_initializer(self.onnx_node.input[1])
+        # calculate minimum and maximum values of accumulator
+        lo, hi = calculate_matvec_accumulator_range(w_init, self.get_input_datatype())
+        if lo < 0:
+            # signed range: size the type by the bound of larger magnitude
+            bound = lo if abs(lo) > hi else -hi - 1
+        else:
+            bound = hi
+        adt = DataType.get_smallest_possible(bound)
+        # ensure a datatype divisible by 8-bits in case this is the last node
+        padded_bits = roundup_to_integer_multiple(adt.bitwidth(), 8)
+        padded_name = adt.name.replace(str(adt.bitwidth()), str(padded_bits))
+        adt = DataType[padded_name]
+        # for no-activation nodes, output dt = acc dt
+        for attr_name in ("accDataType", "outputDataType"):
+            self.set_nodeattr(attr_name, adt.name)
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {decoupled_npy, decoupled_verilog_dat,
+          decoupled_runtime}; any other value raises an Exception
+        * weight_file_name : filename for the weight file to be generated
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        if "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
+            # flipped
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            if weight_file_mode == "decoupled_npy":
+                # save the weight stream as .npy (generate_params requests this mode)
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown/unsupported weight_file_mode")
+
+        else:
+            raise Exception("Unknown/unsupported weight_file_mode")
+
+    def generate_params(self, model, path):
+        """Generate the weight files of this node inside the directory `path`.
+
+        Asks make_weight_file for weights.npy, memblock_sim_0.dat and
+        memblock_synth_0.dat; any mem_mode but "decoupled" raises an Exception."""
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode != "decoupled":
+            raise Exception(
+                """Please set mem_mode to "decoupled",
+                currently no other parameter value is supported!"""
+            )
+        # weight stream as .npy; execute_node feeds this file to the rtlsim
+        weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+        self.make_weight_file(
+            weights, "decoupled_npy", weight_filename_sim
+        )
+        # also save weights as Verilog .dat files: one for simulation and one
+        # for synthesis, because URAM-based weights always need zero weights
+        # for synthesis, otherwise they get inferred as BRAM
+        sim_dat = "{}/memblock_sim_0.dat".format(code_gen_dir)
+        synth_dat = "{}/memblock_synth_0.dat".format(code_gen_dir)
+        # sim weights are always the true weights
+        self.make_weight_file(
+            weights, "decoupled_verilog_dat", sim_dat
+        )
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "ultra":
+            # UltraRAM must have no memory initializer, or only zeroes
+            # otherwise BRAM will be inferred instead of URAM
+            # as a workaround we provide a zero-weight init here
+            synth_weights = np.zeros_like(weights, dtype=np.float32)
+        else:
+            synth_weights = weights
+        self.make_weight_file(
+            synth_weights, "decoupled_verilog_dat", synth_dat
+        )
+
+    def execute_node(self, context, graph):
+        """Execute this node via rtlsim and store the result in context.
+
+        The data input (first node input) is reshaped to the folded input
+        shape and saved as input_0.npy in code_gen_dir_ipgen, then streamed
+        through the rtlsim model. For mem_mode "external" or "decoupled" the
+        contents of weights.npy are streamed as well, repeated
+        np.prod(numInputVectors) times. The result is stored as float32 array
+        with the normal output shape under the name of the first node output.
+
+        Raises an Exception if exec_mode is not "rtlsim" (cppsim is not
+        available for the RTL MVAU) or if the node has more than three
+        inputs. The graph argument is not used by this implementation.
+        """
+        mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
+            )
+        if mode != "rtlsim":
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        # the first input is assumed to be the data input, the second the
+        # weights and the third the thresholds; only the data is exported
+        for in_ind, inp_name in enumerate(node.input):
+            if in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+            if in_ind == 0:
+                assert (
+                    str(context[inp_name].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                # make copy before saving the array
+                reshaped_input = context[inp_name].reshape(expected_inp_shape).copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+        # bound outside the loop so it is defined for any number of inputs
+        export_idt = self.get_input_datatype()
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        if mem_mode == "external" or mem_mode == "decoupled":
+            wnbits = self.get_weightstream_width()
+            export_wdt = self.get_weight_datatype()
+            wei = npy_to_rtlsim_input(
+                "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+            )
+            # the weight stream is repeated for each of the input vectors
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict = {
+                "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                "outputs": {"out": []},
+            }
+            self.rtlsim_multi_io(sim, io_dict)
+            output = io_dict["outputs"]["out"]
+        else:
+            output = self.rtlsim(sim, inp)
+        odt = self.get_output_datatype()
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        oshape = self.get_normal_output_shape()
+        output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+        context[node.output[0]] = output
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Generate the (System-)Verilog sources via generate_hdl() instead of
+        the usual C++ code and tcl script; model, fpgapart and clk are unused."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation.
+        Here: Intentionally a no-op."""
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim).
+        Here: Intentionally a no-op; execute_node rejects exec_mode cppsim."""
+
+    def compile_singlenode_code(self):
+        """No-op stub; presumably an HLS/cppsim hook not needed for RTL."""
+
+    def global_includes(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def defines(self, var):
+        """No-op stub (var is ignored); presumably an HLS code generation hook."""
+
+    def read_npy_data(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def strm_decl(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def docompute(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def dataoutstrm(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def save_as_npy(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def blackboxfunction(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def pragmas(self):
+        """No-op stub; presumably an HLS code generation hook not needed for RTL."""
+
+    def code_generation_ipi(self):
+        """Return the tcl commands that instantiate this node in a block design.
+
+        For mem_mode "decoupled" a hierarchy cell named after the node is
+        created that contains the node IP and a memstream weight streamer;
+        clock, reset and the AXI stream ports of the hierarchy are connected
+        to them. If runtime_writeable_weights is 1, the AXI lite interface of
+        the streamer is exposed on the hierarchy as well.
+        For mem_mode "const" or "external" the base class implementation is
+        used; any other value raises an Exception.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ("const", "external"):
+            # base class impl sufficient for const/external modes
+            return super().code_generation_ipi()
+        if mem_mode != "decoupled":
+            raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
+
+        # decoupled mode: a weight streamer is added next to the node IP
+        writable = self.get_nodeattr("runtime_writeable_weights") == 1
+        if self.get_nodeattr("ram_style") == "ultra":
+            assert (
+                writable == 1
+            ), "Layer with URAM weights must have runtime_writeable_weights=1"
+
+        name = self.onnx_node.name
+        sname = self.hls_sname()
+        # the hierarchy pins reuse the interface names of the node IP
+        intf_names = self.get_verilog_top_module_intf_names()
+        clk_name = intf_names["clk"][0]
+        rst_name = intf_names["rst"][0]
+        dout_name = intf_names["m_axis"][0][0]
+        din_name = intf_names["s_axis"][0][0]
+
+        strm_vlnv = "xilinx.com:user:memstream:1.0"
+        strm_inst = name + "_wstrm"
+        ip_vlnv = self.get_nodeattr("ip_vlnv")
+        # streamer memory geometry: depth and padded width of the weight stream
+        wmem = self.calc_wmem()
+        wwidth = self.get_weightstream_width_padded()
+        # MEM_INIT points the streamer at the ipgen code generation directory
+        mem_init = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+        ram_style = self.get_nodeattr("ram_style")
+
+        cmd = [
+            # create a hierarchy for this layer, with the same port names
+            "create_bd_cell -type hier %s" % name,
+            "create_bd_pin -dir I -type clk /%s/%s" % (name, clk_name),
+            "create_bd_pin -dir I -type rst /%s/%s" % (name, rst_name),
+            "create_bd_intf_pin -mode Master "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (name, dout_name),
+            "create_bd_intf_pin -mode Slave "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (name, din_name),
+            # instantiate the IP of this node
+            "create_bd_cell -type ip -vlnv %s /%s/%s" % (ip_vlnv, name, name),
+            # instantiate a streamer and connect it to the node IP
+            "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, name, strm_inst),
+            "set_property -dict [list "
+            "CONFIG.NSTREAMS {1} "
+            "CONFIG.MEM_DEPTH {%d} "
+            "CONFIG.MEM_WIDTH {%d} "
+            "CONFIG.MEM_INIT {%s} "
+            "CONFIG.RAM_STYLE {%s} "
+            "CONFIG.STRM0_DEPTH {%d} "
+            "CONFIG.STRM0_WIDTH {%d} "
+            "CONFIG.STRM0_OFFSET {0} "
+            "] [get_bd_cells /%s/%s]"
+            % (
+                wmem,
+                wwidth,
+                mem_init,
+                ram_style,
+                wmem,
+                wwidth,
+                name,
+                strm_inst,
+            ),
+            # weight stream from the streamer into the node IP
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+            "[get_bd_intf_pins %s/%s/weights_%s]"
+            % (name, strm_inst, name, name, sname),
+            # clock and reset of the streamer
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+            % (name, rst_name, name, strm_inst),
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+            % (name, clk_name, name, strm_inst),
+            # clock, reset and data streams of the node IP
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+            % (name, rst_name, name, name, rst_name),
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+            % (name, clk_name, name, name, clk_name),
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+            "[get_bd_intf_pins %s/%s/%s]"
+            % (name, din_name, name, name, din_name),
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+            "[get_bd_intf_pins %s/%s/%s]"
+            % (name, dout_name, name, name, dout_name),
+        ]
+
+        if writable:
+            # expose axi lite interface for writeable weights
+            axilite_name = intf_names["axilite"][0]
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                % (name, axilite_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (name, axilite_name, name, strm_inst, axilite_name)
+            )
+            # TODO calculate and pass in segment size here
+            cmd.append("assign_bd_address")
+        cmd.append("save_bd_design")
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        """Return the base class interface names, extended per mem_mode:
+        "external" appends the weight stream to s_axis, "decoupled" with
+        runtime_writeable_weights=1 adds an s_axilite entry under axilite."""
+        names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        weight_intf = "weights_" + self.hls_sname()
+        if mem_mode == "external":
+            names["s_axis"].append((weight_intf, self.get_weightstream_width_padded()))
+        elif mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            if self.get_nodeattr("runtime_writeable_weights") == 1:
+                names["axilite"] = ["s_axilite"]
+        return names
+
+    def get_op_and_param_counts(self):
+        """Return a dict with the MAC op count and the weight (and, unless
+        noActivation is set, threshold) parameter counts of this layer."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_repetitions = int(np.prod(self.get_nodeattr("numInputVectors")))
+
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        lo_bits, hi_bits = sorted((inp_bits, weight_bits))
+        counts = {
+            # one MAC per weight matrix element and input vector repetition
+            "op_mac_%dbx%db" % (lo_bits, hi_bits): mw * mh * num_repetitions,
+            "param_weight_%db" % (weight_bits): mw * mh,
+        }
+
+        if self.get_nodeattr("noActivation") == 0:
+            # counted as MH threshold parameters, bitwidth taken from accDataType
+            thres_bits = DataType[self.get_nodeattr("accDataType")].bitwidth()
+            counts["param_threshold_%db" % (thres_bits)] = mh
+        return counts
+
+    def derive_characteristic_fxns(self, period):
+        """Derive the characteristic functions via the base class, driving the
+        rtlsim with all-zero inputs (plus all-zero weight stream words for
+        mem_mode "decoupled" or "external")."""
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        sim_inputs = {"in0": [0 for _ in range(n_inps)]}
+        if self.get_nodeattr("mem_mode") in ["decoupled", "external"]:
+            # calc_wmem() weight words for each of the numInputVectors repetitions
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            sim_inputs["weights"] = [0 for _ in range(num_w_reps * n_weight_inps)]
+        io_dict = {
+            "inputs": sim_inputs,
+            "outputs": {"out": []},
+        }
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+
+    def generate_hdl(self):
+        """Generate the SystemVerilog implementation of this node from a template.
+
+        The template path and the placeholder dictionary are taken from
+        prepare_codegen_default(). Every placeholder key found in the template
+        is replaced by its value (list entries are joined by newlines, other
+        values are converted with str()) and the result is written to
+        <gen_top_module>_impl.sv in code_gen_dir_ipgen. The node attributes
+        gen_top_module, ipgen_path and ip_path are updated as a side effect.
+
+        Raises an Exception if ram_style is not "auto".
+        """
+        # TODO: add distinction between (PE=MH or PE=1) and where MH dimension
+        # is folded
+        template_path, code_gen_dict = self.prepare_codegen_default()
+
+        # add general parameters to dictionary
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+        # TODO: currently only ram_style=auto is supported
+        if self.get_nodeattr("ram_style") != "auto":
+            raise Exception("Unrecognized ram_style for MatrixVectorActivation")
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        for key, value in code_gen_dict.items():
+            # transform list into long string separated by '\n'; scalar values
+            # (e.g. integer parameters) are inserted as their string form
+            lines = value if isinstance(value, (list, tuple)) else [value]
+            template = template.replace(key, "\n".join(str(v) for v in lines))
+        # TODO: no wrapper template is available yet, so only the implementation
+        # file is generated; add <gen_top_module>_wrapper.v once a template exists
+        impl_name = self.get_nodeattr("gen_top_module") + "_impl.sv"
+        with open(os.path.join(code_gen_dir, impl_name), "w") as f:
+            f.write(template)
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_codegen_default(self):
+        """Return the template path and a dict mapping placeholders to string lists."""
+        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
+        template_path = (
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
+        )
+        values = {
+            "$PE$": self.get_nodeattr("PE"),
+            "$SIMD$": self.get_nodeattr("SIMD"),
+            "$MW$": self.get_nodeattr("MW"),
+            "$MH$": self.get_nodeattr("MH"),
+            "$ACTIVATION_WIDTH$": self.get_input_datatype().bitwidth(),
+            "$WEIGHT_WIDTH$": self.get_weight_datatype().bitwidth(),
+            "$ACCU_WIDTH_BA$": self.get_output_datatype().bitwidth(),
+        }
+        return template_path, {key: [str(val)] for key, val in values.items()}
+

From afab9cd6543b4fe1f612c329074d30d59706ac08 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:34:01 +0100
Subject: [PATCH 002/123] [rtl custom op]: initial implementation of mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9.sv | 284 ++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
new file mode 100644
index 0000000000..c992990d9f
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -0,0 +1,284 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
+ *****************************************************************************/
+
+module mvu_8sx9 #(
+    int unsigned PE,                // number of processing elements: one DSP chain and one p[] lane each
+    int unsigned SIMD,              // activation/weight pairs per cycle, three per DSP58; NOTE(review): a[3*i+2] indexing assumes SIMD % 3 == 0 - confirm
+    int unsigned ACTIVATION_WIDTH,  // bits per activation, extended to 9 bits per lane (PAD_BITS_ACT)
+    int unsigned WEIGHT_WIDTH,      // bits per weight, sign-extended to 8 bits per lane (PAD_BITS_WEIGHT)
+    bit SIGNED_ACTIVATIONS = 0,     // 1: sign-extend activations, 0: zero-extend
+    int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
+  )
+  (
+    input   logic clk,
+    input   logic rst,   // synchronous reset of the shift registers and DSP registers
+    input   logic en,    // clock enable of the shift registers and DSP registers
+    input   logic last,  // shifted through L; L[1] selects W=0 instead of P in the last DSP, L[0] is vld
+    input   logic zero,  // forwarded (delayed via Z where needed) to INMODE[1] of the DSPs
+    input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a,
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w,
+    output  logic vld,
+    output  logic [PE-1:0][57:0] p
+  );
+
+//-------------------- Declare global signals --------------------\\
+localparam int unsigned CHAINLEN = (SIMD+2)/3;
+localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+uwire [26:0] a_in_i [CHAINLEN];
+uwire [23:0] b_in_i [PE][CHAINLEN];
+uwire [57:0] pcout [PE][CHAINLEN];
+
+//-------------------- Shift register for opmode select signal --------------------\\
+localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+always_ff @(posedge clk) begin
+  if(rst)     L <= '{default: 0};
+  else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+end
+assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+  always_ff @(posedge clk) begin
+    if (rst)      Z <= '{default: 0};
+    else if(en) begin
+        Z[0] <= zero;
+        if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; // shift by one, staying within the declared range of Z
+    end
+  end
+end : genZreg
+
+//-------------------- Buffer for input activations --------------------\\
+localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
+
+for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+  localparam int TOTAL_PREGS = i/SEGLEN;
+  localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+
+  if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+    a_buffer_t A [0:EXTERNAL_PREGS-1];
+    always_ff @(posedge clk) begin
+      if (rst)     A <= '{default: 0};
+      else if(en) begin
+        A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+        if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+      end
+    end
+    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
+                             : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+  end : genExternalPregAct
+  else begin : genInpDSPAct
+    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
+                             : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+  end : genInpDSPAct
+
+end : genActSIMD
+
+//-------------------- Buffer for weights --------------------\\
+localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
+
+for (genvar j=0; j<PE; j++) begin : genWeightPE
+  for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
+    localparam int TOTAL_PREGS = i/SEGLEN;
+    localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+
+    if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+      b_buffer_t B [0:EXTERNAL_PREGS-1]; // private to this (j, i) generate instance, so no PE dimension is needed
+      always_ff @(posedge clk) begin
+        if (rst)    B <= '{default: 0};
+        else if (en) begin
+          B[EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
+          if (EXTERNAL_PREGS > 1) B[0:EXTERNAL_PREGS-2] <= B[1:EXTERNAL_PREGS-1];
+        end
+      end
+      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[0][2][WEIGHT_WIDTH-1]}}, B[0][2], {PAD_BITS_WEIGHT{B[0][1][WEIGHT_WIDTH-1]}}, B[0][1], {PAD_BITS_WEIGHT{B[0][0][WEIGHT_WIDTH-1]}}, B[0][0] };
+    end : genExternalPregWeight
+    else begin : genInpDSPWeight
+      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+    end : genInpDSPWeight
+  end : genWeightSIMD
+
+end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+for (genvar j=0; j<PE; j++) begin : genDSPPE
+  for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
+    localparam int TOTAL_PREGS = i/SEGLEN;
+    localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
+    localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
+    localparam bit FIRST = i == 0;
+    localparam bit LAST = i == CHAINLEN-1;
+    uwire [57:0] pp;
+
+    if (LAST) begin : genPOUT
+      assign p[j] = pp;
+    end
+
+    DSP58 #(
+      // Feature Control Attributes: Data Path Selection
+      .AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+      .A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+      .BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+      .B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+      .DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+                                          // legacy mode.
+      .PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+      .RND(58'h000000000000000),          // Rounding Constant
+      .USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+      .USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+      .USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+      .XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+      // Pattern Detector Attributes: Pattern Detection Configuration
+      .AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+      .AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+      .MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+      .PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+      .SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+      .SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+      .USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+      // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+      .IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+      .IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+      .IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+      .IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+      .IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+      .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+                            FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
+                            2'b01, // Y : M
+                            2'b01  // X: M
+        }), // Optional inversion for OPMODE
+      .IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+      .IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+      .IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+      .IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+      .IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+      .IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+      .IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+      .IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+      .IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+      .IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+      // Register Control Attributes: Pipeline Register Configuration
+      .ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+      .ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+      .ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+      .AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+      .BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+      .BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+      .CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+      .CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+      .CREG(0),                           // Pipeline stages for C (0-1)
+      .DREG(0),                           // Pipeline stages for D (0-1)
+      .INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+      .MREG(1),                           // Multiplier pipeline stages (0-1)
+      .OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+      .PREG(PREG),                        // Number of pipeline stages for P (0-1)
+      .RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+    )
+    DSP58_inst (
+      // Cascade outputs: Cascade Ports
+      .ACOUT(),                           // 34-bit output: A port cascade
+      .BCOUT(),                           // 24-bit output: B cascade
+      .CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+      .MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+      .PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+      // Control outputs: Control Inputs/Status Bits
+      .OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+      .PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+      .PATTERNDETECT(),                   // 1-bit output: Pattern detect
+      .UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+      // Data outputs: Data Ports
+      .CARRYOUT(),                        // 4-bit output: Carry
+      .P(pp),                             // 58-bit output: Primary data
+      .XOROUT(),                          // 8-bit output: XOR data
+      // Cascade inputs: Cascade Ports
+      .ACIN('x),                          // 34-bit input: A cascade data
+      .BCIN('x),                          // 24-bit input: B cascade
+      .CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+      .MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+      .PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+      // Control inputs: Control Inputs/Status Bits
+      .ALUMODE(4'h0),                     // 4-bit input: ALU control
+      .CARRYINSEL('0),                    // 3-bit input: Carry select
+      .CLK(clk),                          // 1-bit input: Clock
+      .INMODE({
+              INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+              2'b00,
+              TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+              INTERNAL_PREGS==2 ? 1'b0 : 1'b1
+      }),                                 // 5-bit input: INMODE control
+      .NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+      .OPMODE({
+              LAST ? {1'b0, L[1]} : 2'b00,
+              7'b000_0000
+      }), // 9-bit input: Operation mode
+      // Data inputs: Data Ports
+      .A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+      .B(b_in_i[j][i]),                   // 24-bit input: B data
+      .C('x),                             // 58-bit input: C data
+      .CARRYIN('0),                       // 1-bit input: Carry-in
+      .D('x),                             // 27-bit input: D data
+      // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+      .ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+      .CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+      .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+      .CEAD('0),                          // 1-bit input: Clock enable for ADREG
+      .CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+      .CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+      .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+      .CEC('0),                           // 1-bit input: Clock enable for CREG
+      .CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+      .CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+      .CED('0),                           // 1-bit input: Clock enable for DREG
+      .CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+      .CEM(en),                           // 1-bit input: Clock enable for MREG
+      .CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+      .RSTA(rst),                         // 1-bit input: Reset for AREG
+      .RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+      .RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+      .RSTB(rst),                         // 1-bit input: Reset for BREG
+      .RSTC('0),                          // 1-bit input: Reset for CREG
+      .RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+      .RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+      .RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+      .RSTM(rst),                         // 1-bit input: Reset for MREG
+      .RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+    );
+  end : genDSPChain
+end : genDSPPE
+
+endmodule

From a94fc3bb0759ecd4b9af212d1629236894a1b520 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:34:22 +0100
Subject: [PATCH 003/123] [rtl custom op]: testbench for mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..ea3ecbbd70
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb();
+
+  //-------------------- Simulation parameters --------------------\\
+  // Matrix & parallelism config
+  localparam int unsigned MH = 256;
+  localparam int unsigned PE = 16;
+  localparam int unsigned MW = 600;
+  localparam int unsigned SIMD = 60;
+  localparam int unsigned SEGMENTLEN = 4;
+  // Bit-width config  
+  localparam int unsigned ACTIVATION_WIDTH = 8;
+  localparam int unsigned WEIGHT_WIDTH = 4;
+  localparam bit SIGNED_ACTIVATIONS = 1;
+  // Simulation constants
+  localparam int unsigned NF = MH/PE;
+  localparam int unsigned SF = MW/SIMD;
+  localparam int unsigned NUM_OF_DSP = SIMD/3;
+  
+  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+  typedef activation_t activation_vector_t[SF];
+
+  function activation_vector_t init_ACTIVATIONS;
+    automatic activation_vector_t res;
+    std::randomize(res);
+    return res;
+  endfunction : init_ACTIVATIONS
+
+  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+  typedef weight_t weight_matrix_t[NF][SF];
+  
+  function weight_matrix_t init_WEIGHTS;
+    automatic weight_matrix_t res;
+    std::randomize(res);
+    return res;
+  endfunction : init_WEIGHTS;
+  
+  typedef logic signed [PE-1:0][57:0] output_t;
+  typedef output_t output_vector_t [NF];
+
+  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+    automatic output_vector_t res = '{default: 0};
+    for (int j = 0; j<MH; j++) begin
+      for (int i = 0; i<MW; i++) begin
+        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+      end
+    end  
+    return res;
+  endfunction : check_output;
+  
+  logic clk = 0;
+  always #5ns clk = !clk;
+  
+  logic rst;
+  initial begin
+    rst = 1;
+    repeat(16) @(posedge clk);
+    rst <= 0;
+  end
+   
+  logic last;
+  logic zero;
+  logic vld;
+  activation_t a;
+  weight_t w;
+  output_t p;
+  // Reference signals
+  activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+  weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+  output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+  // Counter for number of outputs (NF dimension) that are produced
+  int NF_CNT = 0;
+  
+  initial begin  // Stimulus: builds the reference data, then feeds NF x SF (a, w) beats into the DUT
+    ACTIVATIONS = init_ACTIVATIONS();
+    WEIGHTS = init_WEIGHTS();
+    GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+    last = 0;
+    zero = 0;
+    a = 'x;
+    w = 'x;
+    // Hold off all stimulus until reset has been released
+    @(posedge clk iff !rst);
+    // The same SF activation beats are replayed for each of the NF weight tiles
+    for (int j=0; j<NF; j++) begin
+      for (int i=0; i<SF; i++) begin
+        last <= (i==SF-1) ? 1 : 0;  // marks the final beat of each accumulation
+        a <= ACTIVATIONS[i];
+        w <= WEIGHTS[j][i];
+        @(posedge clk iff en);      // a beat only counts on clock edges where en is high
+      end
+    end
+
+    last <= 0;
+    zero <= 1;
+
+    // Continue until all NF outputs are produced & compared (sampled on the clock, like the waits above)
+    @(posedge clk iff (NF_CNT==NF));
+
+    $finish;
+  end
+
+  logic en = 0;
+  always_ff @(posedge clk) begin
+    en <= ($urandom()%7 > 1) && !rst;
+  end
+
+  // Compare computed output against golden output when vld flag is raised by DUT
+  always_ff @(posedge clk iff (vld && en)) begin
+    foreach(p[i]) begin
+      assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+      else begin 
+        $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+        $stop;
+      end  
+    end
+    NF_CNT += 1;
+  end
+
+  // Instantiate DUT
+  mvu_8sx9 #(
+      .PE(PE),
+      .SIMD(SIMD),
+      .WEIGHT_WIDTH(WEIGHT_WIDTH),
+      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+      .SEGMENTLEN(SEGMENTLEN)
+    )
+    dut (
+      .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+    );
+  
+endmodule

From 98f9accb40bed3445215e15d30398e09948e0b9f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:35:30 +0100
Subject: [PATCH 004/123] [rtl custom op]: initial implementation of flow
 control component for mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
new file mode 100644
index 0000000000..8765c50a26
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -0,0 +1,179 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_8sx9_axi #(
+    int unsigned MW,
+    int unsigned MH,
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,
+    int unsigned WEIGHT_WIDTH,
+    int unsigned ACCU_WIDTH,
+    bit SIGNED_ACTIVATIONS = 0,
+    int unsigned SEGMENTLEN = 0,
+		parameter RAM_STYLE = "auto",
+
+    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+		localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+		localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+    localparam int unsigned SF = MW/SIMD,
+		localparam int unsigned NF = MH/PE,
+    localparam int unsigned OUTPUT_LANES = PE,
+    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input		logic  ap_clk,
+	input		logic  ap_rst_n,
+
+	// Weight Stream
+	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input		logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input		logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input		logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin
+		if (MW % SIMD != 0) begin
+		$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+		$finish;
+		end
+		if (MH % PE != 0) begin
+		$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+		$finish;
+		end
+		if (ACTIVATION_WIDTH > 9) begin
+		$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+		$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+		$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+		$finish;
+		end
+		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+		$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+		$finish;
+		end
+		if (SEGMENTLEN == 0) begin
+		$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+		end
+		if (SEGMENTLEN > (SIMD+2)/3) begin
+		$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+		$finish;
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+	//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;
+	assign ardy = en && s_axis_weights_tvalid;
+	assign s_axis_weights_tready = en && avld;
+
+	//-------------------- Core MVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][57:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+		.clk, .rst, .en,
+		.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+		.vld(ovld), .p(odat)
+	);
+
+	//-------------------- Output register slice --------------------\\
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};
+
+	assign en = !A.vld || !ovld;
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)  A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [57:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+	
+	struct {	// Second output register stage; drives the m_axis_output_* ports
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};
+	// B takes the contents of A whenever it is empty or its current word is accepted downstream
+	assign	b_load = !B.vld || m_axis_output_tready;
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x };	// vld must reset to 0: it drives tvalid and b_load; dat may stay undefined
+		else begin
+			if(b_load)	 B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;
+
+endmodule
\ No newline at end of file

From 96925a929877ce084466438128678250b09784a9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:36:00 +0100
Subject: [PATCH 005/123] [rtl custom op]: implementation of replay buffer for
 mvu

---
 finn-rtllib/mvu/replay_buffer.sv | 109 +++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 finn-rtllib/mvu/replay_buffer.sv

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
new file mode 100644
index 0000000000..685ac03137
--- /dev/null
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Replay buffer for counted sequences on an AXI-lite stream.
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ *****************************************************************************/
+
+module replay_buffer #(
+	int unsigned  LEN,	// Sequence length
+	int unsigned  REP,	// Sequence replay count
+	int unsigned  W,	// Data width
+	parameter RAM_STYLE = "auto" 	// ram style for buffer {block, distributed, ultra, auto}
+)(
+	input	logic  clk,
+	input	logic  rst,	// Synchronous: forces clr, which zeroes Count (and re-arms FirstRep when REP > 1)
+
+	input	logic [W-1:0]  idat,	// Input stream data
+	input	logic  ivld,	// Input stream valid
+	output	logic  irdy,	// Input stream ready; only asserted while first_rep is set
+
+	output	logic [W-1:0]  odat,	// idat while first_rep is set, otherwise the buffered word rdat
+	output	logic  olast,	// Mirrors done_len
+	output	logic  ofin,	// Mirrors done_all (done_len && done_rep)
+	output	logic  ovld,	// Follows ivld while first_rep is set, constant 1 afterwards
+	input	logic  ordy
+);
+	// Forwards the input while first_rep is set; for REP > 1 each forwarded word is also kept in Buf for replay.
+	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;	// Upper $clog2(REP) bits feed done_rep, lower $clog2(LEN) bits feed done_len
+	count_t  Count = 0;
+	uwire  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;	// True once every bit set in LEN-1 is set in the low field
+	uwire  done_rep;
+	uwire  done_all = done_len && done_rep;
+
+	uwire  shift;	// One output transfer: ovld && ordy
+	uwire  clr = rst || (done_all && shift);	// Restart on reset or after the final transfer of the final repetition
+	always_ff @(posedge clk) begin
+		if(clr)         Count <= 0;
+		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);	// At sequence end, step by the gap up to 2**$clog2(LEN): low field wraps to 0, upper field increments
+	end
+
+	typedef logic [W-1:0]  data_t;
+	uwire data_t  rdat;	// Replay data read from the buffer
+	uwire  first_rep;
+	if(REP == 1) begin	// Single pass: pure pass-through, no buffer is generated
+		assign	done_rep  = 1;
+		assign	first_rep = 1;
+		assign	rdat = 'x;
+	end
+	else begin
+		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;	// Same bit-mask test as done_len, applied to the upper field
+
+		logic  FirstRep = 1;
+		always_ff @(posedge clk) begin
+			if(clr)         FirstRep <= 1;
+			else if(shift)  FirstRep <= FirstRep && !done_len;	// Drops after the transfer that completes the first sequence
+		end
+		assign	first_rep = FirstRep;
+
+		(* RAM_STYLE = RAM_STYLE *)
+		data_t  Buf[LEN];
+		if(LEN == 1) begin : genTrivial
+			always_ff @(posedge clk) begin
+				if(shift && FirstRep)  Buf[0] <= idat;	// Single-entry buffer: capture once, then hold
+			end
+		end : genTrivial
+		else begin : genShift
+			always_ff @(posedge clk) begin
+				if(shift)  Buf <= { odat, Buf[0:LEN-2] };	// Every transfer shifts odat in at index 0, so replayed words recirculate
+			end
+		end : genShift
+
+		assign	rdat = Buf[LEN-1];	// The entry at the far end of the shift chain is replayed next
+	end
+
+	assign  irdy  = ordy && first_rep;
+	assign	odat  = first_rep? idat : rdat;
+	assign	olast = done_len;
+	assign	ofin  = done_all;
+	assign	ovld  = first_rep? ivld : 1;
+	assign	shift = ovld && ordy;
+
+endmodule : replay_buffer
\ No newline at end of file

From a3d11567468899bbcf33c83b509c26f908a807a3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:37:16 +0100
Subject: [PATCH 006/123] [rtl custom op]: testbench for mvu_8sx9_axi
 (including axi_wrapper & compute kernel)

---
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 +++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
new file mode 100644
index 0000000000..ea97e0708c
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
@@ -0,0 +1,208 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_8sx9_axi_tb();
+
+  //-------------------- Simulation parameters --------------------\\
+  // Matrix & parallelism config
+  localparam int unsigned MW = 600;
+  localparam int unsigned MH = 256;
+  localparam int unsigned SIMD = 60;
+  localparam int unsigned PE = 16;
+  localparam int unsigned SEGMENTLEN = 4;
+  // Bit-width config  
+  localparam int unsigned ACTIVATION_WIDTH = 8;
+  localparam int unsigned WEIGHT_WIDTH = 4;
+  localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+  localparam bit SIGNED_ACTIVATIONS = 1;
+  // Simulation constants  
+  localparam int unsigned NF = MH/PE;
+  localparam int unsigned SF = MW/SIMD;
+  localparam int unsigned NUM_OF_DSP = SIMD/3;
+  localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+  localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+  localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+  localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+  localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+  // Generate clk and reset signal   
+  logic clk = 0;
+  always #5ns clk = !clk;
+  
+  logic ap_rst_n = 0;
+  initial begin
+    repeat(16) @(posedge clk);
+    ap_rst_n <= 1;
+  end
+
+  uwire ap_clk = clk;
+
+  // Generate activations  
+  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+  typedef activation_t activation_vector_t[SF];
+    
+  function activation_vector_t init_ACTIVATIONS;
+    automatic activation_vector_t res;
+    std::randomize(res);
+    return res;
+  endfunction : init_ACTIVATIONS
+
+  activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+  struct {
+    activation_t dat;
+    logic vld;
+    logic rdy;
+  } activations;
+
+  initial begin
+    activations.vld = 0;
+    activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+    @(posedge clk iff ap_rst_n);
+
+    for (int i=0; i<SF; i++) begin
+      activations.dat <= ACTIVATIONS[i];
+      do begin 
+        activations.vld = $urandom()%7 > 1;
+        @(posedge clk);
+      end while (!(activations.vld === 1 && activations.rdy === 1));
+    end
+    
+    activations.vld <= 0;
+    activations.dat <= 'x;
+  end
+   
+  // Generate weights   
+  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+  typedef weight_t weight_matrix_t[NF][SF]; 
+  
+  function weight_matrix_t init_WEIGHTS;
+    automatic weight_matrix_t res;
+    std::randomize(res);
+    return res;
+  endfunction : init_WEIGHTS;
+
+  weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+  struct {
+    weight_t dat;
+    logic vld;
+    logic rdy;
+  } weights;
+
+  initial begin
+    weights.vld = 0;
+    weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+    @(posedge clk iff ap_rst_n);
+
+    weights.vld <= 1;
+    for (int i=0; i<NF; i++) begin
+      for (int j=0; j<SF; j++) begin
+        weights.dat <= WEIGHTS[i][j];
+        @(posedge clk iff weights.rdy);
+      end
+    end
+
+    weights.vld <= 0;
+    weights.dat <= 'x;
+  end
+  
+  // Function to compute golden output  
+  // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+  // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+  typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+  typedef output_t output_vector_t [NF];
+
+  struct {
+    output_t dat;
+    logic vld;
+    logic rdy;
+  } outputs;
+
+  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+    automatic output_vector_t res = '{default: 0};
+    for (int j = 0; j<MH; j++) begin
+      for (int i = 0; i<MW; i++) begin
+        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+      end
+    end  
+    return res;
+  endfunction : check_output;
+
+  output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+  int unsigned NF_CNT = 0;
+  initial begin
+    outputs.rdy = 0;
+    while (NF_CNT < NF) begin
+      // Loop until both rdy & vld are asserted
+      do begin
+        outputs.rdy <= $urandom()%7 >= 1;
+        @(posedge clk iff ap_rst_n);
+      end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+      // Compare produced outputs against golden outputs
+      foreach(outputs.dat[i]) begin
+        assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+        else begin 
+          $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+          $stop;
+        end  
+      end
+      
+      NF_CNT += 1;
+    end
+    
+    $finish;  
+  end
+
+  // Instantiate DUT
+  mvu_8sx9_axi #(
+      .MW(MW),
+      .MH(MH),
+      .PE(PE),
+      .SIMD(SIMD),
+      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+      .WEIGHT_WIDTH(WEIGHT_WIDTH),
+      .ACCU_WIDTH(ACCU_WIDTH),
+      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+      .SEGMENTLEN(SEGMENTLEN)
+    )
+    dut (
+      .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+      .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+      .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+      .m_axis_output_tready(outputs.rdy)
+    );
+  
+endmodule

From 2aea664b2260a4ea759909d0a3168b5f62b114a2 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:37:55 +0100
Subject: [PATCH 007/123] [rtl custom op]: initial implementation of verilog
 wrapper for mvu_8sx9_axi

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 90 ++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
new file mode 100644
index 0000000000..ff3779d211
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(	// Plain-Verilog shell that instantiates the SystemVerilog module mvu_8sx9_axi
+	parameter 	MW = $MW$,
+	parameter		MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
+	// NOTE(review): the $NAME$ tokens above are presumably substituted by the code generator - confirm
+	// Safely deducible parameters
+	parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter OUTPUT_LANES = PE,
+	parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+	// Global Control (ports use implicit net types: 'logic' is not available in plain Verilog)
+	input		ap_clk,
+	input		ap_rst_n,
+
+	// Weight Stream
+	input		[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input		s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
+
+	// Input Stream
+	input		[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input		s_axis_input_tvalid,
+	output	s_axis_input_tready,
+
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input		m_axis_output_tready
+);
+// User parameters and all ports are handed straight through to the implementation
+mvu_8sx9_axi #(
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule
\ No newline at end of file

From 8b57849bb47c3119b177e78dcbaa48954f69b811 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 11 Apr 2023 15:50:24 +0100
Subject: [PATCH 008/123] [rtl mvu]: fix tab indentation

---
 finn-rtllib/mvu/mvu_8sx9.sv            | 424 ++++++++++++-------------
 finn-rtllib/mvu/mvu_8sx9_axi.sv        |  32 +-
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv     | 342 ++++++++++----------
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v |  26 +-
 finn-rtllib/mvu/mvu_8sx9_tb.sv         | 258 +++++++--------
 5 files changed, 541 insertions(+), 541 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index c992990d9f..d082d4fb2e 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -52,233 +52,233 @@ module mvu_8sx9 #(
   );
 
 //-------------------- Declare global signals --------------------\\
-localparam int unsigned CHAINLEN = (SIMD+2)/3;
-localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-uwire [26:0] a_in_i [CHAINLEN];
-uwire [23:0] b_in_i [PE][CHAINLEN];
-uwire [57:0] pcout [PE][CHAINLEN];
+	localparam int unsigned CHAINLEN = (SIMD+2)/3;
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	uwire [26:0] a_in_i [CHAINLEN];
+	uwire [23:0] b_in_i [PE][CHAINLEN];
+	uwire [57:0] pcout [PE][CHAINLEN];
 
 //-------------------- Shift register for opmode select signal --------------------\\
-localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
 
-always_ff @(posedge clk) begin
-  if(rst)     L <= '{default: 0};
-  else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
-end  
-assign vld = L[0];
+	always_ff @(posedge clk) begin
+		if(rst)     L <= '{default: 0};
+		else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+	end  
+	assign vld = L[0];
 
 //-------------------- Shift register for ZERO flag --------------------\\
-logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
 
-if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-  always_ff @(posedge clk) begin
-    if (rst)      Z <= '{default: 0};
-    else if(en) begin
-        Z[0] <= zero;
-        if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
-    end    
-  end
-end;
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+		always_ff @(posedge clk) begin
+			if (rst)      Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
+			end    
+		end
+	end;
 
 //-------------------- Buffer for input activations --------------------\\
-localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
 
-for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-  localparam int TOTAL_PREGS = i/SEGLEN;
-  localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-  
-  if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-    a_buffer_t A [0:EXTERNAL_PREGS-1];
-    always_ff @(posedge clk) begin
-      if (rst)     A <= '{default: 0};
-      else if(en) begin
-        A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
-        if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-      end
-    end
-    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
-                             : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
-  end : genExternalPregAct
-  else begin : genInpDSPAct
-    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
-                             : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
-  end : genInpDSPAct
+	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+		localparam int TOTAL_PREGS = i/SEGLEN;
+		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
 
-end : genActSIMD
+		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+			a_buffer_t A [0:EXTERNAL_PREGS-1];
+			always_ff @(posedge clk) begin
+				if (rst)     A <= '{default: 0};
+				else if(en) begin
+					A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+				end
+			end
+			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
+									: { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+		end : genExternalPregAct
+		else begin : genInpDSPAct
+			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
+									: { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+		end : genInpDSPAct
+
+	end : genActSIMD
 
 //-------------------- Buffer for weights --------------------\\
-localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+	typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
 
-for (genvar j=0; j<PE; j++) begin : genWeightPE
-  for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
-    localparam int TOTAL_PREGS = i/SEGLEN;
-    localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-    
-    if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-      b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
-      always_ff @(posedge clk) begin
-        if (rst)    B <= '{default: 0};
-        else if (en) begin
-          B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
-          if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
-        end
-      end
-      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
-    end : genExternalPregWeight
-    else begin : genInpDSPWeight
-      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
-    end : genInpDSPWeight
-  end : genWeightSIMD
+	for (genvar j=0; j<PE; j++) begin : genWeightPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
 
-end : genWeightPE
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+				always_ff @(posedge clk) begin
+					if (rst)    B <= '{default: 0};
+					else if (en) begin
+						B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
+						if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+					end
+				end
+				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+			end : genInpDSPWeight
+		end : genWeightSIMD
+
+	end : genWeightPE
 
 //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-for (genvar j=0; j<PE; j++) begin : genDSPPE
-  for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
-    localparam int TOTAL_PREGS = i/SEGLEN;
-    localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-    localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
-    localparam bit FIRST = i == 0;
-    localparam bit LAST = i == CHAINLEN-1;
-    uwire [57:0] pp;
-    
-    if (LAST) begin : genPOUT
-      assign p[j] = pp;
-    end      
-    
-    DSP58 #(
-      // Feature Control Attributes: Data Path Selection
-      .AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-      .A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-      .BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-      .B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-      .DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-                                          // legacy mode.
-      .PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-      .RND(58'h000000000000000),          // Rounding Constant
-      .USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-      .USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-      .USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-      .XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-      // Pattern Detector Attributes: Pattern Detection Configuration
-      .AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-      .AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-      .MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-      .PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-      .SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-      .SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-      .USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-      // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-      .IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-      .IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-      .IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-      .IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-      .IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-      .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-                            FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
-                            2'b01, // Y : M
-                            2'b01  // X: M
-        }), // Optional inversion for OPMODE
-      .IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-      .IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-      .IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-      .IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-      .IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-      .IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-      .IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-      .IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-      .IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-      .IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-      // Register Control Attributes: Pipeline Register Configuration
-      .ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-      .ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-      .ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-      .AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-      .BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-      .BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-      .CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-      .CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-      .CREG(0),                           // Pipeline stages for C (0-1)
-      .DREG(0),                           // Pipeline stages for D (0-1)
-      .INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-      .MREG(1),                           // Multiplier pipeline stages (0-1)
-      .OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-      .PREG(PREG),                        // Number of pipeline stages for P (0-1)
-      .RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-    )
-    DSP58_inst (
-      // Cascade outputs: Cascade Ports
-      .ACOUT(),                           // 34-bit output: A port cascade
-      .BCOUT(),                           // 24-bit output: B cascade
-      .CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-      .MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-      .PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
-      // Control outputs: Control Inputs/Status Bits
-      .OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-      .PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-      .PATTERNDETECT(),                   // 1-bit output: Pattern detect
-      .UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-      // Data outputs: Data Ports
-      .CARRYOUT(),                        // 4-bit output: Carry
-      .P(pp),                             // 58-bit output: Primary data
-      .XOROUT(),                          // 8-bit output: XOR data
-      // Cascade inputs: Cascade Ports
-      .ACIN('x),                          // 34-bit input: A cascade data
-      .BCIN('x),                          // 24-bit input: B cascade
-      .CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-      .MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-      .PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
-      // Control inputs: Control Inputs/Status Bits
-      .ALUMODE(4'h0),                     // 4-bit input: ALU control
-      .CARRYINSEL('0),                    // 3-bit input: Carry select
-      .CLK(clk),                          // 1-bit input: Clock
-      .INMODE({
-              INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-              2'b00,
-              TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-              INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
-      }),                                 // 5-bit input: INMODE control
-      .NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-      .OPMODE({
-              LAST ? {1'b0, L[1]} : 2'b00,
-              7'b000_0000
-      }), // 9-bit input: Operation mode
-      // Data inputs: Data Ports
-      .A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-      .B(b_in_i[j][i]),                   // 24-bit input: B data
-      .C('x),                             // 58-bit input: C data
-      .CARRYIN('0),                       // 1-bit input: Carry-in
-      .D('x),                             // 27-bit input: D data
-      // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-      .ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-      .CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-      .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-      .CEAD('0),                          // 1-bit input: Clock enable for ADREG
-      .CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-      .CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-      .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-      .CEC('0),                           // 1-bit input: Clock enable for CREG
-      .CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-      .CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-      .CED('0),                           // 1-bit input: Clock enable for DREG
-      .CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-      .CEM(en),                           // 1-bit input: Clock enable for MREG
-      .CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-      .RSTA(rst),                         // 1-bit input: Reset for AREG
-      .RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-      .RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-      .RSTB(rst),                         // 1-bit input: Reset for BREG
-      .RSTC('0),                          // 1-bit input: Reset for CREG
-      .RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-      .RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-      .RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-      .RSTM(rst),                         // 1-bit input: Reset for MREG
-      .RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-    );
-  end : genDSPChain  
-end : genDSPPE
+	for (genvar j=0; j<PE; j++) begin : genDSPPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
+			localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
+			localparam bit FIRST = i == 0;
+			localparam bit LAST = i == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT
+				assign p[j] = pp;
+			end      
+
+			DSP58 #(
+				// Feature Control Attributes: Data Path Selection
+				.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+				.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+				.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+				.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+				.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+													// legacy mode.
+				.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+				.RND(58'h000000000000000),          // Rounding Constant
+				.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+				.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+				.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+				.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+				// Pattern Detector Attributes: Pattern Detection Configuration
+				.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+				.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+				.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+				.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+				.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+				.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+				.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+				// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+				.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+				.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+				.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+				.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+				.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+				.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+									FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+									2'b01, // Y : M
+									2'b01  // X: M
+				}), // Optional inversion for OPMODE
+				.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+				.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+				.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+				.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+				.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
+				.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+				.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+				.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+				.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+				.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+				// Register Control Attributes: Pipeline Register Configuration
+				.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+				.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+				.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+				.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+				.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+				.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+				.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+				.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+				.CREG(0),                           // Pipeline stages for C (0-1)
+				.DREG(0),                           // Pipeline stages for D (0-1)
+				.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+				.MREG(1),                           // Multiplier pipeline stages (0-1)
+				.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+				.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+				.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+			)
+			DSP58_inst (
+				// Cascade outputs: Cascade Ports
+				.ACOUT(),                           // 34-bit output: A port cascade
+				.BCOUT(),                           // 24-bit output: B cascade
+				.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+				.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+				.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+				// Control outputs: Control Inputs/Status Bits
+				.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+				.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+				.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+				.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+				// Data outputs: Data Ports
+				.CARRYOUT(),                        // 4-bit output: Carry
+				.P(pp),                             // 58-bit output: Primary data
+				.XOROUT(),                          // 8-bit output: XOR data
+				// Cascade inputs: Cascade Ports
+				.ACIN('x),                          // 34-bit input: A cascade data
+				.BCIN('x),                          // 24-bit input: B cascade
+				.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+				.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+				.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+				// Control inputs: Control Inputs/Status Bits
+				.ALUMODE(4'h0),                     // 4-bit input: ALU control
+				.CARRYINSEL('0),                    // 3-bit input: Carry select
+				.CLK(clk),                          // 1-bit input: Clock
+				.INMODE({
+						INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+						2'b00,
+						TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+						INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+				}),                                 // 5-bit input: INMODE control
+				.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+				.OPMODE({
+						LAST ? {1'b0, L[1]} : 2'b00,
+						7'b000_0000
+				}), // 9-bit input: Operation mode
+				// Data inputs: Data Ports
+				.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+				.B(b_in_i[j][i]),                   // 24-bit input: B data
+				.C('x),                             // 58-bit input: C data
+				.CARRYIN('0),                       // 1-bit input: Carry-in
+				.D('x),                             // 27-bit input: D data
+				// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+				.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+				.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+				.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+				.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+				.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+				.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+				.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+				.CEC('0),                           // 1-bit input: Clock enable for CREG
+				.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+				.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+				.CED('0),                           // 1-bit input: Clock enable for DREG
+				.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+				.CEM(en),                           // 1-bit input: Clock enable for MREG
+				.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+				.RSTA(rst),                         // 1-bit input: Reset for AREG
+				.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+				.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+				.RSTB(rst),                         // 1-bit input: Reset for BREG
+				.RSTC('0),                          // 1-bit input: Reset for CREG
+				.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+				.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+				.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+				.RSTM(rst),                         // 1-bit input: Reset for MREG
+				.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+			);
+		end : genDSPChain  
+	end : genDSPPE
     
 endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
index 8765c50a26..6c7eaeaeca 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -41,36 +41,36 @@ module mvu_8sx9_axi #(
     int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0,
-		parameter RAM_STYLE = "auto",
+	parameter RAM_STYLE = "auto",
 
     localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
     localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-		localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-		localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
     localparam int unsigned SF = MW/SIMD,
-		localparam int unsigned NF = MH/PE,
+	localparam int unsigned NF = MH/PE,
     localparam int unsigned OUTPUT_LANES = PE,
     localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
-	input		logic  ap_clk,
-	input		logic  ap_rst_n,
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
 
 	// Weight Stream
-	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input		logic  s_axis_weights_tvalid,
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
 	output	logic  s_axis_weights_tready,
 
 	// Input Stream
-	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input		logic  s_axis_input_tvalid,
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
 	output	logic  s_axis_input_tready,
 
 	// Output Stream
 	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
 	output	logic  m_axis_output_tvalid,
-	input		logic  m_axis_output_tready
+	input	logic  m_axis_output_tready
 );
 
 //-------------------- Parameter sanity checks --------------------\\
@@ -121,13 +121,13 @@ module mvu_8sx9_axi #(
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-	//-------------------- Input control --------------------\\
+//-------------------- Input control --------------------\\
 	uwire en;
 	uwire istb = avld && s_axis_weights_tvalid;
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-	//-------------------- Core MVU --------------------\\
+//-------------------- Core MVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][57:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
@@ -138,7 +138,7 @@ module mvu_8sx9_axi #(
 		.vld(ovld), .p(odat)
 	);
 
-	//-------------------- Output register slice --------------------\\
+//-------------------- Output register slice --------------------\\
 	struct {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
@@ -148,7 +148,7 @@ module mvu_8sx9_axi #(
 
 	uwire  b_load;
 	always_ff @(posedge clk) begin
-		if(rst)  A <= '{ vld: 0, default: 'x };
+		if(rst)		A <= '{ vld: 0, default: 'x };
 		else if(!A.vld || b_load) begin
 			A.vld <= ovld && en;
 			for(int unsigned  i = 0; i < PE; i++) begin
@@ -169,7 +169,7 @@ module mvu_8sx9_axi #(
 	always_ff @(posedge clk) begin
 		if(rst)		B <= '{ default: 'x };
 		else begin
-			if(b_load)	 B <= '{ vld: A.vld, dat: A.dat};
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
 		end	
 	end
 
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
index ea97e0708c..70ffa096ef 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
@@ -33,176 +33,176 @@
 
 module mvu_8sx9_axi_tb();
 
-  //-------------------- Simulation parameters --------------------\\
-  // Matrix & parallelism config
-  localparam int unsigned MW = 600;
-  localparam int unsigned MH = 256;
-  localparam int unsigned SIMD = 60;
-  localparam int unsigned PE = 16;
-  localparam int unsigned SEGMENTLEN = 4;
-  // Bit-width config  
-  localparam int unsigned ACTIVATION_WIDTH = 8;
-  localparam int unsigned WEIGHT_WIDTH = 4;
-  localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-  localparam bit SIGNED_ACTIVATIONS = 1;
-  // Simulation constants  
-  localparam int unsigned NF = MH/PE;
-  localparam int unsigned SF = MW/SIMD;
-  localparam int unsigned NUM_OF_DSP = SIMD/3;
-  localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-  localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-  localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-  localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-  localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-  // Generate clk and reset signal   
-  logic clk = 0;
-  always #5ns clk = !clk;
-  
-  logic ap_rst_n = 0;
-  initial begin
-    repeat(16) @(posedge clk);
-    ap_rst_n <= 1;
-  end
-
-  uwire ap_clk = clk;
-
-  // Generate activations  
-  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-  typedef activation_t activation_vector_t[SF];
-    
-  function activation_vector_t init_ACTIVATIONS;
-    automatic activation_vector_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_ACTIVATIONS
-
-  activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-  struct {
-    activation_t dat;
-    logic vld;
-    logic rdy;
-  } activations;
-
-  initial begin
-    activations.vld = 0;
-    activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-    @(posedge clk iff ap_rst_n);
-
-    for (int i=0; i<SF; i++) begin
-      activations.dat <= ACTIVATIONS[i];
-      do begin 
-        activations.vld = $urandom()%7 > 1;
-        @(posedge clk);
-      end while (!(activations.vld === 1 && activations.rdy === 1));
-    end
-    
-    activations.vld <= 0;
-    activations.dat <= 'x;
-  end
-   
-  // Generate weights   
-  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-  typedef weight_t weight_matrix_t[NF][SF]; 
-  
-  function weight_matrix_t init_WEIGHTS;
-    automatic weight_matrix_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_WEIGHTS;
-
-  weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-  struct {
-    weight_t dat;
-    logic vld;
-    logic rdy;
-  } weights;
-
-  initial begin
-    weights.vld = 0;
-    weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-    @(posedge clk iff ap_rst_n);
-
-    weights.vld <= 1;
-    for (int i=0; i<NF; i++) begin
-      for (int j=0; j<SF; j++) begin
-        weights.dat <= WEIGHTS[i][j];
-        @(posedge clk iff weights.rdy);
-      end
-    end
-
-    weights.vld <= 0;
-    weights.dat <= 'x;
-  end
-  
-  // Function to compute golden output  
-  // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-  // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-  typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-  typedef output_t output_vector_t [NF];
-
-  struct {
-    output_t dat;
-    logic vld;
-    logic rdy;
-  } outputs;
-
-  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-    automatic output_vector_t res = '{default: 0};
-    for (int j = 0; j<MH; j++) begin
-      for (int i = 0; i<MW; i++) begin
-        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-      end
-    end  
-    return res;
-  endfunction : check_output;
-
-  output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-  int unsigned NF_CNT = 0;
-  initial begin
-    outputs.rdy = 0;
-    while (NF_CNT < NF) begin
-      // Loop until both rdy & vld are asserted
-      do begin
-        outputs.rdy <= $urandom()%7 >= 1;
-        @(posedge clk iff ap_rst_n);
-      end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-      // Compare produced outputs against golden outputs
-      foreach(outputs.dat[i]) begin
-        assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-        else begin 
-          $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-          $stop;
-        end  
-      end
-      
-      NF_CNT += 1;
-    end
-    
-    $finish;  
-  end
-
-  // Instantiate DUT
-  mvu_8sx9_axi #(
-      .MW(MW),
-      .MH(MH),
-      .PE(PE),
-      .SIMD(SIMD),
-      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-      .WEIGHT_WIDTH(WEIGHT_WIDTH),
-      .ACCU_WIDTH(ACCU_WIDTH),
-      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-      .SEGMENTLEN(SEGMENTLEN)
-    )
-    dut (
-      .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-      .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-      .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-      .m_axis_output_tready(outputs.rdy)
-    );
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MW = 600;
+	localparam int unsigned MH = 256;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned PE = 16;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants  
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal   
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations  
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin 
+				activations.vld = $urandom()%7 > 1;
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights   
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF]; 
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output  
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 1;
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin 
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end  
+			end
+			
+			NF_CNT += 1;
+		end
+
+		$finish;  
+	end
+
+	// Instantiate DUT
+	mvu_8sx9_axi #(
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
   
 endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index ff3779d211..2456eb3a47 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -33,7 +33,7 @@
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	MW = $MW$,
-	parameter		MH = $MH$,
+	parameter	MH = $MH$,
 	parameter 	PE = $PE$,
 	parameter 	SIMD = $SIMD$,
 	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
@@ -44,29 +44,29 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
 
 	// Safely deducible parameters
-	parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter OUTPUT_LANES = PE,
-	parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  // Global Control
-	input		logic  ap_clk,
-	input		logic  ap_rst_n,
+  	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
 
 	// Weight Stream
-	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input		logic  s_axis_weights_tvalid,
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
 	output	logic  s_axis_weights_tready,
 
 	// Input Stream
-	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input		logic  s_axis_input_tvalid,
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
 	output	logic  s_axis_input_tready,
 
 	// Output Stream
 	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
 	output	logic  m_axis_output_tvalid,
-	input		logic  m_axis_output_tready
+	input	logic  m_axis_output_tready
 );
 
 mvu_8sx9_axi #(
diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
index ea3ecbbd70..adf6a8f9c2 100644
--- a/finn-rtllib/mvu/mvu_8sx9_tb.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv
@@ -33,133 +33,133 @@
 
 module mvu_8sx9_tb();
 
-  //-------------------- Simulation parameters --------------------\\
-  // Matrix & parallelism config
-  localparam int unsigned MH = 256;
-  localparam int unsigned PE = 16;
-  localparam int unsigned MW = 600;
-  localparam int unsigned SIMD = 60;
-  localparam int unsigned SEGMENTLEN = 4;
-  // Bit-width config  
-  localparam int unsigned ACTIVATION_WIDTH = 8;
-  localparam int unsigned WEIGHT_WIDTH = 4;
-  localparam bit SIGNED_ACTIVATIONS = 1;
-  // Simulation constants
-  localparam int unsigned NF = MH/PE;
-  localparam int unsigned SF = MW/SIMD;
-  localparam int unsigned NUM_OF_DSP = SIMD/3;
-  
-  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-  typedef activation_t activation_vector_t[SF];
-
-  function activation_vector_t init_ACTIVATIONS;
-    automatic activation_vector_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_ACTIVATIONS
-
-  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-  typedef weight_t weight_matrix_t[NF][SF];
-  
-  function weight_matrix_t init_WEIGHTS;
-    automatic weight_matrix_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_WEIGHTS;
-  
-  typedef logic signed [PE-1:0][57:0] output_t;
-  typedef output_t output_vector_t [NF];
-
-  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-    automatic output_vector_t res = '{default: 0};
-    for (int j = 0; j<MH; j++) begin
-      for (int i = 0; i<MW; i++) begin
-        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-      end
-    end  
-    return res;
-  endfunction : check_output;
-  
-  logic clk = 0;
-  always #5ns clk = !clk;
-  
-  logic rst;
-  initial begin
-    rst = 1;
-    repeat(16) @(posedge clk);
-    rst <= 0;
-  end
-   
-  logic last;
-  logic zero;
-  logic vld;
-  activation_t a;
-  weight_t w;
-  output_t p;
-  // Reference signals
-  activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-  weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-  output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-  // Counter for number of outputs (NF dimension) that are produced
-  int NF_CNT = 0;
-  
-  initial begin
-    ACTIVATIONS = init_ACTIVATIONS();
-    WEIGHTS = init_WEIGHTS();
-    GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-    last = 0;
-    zero = 0;
-    a = 'x;
-    w = 'x;
-    
-    @(posedge clk iff !rst);
-
-    for (int j=0; j<NF; j++) begin
-      for (int i=0; i<SF; i++) begin
-        last <= (i==SF-1) ? 1 : 0;
-        a <= ACTIVATIONS[i];
-        w <= WEIGHTS[j][i];
-        @(posedge clk iff en);
-      end
-    end
-
-    last <= 0;
-    zero <= 1;  
-
-    // Continue until all NF outputs are produced & compared
-    @(posedge clk && (NF_CNT==NF));
-
-    $finish;
-  end
-
-  logic en = 0;
-  always_ff @(posedge clk) begin
-    en <= ($urandom()%7 > 1) && !rst;
-  end
-
-  // Compare computed output against golden output when vld flag is raised by DUT
-  always_ff @(posedge clk iff (vld && en)) begin
-    foreach(p[i]) begin
-      assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-      else begin 
-        $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-        $stop;
-      end  
-    end
-    NF_CNT += 1;
-  end
-
-  // Instantiate DUT
-  mvu_8sx9 #(
-      .PE(PE),
-      .SIMD(SIMD),
-      .WEIGHT_WIDTH(WEIGHT_WIDTH),
-      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-      .SEGMENTLEN(SEGMENTLEN)
-    )
-    dut (
-      .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-    );
-  
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MH = 256;
+	localparam int unsigned PE = 16;
+	localparam int unsigned MW = 600;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	typedef logic signed [PE-1:0][57:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic rst;
+	initial begin
+		rst = 1;
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	logic last;
+	logic zero;
+	logic vld;
+	activation_t a;
+	weight_t w;
+	output_t p;
+	// Reference signals
+	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+	// Counter for number of outputs (NF dimension) that are produced
+	int NF_CNT = 0;
+
+	initial begin
+		ACTIVATIONS = init_ACTIVATIONS();
+		WEIGHTS = init_WEIGHTS();
+		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+		last = 0;
+		zero = 0;
+		a = 'x;
+		w = 'x;
+
+		@(posedge clk iff !rst);
+
+		for (int j=0; j<NF; j++) begin
+			for (int i=0; i<SF; i++) begin
+				last <= (i==SF-1) ? 1 : 0;
+				a <= ACTIVATIONS[i];
+				w <= WEIGHTS[j][i];
+				@(posedge clk iff en);
+			end
+		end
+
+		last <= 0;
+		zero <= 1;  
+
+		// Continue until all NF outputs are produced & compared
+		@(posedge clk && (NF_CNT==NF));
+
+		$finish;
+	end
+
+	logic en = 0;
+	always_ff @(posedge clk) begin
+		en <= ($urandom()%7 > 1) && !rst;
+	end
+
+	// Compare computed output against golden output when vld flag is raised by DUT
+	always_ff @(posedge clk iff (vld && en)) begin
+		foreach(p[i]) begin
+			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+			else begin 
+				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				$stop;
+			end  
+		end
+		NF_CNT += 1;
+	end
+
+	// Instantiate DUT
+	mvu_8sx9 #(
+		.PE(PE),
+		.SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+	);
+
 endmodule

From 5e61f42afd991233153ee8b7fe0fb6e9e8ac562d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 08:54:45 +0100
Subject: [PATCH 009/123] [rtl custom op]: fix to indentation

---
 finn-rtllib/mvu/mvu_8sx9_axi.sv | 54 ++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
index 6c7eaeaeca..5f215927d8 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -32,25 +32,25 @@
  *****************************************************************************/
 
 module mvu_8sx9_axi #(
-    int unsigned MW,
-    int unsigned MH,
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-    int unsigned ACCU_WIDTH,
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0,
+	int unsigned MW,
+	int unsigned MH,
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,
 	parameter RAM_STYLE = "auto",
 
-    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
-    localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = MH/PE,
-    localparam int unsigned OUTPUT_LANES = PE,
-    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned OUTPUT_LANES = PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -76,31 +76,31 @@ module mvu_8sx9_axi #(
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
 		if (MW % SIMD != 0) begin
-		$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-		$finish;
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
 		end
 		if (MH % PE != 0) begin
-		$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-		$finish;
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
 		end
 		if (ACTIVATION_WIDTH > 9) begin
-		$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-		$finish;
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
 		end
 		if (WEIGHT_WIDTH > 8) begin
-		$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-		$finish;
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
 		end
 		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-		$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-		$finish;
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
 		end
 		if (SEGMENTLEN == 0) begin
-		$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
 		end
 		if (SEGMENTLEN > (SIMD+2)/3) begin
-		$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-		$finish;
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
 		end
 	end
 

From cbee193d746763044a870bdf1af248bbe8d31156 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 14:33:13 +0100
Subject: [PATCH 010/123] [rtl custom-op]: minor changes for compiler
 integration

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index 2456eb3a47..502a72d3f2 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -41,7 +41,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
 	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
 
 	// Safely deducible parameters
 	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -85,6 +85,6 @@ mvu_8sx9_axi #(
 	.m_axis_output_tdata(m_axis_output_tdata),
 	.m_axis_output_tvalid(m_axis_output_tvalid),
 	.m_axis_output_tready(m_axis_output_tready)
-)
+);
 
-endmodule : mvau_8sx9_axi_wrapper
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From ba5e77bde008fff2a445d6ef469072dd67f67f42 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:26:05 +0100
Subject: [PATCH 011/123] [rtl custom op]: moved testbenches to separate
 directory

---
 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++
 finn-rtllib/mvu/tb/mvu_axi_tb.sv  | 213 ++++++++++++++++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
 create mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..c8bfe5370a
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb();
+// Self-checking testbench: streams random operands through mvu_8sx9 and compares each result against a behavioral golden model.
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MH = 256;
+	localparam int unsigned PE = 16;
+	localparam int unsigned MW = 600;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;	// number of PE-row groups; one output_t is checked per group
+	localparam int unsigned SF = MW/SIMD;	// number of SIMD-column groups fed per output
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+	// Returns a fully randomized activation vector (SF words of SIMD elements each).
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+	// Returns a fully randomized weight matrix (NF x SF tiles of PE x SIMD elements each).
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	typedef logic signed [PE-1:0][57:0] output_t;
+	typedef output_t output_vector_t [NF];
+	// Golden model: signed dot product of matrix row j with the activations, stored at res[j/PE][j%PE]. Does not consult SIGNED_ACTIVATIONS.
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end
+		return res;
+	endfunction : check_output;
+
+	logic clk = 0;
+	always #5ns clk = !clk;	// 10 ns clock period
+
+	logic rst;
+	initial begin
+		rst = 1;
+		repeat(16) @(posedge clk);	// hold reset for 16 rising clock edges
+		rst <= 0;
+	end
+
+	logic last;
+	logic zero;
+	logic vld;
+	activation_t a;
+	weight_t w;
+	output_t p;
+	// Reference signals
+	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+	// Counter for number of outputs (NF dimension) that are produced
+	int NF_CNT = 0;
+
+	logic en = 0;	// random clock enable; declared before the stimulus process below, which waits on it
+	always_ff @(posedge clk) begin
+		en <= ($urandom()%7 > 1) && !rst;	// forced low during reset, otherwise high for 5 of the 7 residues
+	end
+
+	initial begin
+		ACTIVATIONS = init_ACTIVATIONS();
+		WEIGHTS = init_WEIGHTS();
+		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+		last = 0;
+		zero = 0;
+		a = 'x;
+		w = 'x;
+
+		@(posedge clk iff !rst);	// wait for reset release
+
+		for (int j=0; j<NF; j++) begin
+			for (int i=0; i<SF; i++) begin
+				last <= (i==SF-1) ? 1 : 0;	// flag the final SIMD word of each output row group
+				a <= ACTIVATIONS[i];
+				w <= WEIGHTS[j][i];
+				@(posedge clk iff en);	// advance only on cycles where the enable is high
+			end
+		end
+
+		last <= 0;
+		zero <= 1;	// NOTE(review): presumably marks the inputs as idle while the pipeline drains - confirm against mvu_8sx9
+
+		// Continue until all NF outputs are produced & compared
+		@(posedge clk && (NF_CNT==NF));
+
+		$finish;
+	end
+
+	// Compare computed output against golden output when vld flag is raised by DUT
+	always_ff @(posedge clk iff (vld && en)) begin
+		foreach(p[i]) begin
+			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+			else begin
+				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				$stop;
+			end
+		end
+		NF_CNT += 1;	// one output_t (all PE lanes) has been checked
+	end
+
+	// Instantiate DUT
+	mvu_8sx9 #(
+		.PE(PE),
+		.SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+	);
+
+endmodule : mvu_8sx9_tb
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
new file mode 100644
index 0000000000..08a349da84
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -0,0 +1,213 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_axi_tb();
+// Self-checking testbench: streams random activations/weights into mvu_axi with randomised handshakes and compares every output beat with a golden model.
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MW = 90; // matrix width
+	localparam int unsigned MH = 16; // matrix height
+	localparam int unsigned SIMD = 9; // activation elements per input beat (MW must be a multiple of it)
+	localparam int unsigned PE = 4; // output elements per output beat (MH must be a multiple of it)
+	localparam int unsigned SEGMENTLEN = 1;
+	localparam string MVU_IMPL_STYLE = "mvu_8sx9"; // selects the compute core instantiated inside mvu_axi
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); // product width plus growth from summing MW terms
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE; // output beats per matrix-vector product
+	localparam int unsigned SF = MW/SIMD; // input beats per activation vector
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; // "_BA" widths are rounded up to a whole number of bytes
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; // number of zero padding bits prepended at the DUT port
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk; // 10 ns clock period
+
+	logic ap_rst_n = 0; // active-low reset, released after 16 rising clock edges
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; // one input beat
+	typedef activation_t activation_vector_t[SF]; // one complete activation vector
+
+	function activation_vector_t init_ACTIVATIONS; // returns a fully randomised activation vector
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct { // signals of the activation input stream
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin // activation stream driver: sends the SF beats once, with randomly inserted idle cycles
+		activations.vld = 0;
+		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 > 1; // nonblocking like dat, so the update cannot race with the DUT sampling tvalid on this clock edge
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1)); // repeat until this beat was actually accepted
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; // one weight beat
+	typedef weight_t weight_matrix_t[NF][SF]; // complete weight matrix, indexed [output beat][input beat]
+
+	function weight_matrix_t init_WEIGHTS; // returns a fully randomised weight matrix
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct { // signals of the weight input stream
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin // weight stream driver: valid is held high and a new beat is presented after each accepted one
+		weights.vld = 0;
+		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy); // wait for the clock edge on which the DUT takes this beat
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; // one output beat
+	typedef output_t output_vector_t [NF]; // complete result vector
+
+	struct { // signals of the output stream
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin // matrix row j lives in output beat j/PE, lane j%PE
+			for (int i = 0; i<MW; i++) begin // matrix column i lives in input beat i/SIMD, lane i%SIMD
+				if (SIGNED_ACTIVATIONS==1)
+					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+				else // unsigned activations: prepend a zero bit so that $signed() keeps the value non-negative
+					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0; // number of output beats checked so far
+	initial begin // output stream sink and checker: applies random back-pressure and ends the simulation after NF beats
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 1;
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop; // halt the simulation on the first mismatch
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;
+	end
+
+	// Instantiate DUT
+	mvu_axi #(
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
+	)
+	dut ( // input data is zero-padded up to the byte-aligned port widths
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), // NOTE(review): outputs.dat is PE*ACCU_WIDTH bits, narrower than the byte-aligned tdata port - confirm the implicit truncation is intended
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : mvu_axi_tb

From 69310b4e6d2ee4bf2e60b236582656fd7f364a6d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:27:50 +0100
Subject: [PATCH 012/123] [rtl custom op]: fixed output width to ACCU_WIDTH

---
 finn-rtllib/mvu/mvu_8sx9.sv | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index d082d4fb2e..5af27ab0ce 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -36,19 +36,25 @@ module mvu_8sx9 #(
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
     int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
   )
   (
-    input   logic clk,
+    // Global Control
+	input   logic clk,
     input   logic rst,
     input   logic en,
+
+	// Input
     input   logic last,
-    input   logic zero,
-    input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a,
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w,
-    output  logic vld,
-    output  logic [PE-1:0][57:0] p 
+    input   logic zero, // ignore current inputs and force this partial product to zero
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+    
+	// Output
+	output  logic vld,
+    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
 
 //-------------------- Declare global signals --------------------\\
@@ -146,7 +152,7 @@ module mvu_8sx9 #(
 			uwire [57:0] pp;
 
 			if (LAST) begin : genPOUT
-				assign p[j] = pp;
+				assign p[j] = pp[ACCU_WIDTH-1:0];
 			end      
 
 			DSP58 #(
@@ -281,4 +287,4 @@ module mvu_8sx9 #(
 		end : genDSPChain  
 	end : genDSPPE
     
-endmodule
+endmodule : mvu_8sx9

From cfcff0040c85a76d7c5a16b2bf1b6b966b62e87d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:29:06 +0100
Subject: [PATCH 013/123] [rtl custom op]: renamed file and added generic to
 switch between compute kernels

---
 finn-rtllib/mvu/mvu_axi.sv | 194 +++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_axi.sv

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
new file mode 100644
index 0000000000..5d8700738f
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -0,0 +1,194 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_axi #( // Wraps an MVU compute core with weight/input/output streams, an activation replay buffer and a two-stage output register slice
+	int unsigned MW, // matrix width, must be a multiple of SIMD
+	int unsigned MH, // matrix height, must be a multiple of PE
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,
+	parameter RAM_STYLE = "auto",
+	parameter MVU_IMPL_STYLE = "mvu_8sx9", // compute core selection: "mvu_8sx9" or "mvu_4sx4u"; defaulted so instantiations that do not override it still elaborate
+	// Derived constants ("_BA" widths are rounded up to a whole number of bytes)
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned NF = MH/PE,
+	localparam int unsigned OUTPUT_LANES = PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n, // active-low reset
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin
+		if (MW % SIMD != 0) begin
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if (MH % PE != 0) begin
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (ACTIVATION_WIDTH > 9) begin
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (SEGMENTLEN == 0) begin
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+		end
+		if (SEGMENTLEN > (SIMD+2)/3) begin // (SIMD+2)/3 is the chain length reported in the message below
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n; // internal logic uses an active-high reset
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; // activation beat without the byte-alignment padding
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( // presumably replays each SF-beat input vector NF times - see replay_buffer
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid; // an activation and a weight beat are both available
+	assign ardy = en && s_axis_weights_tvalid; // each operand stream is only consumed while the other one is valid
+	assign s_axis_weights_tready = en && avld; // and the core is enabled, so both advance in lock-step
+
+//-------------------- Core MVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; // weight beat without the byte-alignment padding
+
+	if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9
+		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+			.clk, .rst, .en,
+			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), // zero is asserted whenever no operand pair is available
+			.vld(ovld), .p(odat)
+		);
+	end
+	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core (
+			.clk, .rst, .en,
+			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.vld(ovld), .p(odat)
+		);
+	end
+	else begin : genUnsupported
+		// Without a core nothing drives ovld/odat, so abort instead of stalling silently
+		initial begin $error("Unrecognized MVU_IMPL_STYLE!"); $finish; end
+	end
+
+//-------------------- Output register slice --------------------\\
+	struct { // first stage: captures the core output
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};
+
+	assign en = !A.vld || !ovld; // stall the core only while it presents an output and stage A is still occupied
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)		A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin // A may load when it is empty or when B takes over its content
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+
+	struct { // second stage: drives the output stream
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};
+
+	assign	b_load = !B.vld || m_axis_output_tready; // B may load when it is empty or its beat is being accepted downstream
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x }; // vld must reset to 0 (as in stage A), otherwise tvalid is undefined after reset
+		else begin
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat; // assignment to the wider byte-aligned port zero-extends the upper bits
+
+endmodule : mvu_axi
\ No newline at end of file

From 72b519691369b9ebc31983a6723485860837e37b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:29:45 +0100
Subject: [PATCH 014/123] [rtl custom op]: renamed file and added generic to
 switch between compute kernels

---
 finn-rtllib/mvu/mvu_axi_wrapper.v | 90 +++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
new file mode 100644
index 0000000000..323d2711e4
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #( // plain-Verilog shell around mvu_axi; $...$ tokens are template placeholders, presumably substituted by the code generator
+	parameter 	MW = $MW$,
+	parameter	MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+	parameter 	MVU_IMPL_STYLE = "mvu_8sx9", // compute core selection forwarded to mvu_axi
+	// Safely deducible parameters
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+  	// Global Control (ports carry no `logic` type: this is a .v file and `logic` is a SystemVerilog keyword)
+	input	ap_clk,
+	input	ap_rst_n,
+
+	// Weight Stream
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
+
+	// Input Stream
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
+
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
+);
+
+mvu_axi #(
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From c068bb65c6a4b877876c5b1278e7b2663b81d8e1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:15:16 +0100
Subject: [PATCH 015/123] [rtl mvu]: added behavioral model DSP58

---
 finn-rtllib/mvu/mvu_8sx9.sv | 343 ++++++++++++++++++++++--------------
 1 file changed, 212 insertions(+), 131 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 5af27ab0ce..2d1da26efb 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -38,7 +38,8 @@ module mvu_8sx9 #(
     int unsigned WEIGHT_WIDTH,
 	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
+    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0
   )
   (
     // Global Control
@@ -70,7 +71,10 @@ module mvu_8sx9 #(
 
 	always_ff @(posedge clk) begin
 		if(rst)     L <= '{default: 0};
-		else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last;
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+		end
 	end  
 	assign vld = L[0];
 
@@ -155,135 +159,212 @@ module mvu_8sx9 #(
 				assign p[j] = pp[ACCU_WIDTH-1:0];
 			end      
 
-			DSP58 #(
-				// Feature Control Attributes: Data Path Selection
-				.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-				.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-				.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-				.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-				.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-													// legacy mode.
-				.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-				.RND(58'h000000000000000),          // Rounding Constant
-				.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-				.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-				.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-				.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-				// Pattern Detector Attributes: Pattern Detection Configuration
-				.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-				.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-				.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-				.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-				.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-				.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-				.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-				// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-				.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-				.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-				.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-				.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-				.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-				.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-									FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
-									2'b01, // Y : M
-									2'b01  // X: M
-				}), // Optional inversion for OPMODE
-				.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-				.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-				.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-				.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-				.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-				.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-				.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-				.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-				.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-				.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-				// Register Control Attributes: Pipeline Register Configuration
-				.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-				.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-				.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-				.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-				.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-				.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-				.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-				.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-				.CREG(0),                           // Pipeline stages for C (0-1)
-				.DREG(0),                           // Pipeline stages for D (0-1)
-				.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-				.MREG(1),                           // Multiplier pipeline stages (0-1)
-				.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-				.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-				.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-			)
-			DSP58_inst (
-				// Cascade outputs: Cascade Ports
-				.ACOUT(),                           // 34-bit output: A port cascade
-				.BCOUT(),                           // 24-bit output: B cascade
-				.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-				.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-				.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
-				// Control outputs: Control Inputs/Status Bits
-				.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-				.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-				.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-				.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-				// Data outputs: Data Ports
-				.CARRYOUT(),                        // 4-bit output: Carry
-				.P(pp),                             // 58-bit output: Primary data
-				.XOROUT(),                          // 8-bit output: XOR data
-				// Cascade inputs: Cascade Ports
-				.ACIN('x),                          // 34-bit input: A cascade data
-				.BCIN('x),                          // 24-bit input: B cascade
-				.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-				.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-				.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
-				// Control inputs: Control Inputs/Status Bits
-				.ALUMODE(4'h0),                     // 4-bit input: ALU control
-				.CARRYINSEL('0),                    // 3-bit input: Carry select
-				.CLK(clk),                          // 1-bit input: Clock
-				.INMODE({
-						INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-						2'b00,
-						TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-						INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
-				}),                                 // 5-bit input: INMODE control
-				.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-				.OPMODE({
-						LAST ? {1'b0, L[1]} : 2'b00,
-						7'b000_0000
-				}), // 9-bit input: Operation mode
-				// Data inputs: Data Ports
-				.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-				.B(b_in_i[j][i]),                   // 24-bit input: B data
-				.C('x),                             // 58-bit input: C data
-				.CARRYIN('0),                       // 1-bit input: Carry-in
-				.D('x),                             // 27-bit input: D data
-				// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-				.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-				.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-				.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-				.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-				.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-				.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-				.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-				.CEC('0),                           // 1-bit input: Clock enable for CREG
-				.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-				.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-				.CED('0),                           // 1-bit input: Clock enable for DREG
-				.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-				.CEM(en),                           // 1-bit input: Clock enable for MREG
-				.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-				.RSTA(rst),                         // 1-bit input: Reset for AREG
-				.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-				.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-				.RSTB(rst),                         // 1-bit input: Reset for BREG
-				.RSTC('0),                          // 1-bit input: Reset for CREG
-				.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-				.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-				.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-				.RSTM(rst),                         // 1-bit input: Reset for MREG
-				.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-			);
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav	// behavioral replacement for the DSP58 instance in genDSP
+				// Stage #1: A/B input registers (INTERNAL_PREGS stages each, mirroring AREG/BREG)
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[i] };	// upper 7 bits are don't-care, same as the .A() port binding
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[j][i];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply (Mreg) - sum of three signed 9-bit (A) x 8-bit (B) lane products
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0;	// registered copy of the zero flag; forces the products to 0
+				always_ff @(posedge clk) begin
+					if (rst)		InmodeZero <= 0;
+					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst)	Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin
+							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+						end
+						Mreg <= m;
+					end
+				end
+
+				// Stage #3: Accumulate (Preg) - adds the upstream cascade value; the LAST element also accumulates over cycles
+				logic signed [57:0] Preg;
+				logic Opmode = 0;	// registered L[1]; when set, the running sum in Preg is dropped (only driven in genLast)
+				if (FIRST && !LAST) begin : genFirst
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg;
+						end
+					end
+					else	assign Preg = Mreg;
+				end
+				else if (LAST) begin : genLast	// also selected when FIRST && LAST (chain of a single element)
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst)		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + (FIRST ? '0 : pcout[j][i-1]);	// FIRST has no upstream element: never read pcout[j][-1]
+					end
+				end
+				else begin : genMid
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg + pcout[j][i-1];
+						end
+					end
+					else	assign Preg = Mreg + pcout[j][i-1];
+				end
+				assign pp = Preg;
+				assign pcout[j][i] = pp;	// feed the next chain element, as the PCOUT port does in genDSP
+			end : genBehav
+
+			else begin: genDSP
+				DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+														// legacy mode.
+					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+					.RND(58'h000000000000000),          // Rounding Constant
+					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+										2'b01, // Y : M
+										2'b01  // X: M
+					}), // Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                           // Pipeline stages for C (0-1)
+					.DREG(0),                           // Pipeline stages for D (0-1)
+					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+					.MREG(1),                           // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+				)
+				DSP58_inst (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),                           // 34-bit output: A port cascade
+					.BCOUT(),                           // 24-bit output: B cascade
+					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+					.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+					// Data outputs: Data Ports
+					.CARRYOUT(),                        // 4-bit output: Carry
+					.P(pp),                             // 58-bit output: Primary data
+					.XOROUT(),                          // 8-bit output: XOR data
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),                          // 34-bit input: A cascade data
+					.BCIN('x),                          // 24-bit input: B cascade
+					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+					.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+					// Control inputs: Control Inputs/Status Bits
+					.ALUMODE(4'h0),                     // 4-bit input: ALU control
+					.CARRYINSEL('0),                    // 3-bit input: Carry select
+					.CLK(clk),                          // 1-bit input: Clock
+					.INMODE({
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+							2'b00,
+							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+					}),                                 // 5-bit input: INMODE control
+					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+					.OPMODE({
+							LAST ? {1'b0, L[1]} : 2'b00,
+							7'b000_0000
+					}), // 9-bit input: Operation mode
+					// Data inputs: Data Ports
+					.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+					.B(b_in_i[j][i]),                   // 24-bit input: B data
+					.C('x),                             // 58-bit input: C data
+					.CARRYIN('0),                       // 1-bit input: Carry-in
+					.D('x),                             // 27-bit input: D data
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),                           // 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),                           // 1-bit input: Clock enable for DREG
+					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+					.CEM(en),                           // 1-bit input: Clock enable for MREG
+					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+					.RSTA(rst),                         // 1-bit input: Reset for AREG
+					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+					.RSTB(rst),                         // 1-bit input: Reset for BREG
+					.RSTC('0),                          // 1-bit input: Reset for CREG
+					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+					.RSTM(rst),                         // 1-bit input: Reset for MREG
+					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+				);
+			end : genDSP
 		end : genDSPChain  
 	end : genDSPPE
     

From 18f94e7ab03a3034083680faa91a80359858589e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:18:58 +0100
Subject: [PATCH 016/123] [rtl mvu]: extended flow control wrapper with
 additional compute core and other minor changes

---
 finn-rtllib/mvu/mvu_axi.sv        | 51 +++++++++++++++++++------------
 finn-rtllib/mvu/mvu_axi_wrapper.v | 48 ++++++++++++++---------------
 2 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index 5d8700738f..e4a919ba88 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -41,8 +41,8 @@ module mvu_axi #(
 	int unsigned ACCU_WIDTH,
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
-	parameter RAM_STYLE = "auto",
-	parameter MVU_IMPL_STYLE,
+	bit FORCE_BEHAVIORAL = 0,
+	string MVU_IMPL_STYLE,
 
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
@@ -96,12 +96,14 @@ module mvu_axi #(
 			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
 			$finish;
 		end
-		if (SEGMENTLEN == 0) begin
-			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
-		end
-		if (SEGMENTLEN > (SIMD+2)/3) begin
-			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			$finish;
+		if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin	// must match the genMVU8sx9 selector; only that core takes SEGMENTLEN
+			if (SEGMENTLEN == 0) begin
+				$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+			end
+			if (SEGMENTLEN > (SIMD+2)/3) begin	// (SIMD+2)/3 is the chain length reported in the message below
+				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+				$finish;
+			end
 		end
 	end
 
@@ -116,7 +118,7 @@ module mvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
 		.clk, .rst,
 		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
@@ -133,28 +135,37 @@ module mvu_axi #(
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	
-	if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9
+	if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9
 		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
 	end
 	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core (
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
 	end
-	//else begin
-	//	$error("Unrecognized MVU_IMPL_STYLE!");
-	//	$finish;
-	//end
+	else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u
+		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.vld(ovld), .p(odat)
+		);
+	end
+	else initial begin
+		$error("Unrecognized MVU_IMPL_STYLE!");
+		$finish;
+	end
 
 //-------------------- Output register slice --------------------\\
-	struct {
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
 	} A = '{ vld: 0, default: 'x};
@@ -175,7 +186,7 @@ module mvu_axi #(
 		end
 	end
 	
-	struct {
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
 	} B = '{ vld: 0, default: 'x};
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index 323d2711e4..b79ba6bbd1 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -41,7 +41,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
 	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+	parameter	MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$",
+	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
 	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -50,41 +51,38 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
   	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
+	input	ap_clk,
+	input	ap_rst_n,
 	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
+	input   weights_V_TVALID,
+	output  weights_V_TREADY,
 	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
+	input	in0_V_TVALID,
+	output	in0_V_TREADY,
 	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
+	output	out_V_TVALID,
+	input	out_V_TREADY
 );
 
 mvu_axi #(
 	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
 	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+	.SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
 	) inst (
 	.ap_clk(ap_clk),
 	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(s_axis_weights_tdata),
-	.s_axis_weights_tvalid(s_axis_weights_tvalid),
-	.s_axis_weights_tready(s_axis_weights_tready),
-	.s_axis_input_tdata(s_axis_input_tdata),
-	.s_axis_input_tvalid(s_axis_input_tvalid),
-	.s_axis_input_tready(s_axis_input_tready),
-	.m_axis_output_tdata(m_axis_output_tdata),
-	.m_axis_output_tvalid(m_axis_output_tvalid),
-	.m_axis_output_tready(m_axis_output_tready)
+	.s_axis_weights_tdata(weights_V_TDATA),
+	.s_axis_weights_tvalid(weights_V_TVALID),
+	.s_axis_weights_tready(weights_V_TREADY),
+	.s_axis_input_tdata(in0_V_TDATA),
+	.s_axis_input_tvalid(in0_V_TVALID),
+	.s_axis_input_tready(in0_V_TREADY),
+	.m_axis_output_tdata(out_V_TDATA),
+	.m_axis_output_tvalid(out_V_TVALID),
+	.m_axis_output_tready(out_V_TREADY)
 );
 
 endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From 6d4a0a764e0e6ded16d7034e0d69f5408c76ca75 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:22:51 +0100
Subject: [PATCH 017/123] [rtl mvu]: fix to done_len flag when SIMD dimension
 fully unrolled and PyVerilator-related syntax change

---
 finn-rtllib/mvu/replay_buffer.sv | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 685ac03137..89bbbdb88f 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -35,8 +35,7 @@
 module replay_buffer #(
 	int unsigned  LEN,	// Sequence length
 	int unsigned  REP,	// Sequence replay count
-	int unsigned  W,	// Data width
-	parameter RAM_STYLE = "auto" 	// ram style for buffer {block, distributed, ultra, auto}
+	int unsigned  W 	// Data width
 )(
 	input	logic  clk,
 	input	logic  rst,
@@ -54,7 +53,7 @@ module replay_buffer #(
 
 	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
 	count_t  Count = 0;
-	uwire  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
+	uwire  done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
 	uwire  done_rep;
 	uwire  done_all = done_len && done_rep;
 
@@ -83,7 +82,6 @@ module replay_buffer #(
 		end
 		assign	first_rep = FirstRep;
 
-		(* RAM_STYLE = RAM_STYLE *)
 		data_t  Buf[LEN];
 		if(LEN == 1) begin : genTrivial
 			always_ff @(posedge clk) begin
@@ -92,7 +90,10 @@ module replay_buffer #(
 		end : genTrivial
 		else begin : genShift
 			always_ff @(posedge clk) begin
-				if(shift)  Buf <= { odat, Buf[0:LEN-2] };
+				if(shift) begin
+					Buf[0] <= odat;
+					Buf[1:LEN-1] <= Buf[0:LEN-2];
+				end
 			end
 		end : genShift
 

From 90c547d54756aed2aa101862fb6f55c05149173c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:23:22 +0100
Subject: [PATCH 018/123] [rtl mvu tb]: updated testbench

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index 08a349da84..ef5fa7d682 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -35,17 +35,18 @@ module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam int unsigned MW = 90;
-	localparam int unsigned MH = 16;
-	localparam int unsigned SIMD = 9;
-	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 1;
-	localparam string MVU_IMPL_STYLE = "mvu_8sx9";
+	localparam int unsigned MW = 50;
+	localparam int unsigned MH = 8;
+	localparam int unsigned SIMD = 10;
+	localparam int unsigned PE = 2;
+	localparam int unsigned SEGMENTLEN = 2;
+	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
+	localparam bit FORCE_BEHAVIORAL = 1;
 	// Bit-width config  
 	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 8;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
+	localparam bit SIGNED_ACTIVATIONS = 0;
 	// Simulation constants  
 	localparam int unsigned NF = MH/PE;
 	localparam int unsigned SF = MW/SIMD;
@@ -94,7 +95,7 @@ module mvu_axi_tb();
 		for (int i=0; i<SF; i++) begin
 			activations.dat <= ACTIVATIONS[i];
 			do begin 
-				activations.vld = $urandom()%7 > 1;
+				activations.vld = $urandom()%7 >= 1;
 				@(posedge clk);
 			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
@@ -201,6 +202,7 @@ module mvu_axi_tb();
 		.ACCU_WIDTH(ACCU_WIDTH),
 		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
 		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
 		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
 	)
 	dut (

From 0c37f1f7bed1143833649accceb59bd6821bed3c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:25:10 +0100
Subject: [PATCH 019/123] [builder]: added specialize_to_rtl step and changed
 standalone threshold layers to be by default true

---
 src/finn/builder/build_dataflow_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 4c3e4ff899..24940489df 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -121,6 +121,7 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
+    "step_specialize_to_rtl",
     "step_hls_codegen",
     "step_hls_ipgen",
     "step_set_fifo_depths",
@@ -233,7 +234,7 @@ class DataflowBuildConfig:
     #: activations in FINN) will be implemented as stand-alone HLS layers,
     #: instead of being part of MatrixVectorActivation layer. This gives larger
     #: flexibility, and makes it possible to have runtime-writable thresholds.
-    standalone_thresholds: Optional[bool] = False
+    standalone_thresholds: Optional[bool] = True
 
     #: (Optional) Whether optimizations that minimize the bit width of the
     #: weights and accumulator will be applied. Because this optimization relies

From 5ccb016a640dbed6818a9f1f3ef46136ce949c0d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:26:03 +0100
Subject: [PATCH 020/123] [builder]: added specialize_to_rtl step

---
 src/finn/builder/build_dataflow_steps.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index e43a29d632..3e4d047a51 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -123,6 +123,7 @@
 )
 from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 
 
 def verify_step(
@@ -483,6 +484,16 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
     return model
 
 
+def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible."""
+    specialize_to_rtl_transforms = [
+        to_rtl.InferRTLMatrixVectorActivation()
+    ]
+    for trn in specialize_to_rtl_transforms:
+        model = model.transform(trn)
+    return model
+    
+
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
     if cfg.minimize_bit_width:
@@ -855,6 +866,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_apply_folding_config": step_apply_folding_config,
     "step_minimize_bit_width": step_minimize_bit_width,
     "step_generate_estimate_reports": step_generate_estimate_reports,
+    "step_specialize_to_rtl": step_specialize_to_rtl,
     "step_hls_codegen": step_hls_codegen,
     "step_hls_ipgen": step_hls_ipgen,
     "step_set_fifo_depths": step_set_fifo_depths,

From f099f4bbfd01b628a89c6099f637a4a85a8158ca Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:26:44 +0100
Subject: [PATCH 021/123] [custom op]: added custom op
 MatrixVectorActivation_rtl

---
 src/finn/custom_op/fpgadataflow/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 56d4230a3a..19c0ddd999 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -49,6 +49,7 @@
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
+from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl
 from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
@@ -70,6 +71,7 @@
 custom_op["DownSampler"] = DownSampler
 custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["MatrixVectorActivation"] = MatrixVectorActivation
+custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
 custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl

From 9a3b0fdc54f8c7c1b541c8cfdaaf6e96315da092 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:28:34 +0100
Subject: [PATCH 022/123] [custom op]: added additional attribute to enable
 conversion to RTL (custom-op)

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index aa987384dd..e54abb0c3f 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -70,7 +70,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
@@ -125,6 +125,8 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # Preferred implementation style: "rtl" (RTL-based, the default) or "hls" (HLS-based)
+            "impl": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From 38aa930baa1296a7099f9df22e3d0d000c8d5a05 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:30:15 +0100
Subject: [PATCH 023/123] [custom op]: modified ip-stitching and code
 generation

---
 .../matrixvectoractivation_rtl.py             | 231 ++++++++++--------
 1 file changed, 127 insertions(+), 104 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index c8a0aa675b..6b1c2f3be7 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
+from shutil import copy
 import numpy as np
 import os
 import textwrap
@@ -45,6 +46,12 @@
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
 
 from . import templates
 
@@ -60,8 +67,8 @@ class MatrixVectorActivation_rtl(HLSCustomOp):
     """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
     function."""
 
-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
         self.decoupled_wrapper = templates.decoupled_wrapper
 
     def get_nodeattr_types(self):
@@ -78,11 +85,6 @@ def get_nodeattr_types(self):
             "outputDataType": ("s", True, ""),
             # FINN DataType for accumulator -- auto-computed and updated
             "accDataType": ("s", False, "INT32"),
-            # use xnor-popcount for binary weights/inputs, thus treating them
-            # as bipolar
-            "binaryXnorMode": ("i", False, 0, {0, 1}),
-            # no-activation mode (produce accumulators)
-            "noActivation": ("i", False, 0, {0, 1}),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -105,16 +107,6 @@ def get_nodeattr_types(self):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
-            # FPGA resource type for threshold memories (if noActivation is False)
-            # auto -- let Vivado decide
-            # block -- use BRAM
-            # distributed -- use LUTRAM
-            "ram_style_thresholds": (
-                "s",
-                False,
-                "auto",
-                {"auto", "block", "distributed"},
-            ),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -125,6 +117,8 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -142,7 +136,6 @@ def calc_wmem(self):
 
     def calc_tmem(self):
         """Calculates and returns TMEM."""
-        assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer"
         return 0
 
     def make_shape_compatible_op(self, model):
@@ -192,27 +185,9 @@ def verify_node(self):
                 """The required MatrixVectorActivation attributes do not exist."""
             )
 
-        # verify the number of inputs depending on noActivation value
-        # check noActivation value to determine the number of inputs
-        no_act = self.get_nodeattr("noActivation")
-
-        if no_act == 1:
-            if len(self.onnx_node.input) == 2:
-                info_messages.append("The number of inputs is correct")
-            else:
-                info_messages.append(
-                    """RTL-based MatrixVectorActivation needs in no
-                            activation mode 2 inputs (data input and weights)"""
-                )
-        elif no_act == 0:
-            info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer")
-        else:
-            info_messages.append(
-                """noActivation attribute contains {} should
-                be 1 for RTL-based MatrixVectorActivation""".format(
-                    no_act
-                )
-            )
+        num_of_inputs = len(self.onnx_node.input)
+        if num_of_inputs!=2:
+            info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input)))
 
         mem_mode = self.get_nodeattr("mem_mode")
 
@@ -221,6 +196,7 @@ def verify_node(self):
 
         return info_messages
 
+# TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -242,6 +218,7 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
+# TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -268,7 +245,7 @@ def bram_estimation(self):
         ):
             return 0
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
-        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        # assuming decoupled (RTL) memory
         if mem_width == 1:
             return math.ceil(omega / 16384)
         elif mem_width == 2:
@@ -282,6 +259,7 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
+# TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -294,6 +272,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
+# TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -308,7 +287,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -348,23 +327,14 @@ def lut_estimation(self):
         # accumulator
         acc_bits = W + A + np.ceil(math.log(MW, 2))
         acc_luts = acc_bits
-        # thresholds and threshold comparators
-        thr_luts = 0
-        comp_luts = 0
-        noact = self.get_nodeattr("noActivation")
-        if noact == 0:
-            odt = self.get_output_datatype()
-            B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
-            comp_luts = (2**B - 1) * acc_bits
 
         return int(
             c0
-            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
+            + c1 * (P * (mult_luts + addertree_luts + acc_luts))
             + c2
         )
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -380,7 +350,7 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -389,6 +359,7 @@ def get_exp_cycles(self):
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
+        # Actual exp_cycles is probably slightly larger: say 3 cycles (DSP A/B, M and P registers) plus additional pipeline buffer cycles; most probably <10 extra cycles in total
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -413,7 +384,7 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
@@ -431,8 +402,8 @@ def get_weightstream_width(self):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
+            assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             w_width = pe * simd * wp
-            assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             return w_width
         else:
             return 0
@@ -544,10 +515,8 @@ def minimize_accumulator_width(self, model):
                 adt = DataType.get_smallest_possible(-acc_max - 1)
         else:
             adt = DataType.get_smallest_possible(acc_max)
-        # ensure a datatype divisible by 8-bits in case this is the last node
-        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
-        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
-        adt = DataType[new_adt_name]
+        # Note: we are interested in simply the width of the output dot product.
+        # Padding the actual output stream to a multiple of 8-bits is done in the RTL component
         self.set_nodeattr("accDataType", adt.name)
         # for no-activation nodes, output dt = acc dt
         self.set_nodeattr("outputDataType", adt.name)
@@ -588,7 +557,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
                 1, -1, pe * simd
             )
             weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
-            if weight_file_mode == "decoupled_verilog_dat":
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
                 # convert weight values into hexstring
                 weight_width = self.get_weightstream_width()
                 # pad to nearest 4 bits to get hex strings
@@ -638,7 +610,7 @@ def generate_params(self, model, path):
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            # also save weights as Verilog .dat file
+            # Also save weights as Verilog .dat file
             # note that we provide two different .dat files, one for synth
             # and one for synthesis. this is because URAM-based weights always
             # need zero weights for synthesis, otherwise they get inferred
@@ -693,7 +665,6 @@ def execute_node(self, context, graph):
         for inputs in node.input:
             # it is assumed that the first input of the node is the data input
             # the second input are the weights
-            # the third input are the thresholds
             if in_ind == 0:
                 assert (
                     str(context[inputs].dtype) == "float32"
@@ -709,7 +680,7 @@ def execute_node(self, context, graph):
                     reshaped_input,
                 )
             elif in_ind > 2:
-                raise Exception("Unexpected input found for MatrixVectorActivation")
+                raise Exception("Unexpected input found for MatrixVectorActivation_rtl")
             in_ind += 1
 
         if mode == "rtlsim":
@@ -759,7 +730,7 @@ def execute_node(self, context, graph):
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Normally: Generates C++ code and tcl script for IP generation.
         Here: Generates (System-)Verilog code for IP generation."""
-        self.generate_hdl()
+        self.generate_hdl(model, fpgapart, clk)
 
     def ipgen_singlenode_code(self):
         """Normally: Builds the bash script for IP generation."""
@@ -828,11 +799,21 @@ def code_generation_ipi(self):
                 "create_bd_intf_pin -mode Slave "
                 "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # instantiate the hls ip
-            cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s"
-                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-            )
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv"
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name))
+
             # instantiate a streamer and connect it to the HLS IP
             strm_vlnv = "xilinx.com:user:memstream:1.0"
             strm_inst = node_name + "_wstrm"
@@ -947,12 +928,6 @@ def get_op_and_param_counts(self):
         weight_param_type = "param_weight_%db" % (weight_bits)
         weight_count = in_features * out_features
         ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = out_features
-            ret_dict[thres_param_type] = thres_count
         return ret_dict
 
     def derive_characteristic_fxns(self, period):
@@ -972,65 +947,113 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-    def generate_hdl(self):
-#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded
-        template_path, code_gen_dict = self.prepare_codegen_default()
+# TODO: characterize max_clk and implement this function in look-up style
+    def _resolve_segment_len(self, clk):
+        # Meant to insert pipeline registers in the DSP chain to meet the target clock frequency; currently a stub that ignores clk and returns 0
+        segmentlen = 0
+        return segmentlen
+
+    def _resolve_impl_style(self, fpgapart):
+        # Based on target device and activation/weight-width, choose the supported RTL module
+        act_width = self.get_input_datatype(0).bitwidth()
+        weight_width = self.get_input_datatype(1).bitwidth()
+        is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc"
+        if (act_width == 4 and weight_width == 4):
+            return "mvu_4sx4u"
+        else:
+            if (is_versal):
+                return "mvu_8sx9_dsp58"
+            else:
+                return "mvu_8sx8u_dsp48"
+
+    def generate_hdl(self, model, fpgapart, clk):
+        # Generate params as part of IP preparation
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        self.generate_params(model, code_gen_dir)
 
+        template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
-#TODO: currently only ram_style=auto is supported
+
         ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "auto":
-            continue
-        else:
-            raise Exception("Unrecognized ram_style for MatrixVectorActivation")
+        assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl"
 
-        # apply code generation to templates
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # apply code generation to template
         with open(template_path, "r") as f:
-            template = f.read()
+            template_wrapper = f.read()
         for key in code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
             template_wrapper = template_wrapper.replace(key, code_gen_line)
         with open(
             os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
             ),
             "w",
         ) as f:
-            f.write(template)
+            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0)))
         with open(
             os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
             ),
             "w",
         ) as f:
-            f.write(template_wrapper)
+            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1)))
 
         # set ipgen_path and ip_path so that HLS-Synth transformation
         # and stich_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
-        self.set_nodeattr("ip_path", code_gen_dir)    
+        self.set_nodeattr("ip_path", code_gen_dir)
 
-    def prepare_codegen_default(self):
-        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
+    def prepare_codegen_default(self, fpgapart, clk):
         template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
         )
+        
         code_gen_dict = {}
-
-        code_gen_dict["$PE$"] = self.get_nodeattr("PE")
-        code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD")
-        code_gen_dict["$MW$"] = self.get_nodeattr("MW")
-        code_gen_dict["$MH$"] = self.get_nodeattr("MH")
-        code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth()
-        code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth()
-        code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth()
+        code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
+        code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
+        code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
+        code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
+        code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
+        code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
 
         return template_path, code_gen_dict
 
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")        
+        # Path to (System-)Verilog files used by top-module & path to top-module
+        verilog_paths = [
+            code_gen_dir,
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"
+        ]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name()
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        
+        return sim
\ No newline at end of file

From 4e44934c3001174e52c62caf5d320104a308e611 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:31:35 +0100
Subject: [PATCH 024/123] [tests]: initial version of unit test for RTL custom
 op and specialize_to_rtl transformation for MVU

---
 .../test_fpgadataflow_mvau_rtl.py             | 172 ++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
new file mode 100644
index 0000000000..20a249bd08
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -0,0 +1,172 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import os
+
+import numpy as np
+from onnx import TensorProto, helper
+from qonnx.util.basic import (
+    qonnx_make_model,
+    gen_finn_dt_tensor
+)
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.core.datatype import DataType
+from qonnx.transformation.general import GiveUniqueNodeNames
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from qonnx.transformation.general import ApplyConfig
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+#import qonnx.core.data_layout as DataLayout
+
+build_dir = os.environ["FINN_BUILD_DIR"]  # target dir for saved .onnx models; KeyError if unset
+
+def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
+    """Wrap a single ONNX MatMul ("ifm" x "weights" -> "ofm") in a ModelWrapper.
+
+    W is installed as the initializer of "weights"; idt and wdt are set as the
+    datatypes of "ifm" and "weights". The output "ofm" has the shape
+    (1, ofm_h, ofm_w, mh) and is annotated as INT32. The ``weights`` value
+    info argument is accepted but not used when building the graph.
+    """
+    height, width = ofm_shape
+    out_info = helper.make_tensor_value_info(
+        "ofm", TensorProto.FLOAT, (1, height, width, mh)
+    )
+
+    node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"])
+    graph = helper.make_graph(
+        nodes=[node], name="matmul_graph", inputs=[ifm], outputs=[out_info]
+    )
+
+    onnx_model = qonnx_make_model(graph, producer_name="fclayer-model")
+    wrapped = ModelWrapper(onnx_model)
+
+    # At this step, the MatMul layer does not optimize the bit-width of the
+    # output datatype, hence the generic INT32 annotation on "ofm".
+    tensor_datatypes = (("ifm", idt), ("weights", wdt), ("ofm", DataType["INT32"]))
+    for tensor_name, datatype in tensor_datatypes:
+        wrapped.set_tensor_datatype(tensor_name, datatype)
+    wrapped.set_initializer("weights", W)
+
+    # The data layout of "ifm" is not annotated here.
+
+    return wrapped
+
+def prepare_inputs(input_tensor):  # build the input_dict passed to oxe.execute_onnx
+    return {"ifm": input_tensor}  # keyed by the name of the graph input tensor ("ifm")
+
+@pytest.mark.parametrize("mh", [16])
+@pytest.mark.parametrize("mw", [90])
+#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("pe", [16])
+#@pytest.mark.parametrize("simd", [1, 30, 90])
+@pytest.mark.parametrize("simd", [90])
+@pytest.mark.parametrize("idt", [DataType["INT8"]])
+@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
+@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+    """Check that the RTL MVAU matches the HLS MVAU in rtlsim for one MatMul.
+
+    A MatMul model is converted to an HLS MatrixVectorActivation and simulated,
+    then specialized to the RTL variant and simulated again for the same part;
+    both rtlsim outputs must be identical. ``segmentlen`` is currently unused.
+    """
+    # Create test input vector (produced by SWG)
+    ofm_shape = (5, 5)
+    ofm_h, ofm_w = ofm_shape
+    ifm = helper.make_tensor_value_info(
+        "ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]
+    )
+    weights = helper.make_tensor_value_info("weights", TensorProto.FLOAT, [mw, mh])
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
+    model = model.transform(GiveUniqueNodeNames())
+
+    model.save(build_dir+"/matmul.onnx")
+
+    # Create MatMul & obtain golden reference output
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    input_dict = prepare_inputs(A)
+
+    # Execute ONNX model (reference run; its output is not compared below)
+    output_matmul = oxe.execute_onnx(model, input_dict)
+
+    # Create MVAU (HLS)
+    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
+    model = model.transform(GiveUniqueNodeNames())
+
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "MatrixVectorActivation_0": {
+            "PE" : pe,
+            "SIMD" : simd,
+            "mem_mode" : "decoupled",
+            "ram_style" : "auto",
+            "resType" : "dsp",
+            "impl" : "rtl"
+        }
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir+"/mvau_hls.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+
+    # Apply convert-to-rtl step
+    model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
+    model = model.transform(GiveUniqueNodeNames())
+    model.save(build_dir+"/mvau_rtl.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))  # same part as the HLS run
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
+
+    model.save(build_dir+"/mvau_rtl_sim.onnx")
+
+    assert (output_mvau_hls == output_mvau_rtl).all()
+    assert (output_mvau_hls.size > 0)
+
+
+# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl"
+# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim"
\ No newline at end of file

From cc361d9fd4ea082e04d7a1a6bc3932406b0a4f14 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:32:52 +0100
Subject: [PATCH 025/123] [rtl mvu]: specialized compute core for 4-bit weights
 and activations for DSP48/DSP58

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 359 +++++++++++++++++++++++++++++++++++
 1 file changed, 359 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_4sx4u.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
new file mode 100644
index 0000000000..5993154355
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -0,0 +1,359 @@
+module mvu_4sx4u #(	// MVU compute core: signed 4-bit weights x unsigned 4-bit activations
+	int unsigned  PE,	// number of output lanes; first index of w and p
+	int unsigned  SIMD,	// number of weight/activation products summed per lane and cycle
+	int unsigned  ACCU_WIDTH,	// bit width of each accumulator output p[pe]
+	bit FORCE_BEHAVIORAL = 0	// 1: behavioral multiply-accumulate (genBehav) instead of the DSP48E2 primitive
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,	// synchronous reset of the pipeline registers
+	input	logic  en,	// clock enable advancing the pipeline
+
+	// Input
+	input	logic  last,	// marks the last input of an accumulation; vld follows 5 enabled cycles later
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations
+
+	// Output
+	output	logic  vld,	// last delayed by the 5 pipeline stages
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	typedef int unsigned  leave_load_t[2*SIMD-1];	// one entry per node of a binary adder tree with SIMD leaves
+	function leave_load_t init_leave_loads();
+		automatic leave_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+		return  res;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag
+	logic [1:5] L = '0;
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en)  L <= { last, L[1:4] };
+	end
+	assign	vld = L[5];
+
+	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+	localparam int unsigned  D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets
+
+	localparam int unsigned  PIPE_COUNT = (PE+3)/4;
+	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+		localparam int unsigned  PE_BEG = 4*c;
+		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
+
+		uwire        [57:0]  p3[SIMD];
+		uwire signed [ 1:0]  h3[SIMD][3];
+		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
+
+			// Input Lane Assembly
+			uwire [23:0]  bb = a[s];
+			logic [33:0]  aa;
+			logic [26:0]  dd;
+			logic [ 1:0]  xx[3:1];
+			if(1) begin : blkVectorize
+				uwire [3:0]  ww[PE_END - PE_BEG];
+				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+					assign	ww[pe] = w[PE_BEG + pe][s];
+					if(pe) begin
+//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+							.O6(xx[pe][1]),
+							.O5(xx[pe][0]),
+							.I5(1'b1),
+							.I4(zero),
+							.I3(ww[pe][1]),
+							.I2(a[s][1]),
+							.I1(ww[pe][0]),
+							.I0(a[s][0])
+						);
+					end
+				end
+				always_comb begin
+					dd = '0;
+					aa = '0;
+					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+						dd[D[pe]+:3] = ww[pe];
+						aa[D[pe]+ 3] = ww[pe][3];
+					end
+				end
+			end : blkVectorize
+
+			uwire [57:0]  pp;
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input Refine
+				logic signed [23:0]  B1  = 0;
+				always_ff @(posedge clk) begin
+					if(zero)     B1  <= 0;
+					else if(en)  B1  <= bb;
+				end
+
+				logic signed [26:0]  AD1 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      AD1 <= 0;
+					else if(en)  AD1 <= dd - aa;
+				end
+
+				// Stage #2: Multiply
+				logic signed [50:0]  M2 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      M2 <= 0;
+					else if(en)  M2 <=
+// synthesis translate_off
+						(B1 === '0) || (AD1 === '0)? 0 :	// simulation only: a zero operand yields zero even if the other one is X
+// synthesis translate_on
+						B1 * AD1;
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0]  P3 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      P3 <= 0;
+					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
+				end
+
+				assign	pp = P3;
+			end : genBehav
+			else begin : genDSP
+				DSP48E2 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
+					.RND('0),                          // Rounding Constant
+					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE48, TWO24)
+					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12, XOR24_48_96)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),                         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),                      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
+					.AREG(0),                          // Pipeline stages for A (0-2)
+					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),                          // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                          // Pipeline stages for C (0-1)
+					.DREG(0),                          // Pipeline stages for D (0-1)
+					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
+					.MREG(1),                          // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
+					.PREG(1)                          // Number of pipeline stages for P (0-1)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 34-bit output: A port cascade
+					.BCOUT(),			// 24-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 58-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 58-bit output: Primary data
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 34-bit input: A cascade data
+					.BCIN('x),			// 24-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 58-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+
+					// Data inputs: Data Ports
+					.A(aa),						// 34-bit input: A data
+					.B(bb),						// 24-bit input: B data
+					.C('x),						// 58-bit input: C data
+					.CARRYIN('0),				// 1-bit input: Carry-in
+					.D(dd),						// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+			end : genDSP
+
+			// External Canary Pipeline
+			logic [1:0]  X1[3:1] = '{ default: 0 };
+			logic [1:0]  X2[3:1] = '{ default: 0 };
+			logic [1:0]  X3[3:1] = '{ default: 0 };
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					X1 <= '{ default: 0 };
+					X2 <= '{ default: 0 };
+					X3 <= '{ default: 0 };
+				end
+				else if(en) begin
+					X1 <= xx;
+					X2 <= X1;
+					foreach(X3[i]) begin
+						X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
+					end
+				end
+			end
+
+			// Derive actual cross-lane overflows
+			for(genvar  i = 0; i < 3; i++) begin
+				assign	h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
+			end
+			assign	p3[s] = pp;
+
+		end : genSIMD
+
+		// Stage #4: Cross-SIMD Reduction
+
+		// Count leaves reachable from each node
+		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+
+		uwire signed [ACCU_WIDTH  -1:0]  up4;
+		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];
+		uwire        [$clog2(SIMD)+7:0]  lo4[3];
+		for(genvar  i = 0; i < 4; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+			// Conclusive high part accumulation
+			if(i < 3) begin : genHi
+				// Adder Tree across all SIMD high contributions, each from [-1:1]
+				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				// High Sideband Accumulation
+				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Hi4 <= 0;
+					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+				end
+				assign	hi4[i] = Hi4;
+			end : genHi
+
+			// Conclusive low part accumulation
+			if(1) begin : blkLo
+				// Adder Tree across all SIMD low contributions
+				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Lo4 <= 0;
+					else if(en)  Lo4 <= tree[0];
+				end
+
+				if(i == 3)  assign  up4 = Lo4;
+				else  assign  lo4[i] = Lo4;
+			end : blkLo
+
+		end
+
+		// Stage #5: Resolve lane totals
+		logic signed [3:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		always_ff @(posedge clk) begin
+			if(rst)  Res5 <= '{ default: 0 };
+			else if(en) begin
+				Res5[3] <= up4 - hi4[2];
+				Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
+				Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
+				Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
+			end
+		end
+
+		// Output
+		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
+			assign	p[pe] = Res5[pe - PE_BEG];
+		end
+
+	end : genPipes
+
+endmodule : mvu_4sx4u
\ No newline at end of file

From 8eefb535c3da6482f95465df05b8d3e1c610be21 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:33:31 +0100
Subject: [PATCH 026/123] [rtl mvu]: specialized compute core for > 4-bit
 weights and activations for DSP48

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 358 +++++++++++++++++++++++++++++
 1 file changed, 358 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
new file mode 100644
index 0000000000..e06a92c8fa
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -0,0 +1,358 @@
+module mvu_8sx8u_dsp48 #(
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  WEIGHT_WIDTH,
+	bit FORCE_BEHAVIORAL = 0,
+
+	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	typedef int unsigned  leave_load_t[2*SIMD-1];
+	function leave_load_t init_leave_loads();
+		automatic leave_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+		return  res;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag
+	logic [1:5] L = '0;
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en)  L <= { last, L[1:4] };
+	end
+	assign	vld = L[5];
+
+	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+    localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
+
+	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
+	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+		localparam int unsigned  PE_BEG = 2*c;
+		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
+
+		uwire        [57:0]  p3[SIMD];
+		uwire signed [ 1:0]  h3[SIMD];
+		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
+
+			// Input Lane Assembly
+			uwire [23:0]  bb = a[s];
+			logic [33:0]  aa;
+			logic [26:0]  dd;
+			logic [ 1:0]  xx;
+			if(1) begin : blkVectorize
+				uwire [WEIGHT_WIDTH-1:0]  ww[PE_END - PE_BEG];
+				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+					assign	ww[pe] = w[PE_BEG + pe][s];
+					if(pe) begin
+//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+							.O6(xx[1]),
+							.O5(xx[0]),
+							.I5(1'b1),
+							.I4(zero),
+							.I3(ww[pe][1]),
+							.I2(a[s][1]),
+							.I1(ww[pe][0]),
+							.I0(a[s][0])
+						);
+					end
+				end
+				always_comb begin
+					dd = '0;
+					aa = '0;
+					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+						dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+					end
+				end
+			end : blkVectorize
+
+			uwire [57:0]  pp;
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input Refine
+				logic signed [23:0]  B1  = 0;
+				always_ff @(posedge clk) begin
+					if(zero)     B1  <= 0;
+					else if(en)  B1  <= bb;
+				end
+
+				logic signed [26:0]  AD1 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      AD1 <= 0;
+					else if(en)  AD1 <= dd - aa;
+				end
+
+				// Stage #2: Multiply
+				logic signed [50:0]  M2 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      M2 <= 0;
+					else if(en)  M2 <=
+// synthesis translate off
+						(B1 === '0) || (AD1 === '0)? 0 :
+// synthesis translate on
+						B1 * AD1;
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0]  P3 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      P3 <= 0;
+					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
+				end
+
+				assign	pp = P3;
+			end : genBehav
+			else begin : genDSP
+				DSP48E2 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
+					.RND('0),                          // Rounding Constant
+					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),                         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),                      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
+					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
+					.AREG(0),                          // Pipeline stages for A (0-2)
+					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),                          // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                          // Pipeline stages for C (0-1)
+					.DREG(0),                          // Pipeline stages for D (0-1)
+					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
+					.MREG(1),                          // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
+					.PREG(1)                          // Number of pipeline stages for P (0-1)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 34-bit output: A port cascade
+					.BCOUT(),			// 24-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 58-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 58-bit output: Primary data
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 34-bit input: A cascade data
+					.BCIN('x),			// 24-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 58-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+
+					// Data inputs: Data Ports
+					.A(aa),						// 34-bit input: A data
+					.B(bb),						// 24-bit input: B data
+					.C('x),						// 58-bit input: C data
+					.CARRYIN('0),				// 1-bit input: Carry-in
+					.D(dd),						// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+			end : genDSP
+
+			// External Canary Pipeline
+			logic [1:0]  X1 = '{ default: 0 };
+			logic [1:0]  X2 = '{ default: 0 };
+			logic [1:0]  X3 = '{ default: 0 };
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					X1 <= '{ default: 0 };
+					X2 <= '{ default: 0 };
+					X3 <= '{ default: 0 };
+				end
+				else if(en) begin
+					X1 <= xx;
+					X2 <= X1;
+					X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]);
+				end
+			end
+
+			// Derive actual cross-lane overflows
+			assign  h3[s] = pp[D[1]+:2] - X3;
+
+			assign	p3[s] = pp;
+
+		end : genSIMD
+
+		// Stage #4: Cross-SIMD Reduction
+
+		// Count leaves reachable from each node
+		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+
+		uwire signed [ACCU_WIDTH  -1:0]  up4;
+		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
+		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
+		for(genvar  i = 0; i < 2; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+			// Conclusive high part accumulation
+			if(i == 0) begin : genHi
+				// Adder Tree across all SIMD high contributions, each from [-1:1]
+				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				// High Sideband Accumulation
+				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Hi4 <= 0;
+					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+				end
+				assign	hi4 = Hi4;
+			end : genHi
+
+			// Conclusive low part accumulation
+			if(1) begin : blkLo
+				// Adder Tree across all SIMD low contributions
+				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Lo4 <= 0;
+					else if(en)  Lo4 <= tree[0];
+				end
+
+				if(i == 1)  assign  up4 = Lo4;
+				else  assign  lo4 = Lo4;
+			end : blkLo
+
+		end
+
+		// Stage #5: Resolve lane totals
+		logic signed [1:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		always_ff @(posedge clk) begin
+			if(rst)  Res5 <= '{ default: 0 };
+			else if(en) begin
+				Res5[1] <= up4 - hi4;
+				Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 });
+			end
+		end
+
+		// Output
+		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
+			assign	p[pe] = Res5[pe - PE_BEG];
+		end
+
+	end : genPipes
+
+endmodule : mvu_8sx8u_dsp48
\ No newline at end of file

From e7109e75161774280b24e5884f6c9b9c17a07f7b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:34:23 +0100
Subject: [PATCH 027/123] [fpgadataflow transform]: initial
 specialize_to_rtl_layers-transform for MVU

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
new file mode 100644
index 0000000000..7d677ec216
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023, AMD
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from qonnx.transformation.base import Transformation
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.core.datatype import DataType
+from onnx import helper
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
+
+class InferRTLMatrixVectorActivation(Transformation):
+    """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
+        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0)
+
+        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
+            return True
+        else:
+            return False
+
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatrixVectorActivation":
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp"
+                supported_in_rtl = self._is_rtl_variant_compatible(n)
+                if (preferred_in_rtl and supported_in_rtl):
+                    mvau_input = n.input[0]
+                    mvau_weight = n.input[1]
+                    mvau_output = n.output[0]
+                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
+                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
+                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
+                    numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors")
+                    mw = getCustomOp(n).get_nodeattr("MW")
+                    mh = getCustomOp(n).get_nodeattr("MH")
+                    simd = getCustomOp(n).get_nodeattr("SIMD")
+                    pe = getCustomOp(n).get_nodeattr("PE")
+                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation_rtl",
+                        [mvau_input, mvau_weight],
+                        [mvau_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=inputDataType,
+                        weightDataType=weightDataType,
+                        outputDataType=outputDataType,
+                        numInputVectors=numInputVectors,
+                        mem_mode=mem_mode,
+                        name=n.name + "_rtl",
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified=True
+        
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        
+        return (model, graph_modified)
\ No newline at end of file

From 5a868d19e5955abdb894bf1e8b93d2d1f6f8410d Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Tue, 9 May 2023 09:41:15 +0200
Subject: [PATCH 028/123] [rtl mvu] fixes for latest memstream + linting

---
 .../matrixvectoractivation_rtl.py             | 136 ++++++++++--------
 1 file changed, 77 insertions(+), 59 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 6b1c2f3be7..8fd261d395 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -27,7 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
-from shutil import copy
 import numpy as np
 import os
 import textwrap
@@ -40,20 +39,18 @@
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
-    numpy_to_hls_code,
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 
 try:
     from pyverilator import PyVerilator
 except ModuleNotFoundError:
     PyVerilator = None
 
-from . import templates
 
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -69,7 +66,6 @@ class MatrixVectorActivation_rtl(HLSCustomOp):
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
-        self.decoupled_wrapper = templates.decoupled_wrapper
 
     def get_nodeattr_types(self):
         my_attrs = {
@@ -186,17 +182,24 @@ def verify_node(self):
             )
 
         num_of_inputs = len(self.onnx_node.input)
-        if num_of_inputs!=2:
-            info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input)))
+        if num_of_inputs != 2:
+            info_messages.append(
+                "RTL-based MatrixVectorActivation expects two inputs "
+                "(weights and activation), but got {} inputs.".format(
+                    len(self.onnx_node.input)
+                )
+            )
 
         mem_mode = self.get_nodeattr("mem_mode")
 
         if mem_mode != "decoupled":
-            info_messages.append("RTL-based MVAU supports only decoupled weights currently")
+            info_messages.append(
+                "RTL-based MVAU supports only decoupled weights currently"
+            )
 
         return info_messages
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -218,7 +221,7 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -259,7 +262,7 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -272,7 +275,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -287,7 +290,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point?
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -328,13 +331,9 @@ def lut_estimation(self):
         acc_bits = W + A + np.ceil(math.log(MW, 2))
         acc_luts = acc_bits
 
-        return int(
-            c0
-            + c1 * (P * (mult_luts + addertree_luts + acc_luts))
-            + c2
-        )
+        return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point?
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -350,7 +349,7 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -359,7 +358,9 @@ def get_exp_cycles(self):
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
-        # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. Most probably <10)
+        # Actual exp_cycles is probably slightly larger (say 3 cycles
+        # (DSP A/B, M, P - reg) + additional pipeline buffer cycles.
+        # Most probably <10)
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -384,7 +385,9 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        assert (
+            i_bits <= 9
+        ), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
@@ -402,7 +405,9 @@ def get_weightstream_width(self):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
-            assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+            assert (
+                wp <= 8
+            ), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             w_width = pe * simd * wp
             return w_width
         else:
@@ -516,7 +521,8 @@ def minimize_accumulator_width(self, model):
         else:
             adt = DataType.get_smallest_possible(acc_max)
         # Note: we are interested in simply the width of the output dot product.
-        # Padding the actual output stream to a multiple of 8-bits is done in the RTL component
+        # Padding the actual output stream to a multiple of 8-bits is done in
+        # the RTL component
         self.set_nodeattr("accDataType", adt.name)
         # for no-activation nodes, output dt = acc dt
         self.set_nodeattr("outputDataType", adt.name)
@@ -615,9 +621,7 @@ def generate_params(self, model, path):
             # and one for synthesis. this is because URAM-based weights always
             # need zero weights for synthesis, otherwise they get inferred
             # as BRAM
-            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
-                code_gen_dir
-            )
+            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
             weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
             # sim weights are always the true weights
             self.make_weight_file(
@@ -734,11 +738,11 @@ def code_generation_ipgen(self, model, fpgapart, clk):
 
     def ipgen_singlenode_code(self):
         """Normally: Builds the bash script for IP generation."""
-        pass   
+        pass
 
     def code_generation_cppsim(self, model):
         """Normally: Generates C++ code for simulation (cppsim)."""
-        pass     
+        pass
 
     def compile_singlenode_code(self):
         pass
@@ -803,19 +807,28 @@ def code_generation_ipi(self):
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
             rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
             sourcefiles = [
-                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
                 rtllib_dir + "mvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_8sx9.sv",
-                rtllib_dir + "mvu_8sx8u_dsp48.sv"
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
-            cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name))
+            cmd.append(
+                "create_bd_cell -type hier -reference %s /%s/%s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                    self.onnx_node.name,
+                )
+            )
 
             # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_vlnv = "amd.com:FINN:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"
@@ -849,11 +862,11 @@ def code_generation_ipi(self):
                 % (node_name, strm_inst, node_name, node_name, sname)
             )
             cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]"
                 % (node_name, rst_name, node_name, strm_inst)
             )
             cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
                 % (node_name, clk_name, node_name, strm_inst)
             )
             cmd.append(
@@ -947,21 +960,25 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-# TODO: characterize max_clk and implement this function in look-up style
+    # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP chain to meet target clock frequency
         segmentlen = 0
         return segmentlen
 
     def _resolve_impl_style(self, fpgapart):
-        # Based on target device and activation/weight-width, choose the supported RTL module
+        # Based on target device and activation/weight-width, choose the
+        # supported RTL module
         act_width = self.get_input_datatype(0).bitwidth()
         weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc"
-        if (act_width == 4 and weight_width == 4):
+        is_versal = (
+            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+            or fpgapart[0:5] == "xqrvc"
+        )
+        if act_width == 4 and weight_width == 4:
             return "mvu_4sx4u"
         else:
-            if (is_versal):
+            if is_versal:
                 return "mvu_8sx9_dsp58"
             else:
                 return "mvu_8sx8u_dsp48"
@@ -973,13 +990,17 @@ def generate_hdl(self, model, fpgapart, clk):
 
         template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
+            self.get_verilog_top_module_name()
+        ]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
 
         ram_style = self.get_nodeattr("ram_style")
-        assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl"
+        assert (
+            ram_style == "auto"
+        ), "Unrecognized ram_style for MatrixVectorActivation_rtl"
 
         # apply code generation to template
         with open(template_path, "r") as f:
@@ -1009,19 +1030,21 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("ip_path", code_gen_dir)
 
     def prepare_codegen_default(self, fpgapart, clk):
-        template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
-        )
-        
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
+
         code_gen_dict = {}
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
         code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
-        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [
+            str(self.get_input_datatype(0).bitwidth())
+        ]
         code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
         code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
-        code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        code_gen_dict["$SIGNED_ACTIVATIONS$"] = (
+            [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
         code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
 
@@ -1035,15 +1058,10 @@ def prepare_rtlsim(self):
         if PyVerilator is None:
             raise ImportError("Installation of PyVerilator is required.")
 
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")        
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         # Path to (System-)Verilog files used by top-module & path to top-module
-        verilog_paths = [
-            code_gen_dir,
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"
-        ]
-        verilog_files = [
-            self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
-        ]
+        verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
+        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
 
         # build the Verilator emu library
         sim = PyVerilator.build(
@@ -1051,9 +1069,9 @@ def prepare_rtlsim(self):
             build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
             verilog_path=verilog_paths,
             trace_depth=get_rtlsim_trace_depth(),
-            top_module_name=self.get_verilog_top_module_name()
+            top_module_name=self.get_verilog_top_module_name(),
         )
         # save generated lib filename in attribute
         self.set_nodeattr("rtlsim_so", sim.lib._name)
-        
-        return sim
\ No newline at end of file
+
+        return sim

From 4a9cfa1c7a17497578faad3f76c25b80c116ba58 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 May 2023 10:56:07 +0100
Subject: [PATCH 029/123] [rtl custom_op]: add support for external weights

---
 .../matrixvectoractivation_rtl.py             | 67 ++++++++++---------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 8fd261d395..162b5e2e16 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -192,9 +192,9 @@ def verify_node(self):
 
         mem_mode = self.get_nodeattr("mem_mode")
 
-        if mem_mode != "decoupled":
+        if mem_mode not in ["decoupled", "external"]:
             info_messages.append(
-                "RTL-based MVAU supports only decoupled weights currently"
+                "RTL-based MVAU supports only decoupled or external weights."
             )
 
         return info_messages
@@ -612,35 +612,20 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-        if mem_mode == "decoupled":
+        if mem_mode in ["decoupled", "external"]:
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            # Also save weights as Verilog .dat file
-            # note that we provide two different .dat files, one for synth
-            # and one for synthesis. this is because URAM-based weights always
-            # need zero weights for synthesis, otherwise they get inferred
-            # as BRAM
-            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
-            weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
-            # sim weights are always the true weights
-            self.make_weight_file(
-                weights, "decoupled_verilog_dat", weight_filename_rtl_sim
-            )
-            ram_style = self.get_nodeattr("ram_style")
-            if ram_style == "ultra":
-                # UltraRAM must have no memory initializer, or only zeroes
-                # otherwise BRAM will be inferred instead of URAM
-                # as a workaround we provide a zero-weight init here
-                synth_weights = np.zeros_like(weights, dtype=np.float32)
-            else:
-                synth_weights = weights
-            self.make_weight_file(
-                synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
-            )
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # This file will be ignored when synthesizing UltraScale memory.
+                weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl
+                )
         else:
             raise Exception(
-                """Please set mem_mode to "decoupled",
+                """Please set mem_mode to "const", "decoupled", or "external",
                 currently no other parameter value is supported!"""
             )
 
@@ -695,7 +680,7 @@ def execute_node(self, context, graph):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            if mem_mode == "external" or mem_mode == "decoupled":
+            if mem_mode in ["external", "decoupled"]:
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
                 wei = npy_to_rtlsim_input(
@@ -903,9 +888,31 @@ def code_generation_ipi(self):
                 # TODO calculate and pass in segment size here
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const" or mem_mode == "external":
-            # base class impl sufficient for const/external modes
-            return super().code_generation_ipi()
+        elif mem_mode == "external":
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
+                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append(
+                "create_bd_cell -type module -reference %s %s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                )
+            )
+            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name))
+            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name))
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd

From 8a9ac1af4d6c62e7c9557ab41992b84cf2c37ae1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 11 May 2023 11:04:28 +0100
Subject: [PATCH 030/123] Specify clock and reset associations of bus
 interfaces.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 4 +++-
 finn-rtllib/mvu/mvu_axi_wrapper.v      | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index 502a72d3f2..fb3c62a15a 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -49,8 +49,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_LANES = PE,
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  	// Global Control
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
 	input	logic  ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
 	input	logic  ap_rst_n,
 
 	// Weight Stream
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index b79ba6bbd1..d8acaefcc7 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -50,8 +50,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_LANES = PE,
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  	// Global Control
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
 	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
 	input	ap_rst_n,
 	// Weight Stream
 	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,

From d9b90793bd54a5e112531c737fa7c60a51b21d34 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Mon, 15 May 2023 10:16:48 +0200
Subject: [PATCH 031/123] [rtlmvu] More fixes for memstream and param gen

---
 .../fpgadataflow/matrixvectoractivation_rtl.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 162b5e2e16..1791327e78 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -821,22 +821,16 @@ def code_generation_ipi(self):
             )
             cmd.append(
                 "set_property -dict [list "
-                "CONFIG.NSTREAMS {1} "
-                "CONFIG.MEM_DEPTH {%d} "
-                "CONFIG.MEM_WIDTH {%d} "
-                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.DEPTH {%d} "
+                "CONFIG.WIDTH {%d} "
+                "CONFIG.INIT_FILE {%s} "
                 "CONFIG.RAM_STYLE {%s} "
-                "CONFIG.STRM0_DEPTH {%d} "
-                "CONFIG.STRM0_WIDTH {%d} "
-                "CONFIG.STRM0_OFFSET {0} "
                 "] [get_bd_cells /%s/%s]"
                 % (
                     self.calc_wmem(),
                     self.get_weightstream_width_padded(),
-                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
                     self.get_nodeattr("ram_style"),
-                    self.calc_wmem(),
-                    self.get_weightstream_width_padded(),
                     node_name,
                     strm_inst,
                 )

From a5f2a83897e33acb4b3e2231d9bfa534e56bb6b2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Thu, 11 May 2023 23:49:10 +0200
Subject: [PATCH 032/123] [Build] apply config to only FIFO nodes in
 step_set_fifo_depths

---
 src/finn/builder/build_dataflow_steps.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 65ab2b0b93..d4af757491 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -53,6 +53,7 @@
 from shutil import copy
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -123,7 +124,6 @@
 )
 from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 
 
 def verify_step(
@@ -486,14 +486,13 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
 
 
 def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
-    """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible."""
-    specialize_to_rtl_transforms = [
-        to_rtl.InferRTLMatrixVectorActivation()
-    ]
+    """Convert layers implemented in HLS to an equivalent specialized RTL
+    implementation if possible."""
+    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
     for trn in specialize_to_rtl_transforms:
         model = model.transform(trn)
     return model
-    
+
 
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
@@ -594,7 +593,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
         if cfg.folding_config_file is not None:
-            model = model.transform(ApplyConfig(cfg.folding_config_file))
+            model = model.transform(
+                ApplyConfig(
+                    cfg.folding_config_file,
+                    node_filter=lambda x: x.op_type == "StreamingFIFO",
+                )
+            )
 
     # extract the final configuration and save it as json
     hw_attrs = [

From 08cbdc59a95ed6281c3234c5e8b0b9d7327a2988 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 07:58:41 +0100
Subject: [PATCH 033/123] Revised control interface attributes.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 29 +++++++++++++-------------
 finn-rtllib/mvu/mvu_axi_wrapper.v      |  8 ++++---
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index fb3c62a15a..e15f77fbae 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -50,25 +50,26 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
-	input	logic  ap_clk,
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
-	input	logic  ap_rst_n,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
 
 	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
 
 	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
 
 	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
 );
 
 mvu_8sx9_axi #(
@@ -89,4 +90,4 @@ mvu_8sx9_axi #(
 	.m_axis_output_tready(m_axis_output_tready)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index d8acaefcc7..239c5bbacd 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -51,10 +51,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
 	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
 	input	ap_rst_n,
+
 	// Weight Stream
 	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
 	input   weights_V_TVALID,
@@ -87,4 +89,4 @@ mvu_axi #(
 	.m_axis_output_tready(out_V_TREADY)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$

From d058cc2a5c1ed71a2c2ea12034cfa921818381ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 09:16:50 +0100
Subject: [PATCH 034/123] Mask device primitives from Verilator in favor of
 using behavioral code.

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 38 ++++++++++++++++++++----------
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 38 ++++++++++++++++++++----------
 finn-rtllib/mvu/mvu_8sx9.sv        | 29 ++++++++++++++---------
 3 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 5993154355..21594e46ac 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -19,6 +19,12 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 	typedef int unsigned  leave_load_t[2*SIMD-1];
 	function leave_load_t init_leave_loads();
@@ -59,17 +65,21 @@ module mvu_4sx4u #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
-						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-							.O6(xx[pe][1]),
-							.O5(xx[pe][0]),
-							.I5(1'b1),
-							.I4(zero),
-							.I3(ww[pe][1]),
-							.I2(a[s][1]),
-							.I1(ww[pe][0]),
-							.I0(a[s][0])
-						);
+						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+						else begin
+							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+								.O6(xx[pe][1]),
+								.O5(xx[pe][0]),
+								.I5(1'b1),
+								.I4(zero),
+								.I3(ww[pe][1]),
+								.I2(a[s][1]),
+								.I1(ww[pe][0]),
+								.I0(a[s][0])
+							);
+						end
+`endif
 					end
 				end
 				always_comb begin
@@ -87,7 +97,7 @@ module mvu_4sx4u #(
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if (BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
 				logic signed [23:0]  B1  = 0;
 				always_ff @(posedge clk) begin
@@ -121,6 +131,7 @@ module mvu_4sx4u #(
 
 				assign	pp = P3;
 			end : genBehav
+`ifndef VERILATOR
 			else begin : genDSP
 				DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
@@ -252,6 +263,7 @@ module mvu_4sx4u #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 			end : genDSP
+`endif
 
 			// External Canary Pipeline
 			logic [1:0]  X1[3:1] = '{ default: 0 };
@@ -356,4 +368,4 @@ module mvu_4sx4u #(
 
 	end : genPipes
 
-endmodule : mvu_4sx4u
\ No newline at end of file
+endmodule : mvu_4sx4u
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index e06a92c8fa..09db360b77 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -23,6 +23,12 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 	typedef int unsigned  leave_load_t[2*SIMD-1];
 	function leave_load_t init_leave_loads();
@@ -63,17 +69,21 @@ module mvu_8sx8u_dsp48 #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
-						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-							.O6(xx[1]),
-							.O5(xx[0]),
-							.I5(1'b1),
-							.I4(zero),
-							.I3(ww[pe][1]),
-							.I2(a[s][1]),
-							.I1(ww[pe][0]),
-							.I0(a[s][0])
-						);
+						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+						else begin
+							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+								.O6(xx[1]),
+								.O5(xx[0]),
+								.I5(1'b1),
+								.I4(zero),
+								.I3(ww[pe][1]),
+								.I2(a[s][1]),
+								.I1(ww[pe][0]),
+								.I0(a[s][0])
+							);
+						end
+`endif
 					end
 				end
 				always_comb begin
@@ -91,7 +101,7 @@ module mvu_8sx8u_dsp48 #(
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
 				logic signed [23:0]  B1  = 0;
 				always_ff @(posedge clk) begin
@@ -125,6 +135,7 @@ module mvu_8sx8u_dsp48 #(
 
 				assign	pp = P3;
 			end : genBehav
+`ifndef VERILATOR
 			else begin : genDSP
 				DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
@@ -256,6 +267,7 @@ module mvu_8sx8u_dsp48 #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 			end : genDSP
+`endif
 
 			// External Canary Pipeline
 			logic [1:0]  X1 = '{ default: 0 };
@@ -355,4 +367,4 @@ module mvu_8sx8u_dsp48 #(
 
 	end : genPipes
 
-endmodule : mvu_8sx8u_dsp48
\ No newline at end of file
+endmodule : mvu_8sx8u_dsp48
diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 2d1da26efb..f8e2ab3985 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -52,11 +52,17 @@ module mvu_8sx9 #(
     input   logic zero, // ignore current inputs and force this partial product to zero
     input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
 	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-    
+
 	// Ouput
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
@@ -75,7 +81,7 @@ module mvu_8sx9 #(
 			L[1+MAX_PIPELINE_STAGES] <= last;
 			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
 		end
-	end  
+	end
 	assign vld = L[0];
 
 //-------------------- Shift register for ZERO flag --------------------\\
@@ -87,7 +93,7 @@ module mvu_8sx9 #(
 			else if(en) begin
 				Z[0] <= zero;
 				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
-			end    
+			end
 		end
 	end;
 
@@ -157,12 +163,12 @@ module mvu_8sx9 #(
 
 			if (LAST) begin : genPOUT
 				assign p[j] = pp[ACCU_WIDTH-1:0];
-			end      
+			end
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input A/B
 				logic signed [33:0] Areg [INTERNAL_PREGS];
 				always_ff @(posedge clk) begin
@@ -233,7 +239,7 @@ module mvu_8sx9 #(
 				assign pp = Preg;
 				assign pcout[j][i] = pp;
 			end : genBehav
-
+`ifndef VERILATOR
 			else begin: genDSP
 				DSP58 #(
 					// Feature Control Attributes: Data Path Selection
@@ -263,8 +269,8 @@ module mvu_8sx9 #(
 					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
 					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
 					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
 										2'b01, // Y : M
 										2'b01  // X: M
 					}), // Optional inversion for OPMODE
@@ -325,7 +331,7 @@ module mvu_8sx9 #(
 							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
 							2'b00,
 							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
 					}),                                 // 5-bit input: INMODE control
 					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
 					.OPMODE({
@@ -365,7 +371,8 @@ module mvu_8sx9 #(
 					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
 				);
 			end : genDSP
-		end : genDSPChain  
+`endif
+		end : genDSPChain
 	end : genDSPPE
-    
+
 endmodule : mvu_8sx9

From a66f38f2d06901fd27cf874701572268ea4793d6 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Thu, 11 May 2023 23:48:36 +0200
Subject: [PATCH 035/123] [Deps] update qonnx

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index e039ca9144..f1cf8754f2 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,7 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="20a34289cf2297d2b2bbbe75d6ac152ece86e3b4"
+QONNX_COMMIT="bc36fd56bf1e4abfcf98cd76a001cad13d57baac"
 FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c"
 BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"

From 8f9bd04b3311e56da4684a58d4de868d61f342ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 12:44:53 +0100
Subject: [PATCH 036/123] Adding folding hints. Impl selection by case
 statement.

---
 finn-rtllib/mvu/mvu_axi.sv | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index e4a919ba88..a181f54ac5 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -29,6 +29,14 @@
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ * @details
+ *  Folding hints:
+ *	 - 4-bit MVU:          PE scaling should aim at a full multiple of 4.
+ *	 - 8-bit MVU - DSP48:  PE scaling should aim at a full multiple of 2.
+ *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3.
+ *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ *	   impact critical paths more than PE scaling. PE scaling implies a
+ *	   bigger fanout on the input activations.
  *****************************************************************************/
 
 module mvu_axi #(
@@ -134,8 +142,9 @@ module mvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	
-	if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9
+
+	case(MVU_IMPL_STYLE)
+	"mvu_8sx9_dsp58":
 		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
@@ -143,26 +152,27 @@ module mvu_axi #(
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
+
+	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u
+
+	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else initial begin
-		$error("Unrecognized MVU_IMPL_STYLE!");
+
+	default: initial begin
+		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
 		$finish;
 	end
+	endcase
 
 //-------------------- Output register slice --------------------\\
 	struct packed {
@@ -185,7 +195,7 @@ module mvu_axi #(
 			end
 		end
 	end
-	
+
 	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
@@ -196,10 +206,10 @@ module mvu_axi #(
 		if(rst)		B <= '{ default: 'x };
 		else begin
 			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end	
+		end
 	end
 
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 
-endmodule : mvu_axi
\ No newline at end of file
+endmodule : mvu_axi

From 9de5ed6f7b459f37bb127f0cd105e6f927d25611 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 13:52:40 +0100
Subject: [PATCH 037/123] Fixed behavioral sideband prediction.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 09db360b77..bd1f813af6 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -69,7 +69,7 @@ module mvu_8sx8u_dsp48 #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						if(BEHAVIORAL)  assign  xx = zero? 0 : ww[pe] * a[s];
 `ifndef VERILATOR
 						else begin
 							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (

From 239759a6a4b8cb008aa9b80d52d15f53f77e5965 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 15:49:19 +0100
Subject: [PATCH 038/123] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 4

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 21594e46ac..111d651cf5 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -50,6 +50,7 @@ module mvu_4sx4u #(
 
 		localparam int unsigned  PE_BEG = 4*c;
 		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
+		localparam int unsigned  PE_REM = 4*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD][3];
@@ -65,12 +66,12 @@ module mvu_4sx4u #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						if(BEHAVIORAL)  assign  xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
 `ifndef VERILATOR
 						else begin
 							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-								.O6(xx[pe][1]),
-								.O5(xx[pe][0]),
+								.O6(xx[pe + PE_REM][1]),
+								.O5(xx[pe + PE_REM][0]),
 								.I5(1'b1),
 								.I4(zero),
 								.I3(ww[pe][1]),
@@ -86,8 +87,8 @@ module mvu_4sx4u #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe]+:3] = ww[pe];
-						aa[D[pe]+ 3] = ww[pe][3];
+						dd[D[pe + PE_REM]+:3] = ww[pe];
+						aa[D[pe + PE_REM]+ 3] = ww[pe][3];
 					end
 				end
 			end : blkVectorize
@@ -305,7 +306,7 @@ module mvu_4sx4u #(
 			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
 
 			// Conclusive high part accumulation
-			if(i < 3) begin : genHi
+			if(i >= PE_REM && i < 3) begin : genHi
 				// Adder Tree across all SIMD high contributions, each from [-1:1]
 				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
@@ -323,9 +324,12 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
+			else begin : genHiZero
+				assign hi4[i] = '0;
+			end : genHiZero
 
 			// Conclusive low part accumulation
-			if(1) begin : blkLo
+			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -346,6 +350,9 @@ module mvu_4sx4u #(
 				if(i == 3)  assign  up4 = Lo4;
 				else  assign  lo4[i] = Lo4;
 			end : blkLo
+			else begin : blkLoZero
+				assign lo4[i] = '0;
+			end : blkLoZero
 
 		end
 
@@ -363,7 +370,7 @@ module mvu_4sx4u #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
 		end
 
 	end : genPipes

From 8d3247ccf7657aeb534147a5dd9511fa397d4eb2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Wed, 24 May 2023 15:56:07 +0200
Subject: [PATCH 039/123] [rtlmvu] Avoid unintentional verilator metacomments

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 2 +-
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 finn-rtllib/mvu/mvu_8sx9.sv        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 21594e46ac..9f101e8c29 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -19,7 +19,7 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index bd1f813af6..6b54e91b6a 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -23,7 +23,7 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index f8e2ab3985..a601066cfd 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -57,7 +57,7 @@ module mvu_8sx9 #(
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||

From c8663505dcd2c2eeb3ddad05d361f82be32040eb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 17:14:23 +0100
Subject: [PATCH 040/123] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 2

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 57 +++++++++++++++++-------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 6b54e91b6a..5cc3fa4c49 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -54,6 +54,7 @@ module mvu_8sx8u_dsp48 #(
 
 		localparam int unsigned  PE_BEG = 2*c;
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
+		localparam int unsigned  PE_RES = 2*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD];
@@ -90,8 +91,8 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
@@ -301,32 +302,35 @@ module mvu_8sx8u_dsp48 #(
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
 		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
-		for(genvar  i = 0; i < 2; i++) begin
-			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
-			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
 
-			// Conclusive high part accumulation
-			if(i == 0) begin : genHi
-				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
+		// Conclusive high part accumulation
+		if(PE_RES == 0) begin : genHi
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
+			// Adder Tree across all SIMD high contributions, each from [-1:1]
+			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+			for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
+			for(genvar  n = 0; n < SIMD-1; n++) begin
+				// Sum truncated to actual maximum bit width at this node
+				uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+				assign  tree[n] = s;
+			end
 
-				// High Sideband Accumulation
-				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Hi4 <= 0;
-					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
-				end
-				assign	hi4 = Hi4;
-			end : genHi
+			// High Sideband Accumulation
+			logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+			always_ff @(posedge clk) begin
+				if(rst)      Hi4 <= 0;
+				else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+			end
+			assign	hi4 = Hi4;
+		end : genHi
+		else begin : genHiZero
+			assign hi4 = '0;
+		end : genHiZero
 
+		for(genvar  i = 0; i < 2; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
 			// Conclusive low part accumulation
-			if(1) begin : blkLo
+			if(i >= PE_RES) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -347,6 +351,9 @@ module mvu_8sx8u_dsp48 #(
 				if(i == 1)  assign  up4 = Lo4;
 				else  assign  lo4 = Lo4;
 			end : blkLo
+			else begin : blkLoZero
+				assign lo4 = '0;
+			end : blkLoZero
 
 		end
 
@@ -362,7 +369,7 @@ module mvu_8sx8u_dsp48 #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_RES];
 		end
 
 	end : genPipes

From fd1e038c643c05199b38320f8815f430e538d936 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 17:21:56 +0100
Subject: [PATCH 041/123] [rtl mvu axi]: updated comments on folding hints

---
 finn-rtllib/mvu/mvu_axi.sv | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index a181f54ac5..cef55949ed 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -31,12 +31,13 @@
  * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
  * @details
  *  Folding hints:
- *	 - 4-bit MVU:          PE scaling should aim at a full multiple of 4.
- *	 - 8-bit MVU - DSP48:  PE scaling should aim at a full multiple of 2.
- *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3.
+ *	 - 4-bit MVU:          PE scaling should divide MH.
+ *	 - 8-bit MVU - DSP48:  PE scaling should divide MH.
+ *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3 and divide MW.
  *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
  *	   impact critical paths more than PE scaling. PE scaling implies a
  *	   bigger fanout on the input activations.
+ *	 - Full unfolding along MH (PE=MH) results in no replay buffer being instantiated.
  *****************************************************************************/
 
 module mvu_axi #(

From f60d4c6fa105bd29689b93aafd880ec92c32358c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:48:26 +0100
Subject: [PATCH 042/123] [rtl custom op]: minor fixes to codegen

---
 .../fpgadataflow/matrixvectoractivation_rtl.py     | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 1791327e78..9f8130806b 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -612,11 +612,7 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-<<<<<<< HEAD
-        if mem_mode in ["decoupled", "external"]:
-=======
         if mem_mode == "decoupled" or mem_mode == "external":
->>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen)
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
@@ -909,8 +905,6 @@ def code_generation_ipi(self):
                     self.onnx_node.name,
                 )
             )
-            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name))
-            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name))
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd
@@ -968,8 +962,7 @@ def derive_characteristic_fxns(self, period):
     # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP chain to meet target clock frequency
-        segmentlen = 0
-        return segmentlen
+        return 4 # default to 4 for now
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1002,11 +995,6 @@ def generate_hdl(self, model, fpgapart, clk):
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
 
-        ram_style = self.get_nodeattr("ram_style")
-        assert (
-            ram_style == "auto"
-        ), "Unrecognized ram_style for MatrixVectorActivation_rtl"
-
         # apply code generation to template
         with open(template_path, "r") as f:
             template_wrapper = f.read()

From a1ad304a42bf89b36d6507cf9f749a7a1a7d130a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:48:58 +0100
Subject: [PATCH 043/123] [specialize-to-rtl]: add ram_style and
 rt_writeable_weights support

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 7d677ec216..23b6e59abe 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -74,6 +74,8 @@ def apply(self, model):
                     simd = getCustomOp(n).get_nodeattr("SIMD")
                     pe = getCustomOp(n).get_nodeattr("PE")
                     mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
 
                     new_node = helper.make_node(
                         "MatrixVectorActivation_rtl",
@@ -91,6 +93,8 @@ def apply(self, model):
                         numInputVectors=numInputVectors,
                         mem_mode=mem_mode,
                         name=n.name + "_rtl",
+                        ram_style=ram_style,
+                        runtime_writeable_weights=runtime_writeable_weights
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old node

From 2cbb68fe016ff7ea292ffa071741b352222d1a4c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:50:05 +0100
Subject: [PATCH 044/123] [rtllib]: change string type to parameter type due to
 Vivado error

---
 finn-rtllib/mvu/mvu_axi.sv | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index cef55949ed..46167af95b 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -51,7 +51,7 @@ module mvu_axi #(
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
 	bit FORCE_BEHAVIORAL = 0,
-	string MVU_IMPL_STYLE,
+	parameter MVU_IMPL_STYLE, // string type causes error in Vivado
 
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
@@ -163,12 +163,11 @@ module mvu_axi #(
 
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-
 	default: initial begin
 		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
 		$finish;

From 92eb0edba2d059b8b170ed7e6d8ac7a224c9208c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:51:40 +0100
Subject: [PATCH 045/123] [rtllib]: renamed variable for consistency

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 5cc3fa4c49..3cd9cef560 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -54,7 +54,7 @@ module mvu_8sx8u_dsp48 #(
 
 		localparam int unsigned  PE_BEG = 2*c;
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
-		localparam int unsigned  PE_RES = 2*(c+1) - PE_END;
+		localparam int unsigned  PE_REM = 2*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD];
@@ -91,8 +91,8 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
@@ -304,7 +304,7 @@ module mvu_8sx8u_dsp48 #(
 		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
 
 		// Conclusive high part accumulation
-		if(PE_RES == 0) begin : genHi
+		if(PE_REM == 0) begin : genHi
 			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
 			// Adder Tree across all SIMD high contributions, each from [-1:1]
 			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
@@ -330,7 +330,7 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  i = 0; i < 2; i++) begin
 			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
 			// Conclusive low part accumulation
-			if(i >= PE_RES) begin : blkLo
+			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -369,7 +369,7 @@ module mvu_8sx8u_dsp48 #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG + PE_RES];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
 		end
 
 	end : genPipes

From 471a221b975e549e462e7ff9488c65ad182fe278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 2 Jun 2023 12:39:14 +0100
Subject: [PATCH 046/123] Fix improper blocking assignment & linting.

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index ef5fa7d682..b89b58f55b 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -42,12 +42,12 @@ module mvu_axi_tb();
 	localparam int unsigned SEGMENTLEN = 2;
 	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
 	localparam bit FORCE_BEHAVIORAL = 1;
-	// Bit-width config  
+	// Bit-width config
 	localparam int unsigned ACTIVATION_WIDTH = 8;
 	localparam int unsigned WEIGHT_WIDTH = 8;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
 	localparam bit SIGNED_ACTIVATIONS = 0;
-	// Simulation constants  
+	// Simulation constants
 	localparam int unsigned NF = MH/PE;
 	localparam int unsigned SF = MW/SIMD;
 	localparam int unsigned NUM_OF_DSP = SIMD/3;
@@ -57,7 +57,7 @@ module mvu_axi_tb();
 	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
 
-	// Generate clk and reset signal   
+	// Generate clk and reset signal
 	logic clk = 0;
 	always #5ns clk = !clk;
 
@@ -69,7 +69,7 @@ module mvu_axi_tb();
 
 	uwire ap_clk = clk;
 
-	// Generate activations  
+	// Generate activations
 	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
 	typedef activation_t activation_vector_t[SF];
 
@@ -94,8 +94,8 @@ module mvu_axi_tb();
 
 		for (int i=0; i<SF; i++) begin
 			activations.dat <= ACTIVATIONS[i];
-			do begin 
-				activations.vld = $urandom()%7 >= 1;
+			do begin
+				activations.vld <= $urandom()%7 >= 1;
 				@(posedge clk);
 			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
@@ -104,9 +104,9 @@ module mvu_axi_tb();
 		activations.dat <= 'x;
 	end
 
-	// Generate weights   
+	// Generate weights
 	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF]; 
+	typedef weight_t weight_matrix_t[NF][SF];
 
 	function weight_matrix_t init_WEIGHTS;
 		automatic weight_matrix_t res;
@@ -139,7 +139,7 @@ module mvu_axi_tb();
 		weights.dat <= 'x;
 	end
 
-	// Function to compute golden output  
+	// Function to compute golden output
 	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
 	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
 	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
@@ -155,12 +155,12 @@ module mvu_axi_tb();
 		automatic output_vector_t res = '{default: 0};
 		for (int j = 0; j<MH; j++) begin
 			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS==1) 
+				if (SIGNED_ACTIVATIONS)
 					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
 				else
 					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
 			end
-		end  
+		end
 		return res;
 	endfunction : check_output;
 
@@ -179,16 +179,16 @@ module mvu_axi_tb();
 			// Compare produced outputs against golden outputs
 			foreach(outputs.dat[i]) begin
 				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin 
+				else begin
 					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
 					$stop;
-				end  
+				end
 			end
-			
+
 			NF_CNT += 1;
 		end
 
-		$finish;  
+		$finish;
 	end
 
 	// Instantiate DUT
@@ -211,5 +211,5 @@ module mvu_axi_tb();
 		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
 		.m_axis_output_tready(outputs.rdy)
 	);
-  
+
 endmodule : mvu_axi_tb

From 5c5dc09c98d4e1a07a7e4cae17ca358b197a57c8 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 13:35:04 +0100
Subject: [PATCH 047/123] [test rtl mvu]: modified/extended test cases

---
 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 20a249bd08..3db7a718f5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -86,13 +86,12 @@ def prepare_inputs(input_tensor):
     return {"inp": input_tensor}
 
 @pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [90])
-#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16])
-@pytest.mark.parametrize("pe", [16])
+@pytest.mark.parametrize("mw", [32])
+@pytest.mark.parametrize("pe", [1, 4, 16])
 #@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [90])
-@pytest.mark.parametrize("idt", [DataType["INT8"]])
-@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+@pytest.mark.parametrize("simd", [1, 4, 32])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
 #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
 @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
 @pytest.mark.parametrize("segmentlen", [1])
@@ -166,7 +165,3 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
 
     assert (output_mvau_hls == output_mvau_rtl).all()
     assert (output_mvau_hls.size > 0)
-
-
-# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl"
-# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim"
\ No newline at end of file

From b4eb9b69a8a6920fdb3141752395e672f78479e3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 30 Jun 2023 15:36:17 +0100
Subject: [PATCH 048/123] [rtl mvu]: updated DSP58 >4-bit variant to lift
 SIMD%3==0 restriction

---
 finn-rtllib/mvu/mvu_8sx9.sv | 103 +++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 38 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index a601066cfd..439fbc44f9 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -92,77 +92,95 @@ module mvu_8sx9 #(
 			if (rst)      Z <= '{default: 0};
 			else if(en) begin
 				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
 			end
 		end
 	end;
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-	typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
 
 	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
 		localparam int TOTAL_PREGS = i/SEGLEN;
 		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+		localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
 		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-			a_buffer_t A [0:EXTERNAL_PREGS-1];
+			logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
 			always_ff @(posedge clk) begin
 				if (rst)     A <= '{default: 0};
 				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+					A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED];
 					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
 				end
 			end
-			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
-									: { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+											: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+			end : genAin
+			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+				assign a_in_i[i][9*j +: 9] = 9'b0;
+			end : genAinZero
 		end : genExternalPregAct
 		else begin : genInpDSPAct
-			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
-									: { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
+											: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
+			end : genAin
+			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+				assign a_in_i[i][9*j +: 9] = 9'b0;
+			end : genAinZero
 		end : genInpDSPAct
 
 	end : genActSIMD
 
 //-------------------- Buffer for weights --------------------\\
 	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-	typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
 
-	for (genvar j=0; j<PE; j++) begin : genWeightPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN;
 			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
 
 			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
 				always_ff @(posedge clk) begin
 					if (rst)    B <= '{default: 0};
 					else if (en) begin
-						B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
-						if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+						B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
-				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
 			end : genExternalPregWeight
 			else begin : genInpDSPWeight
-				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
 			end : genInpDSPWeight
 		end : genWeightSIMD
-
 	end : genWeightPE
 
 //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar j=0; j<PE; j++) begin : genDSPPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
-			localparam int TOTAL_PREGS = i/SEGLEN;
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN;
 			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-			localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
-			localparam bit FIRST = i == 0;
-			localparam bit LAST = i == CHAINLEN-1;
-			uwire [57:0] pp;
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
 
 			if (LAST) begin : genPOUT
-				assign p[j] = pp[ACCU_WIDTH-1:0];
+				assign p[i] = pcout[i][j][ACCU_WIDTH-1:0];
 			end
 
 			// Note: Since the product B * AD is computed,
@@ -174,7 +192,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Areg <= '{ default : 0};
 					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[i] };
+						Areg[0] <= { 7'bx, a_in_i[j] };
 						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
 					end
 				end
@@ -182,7 +200,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Breg <= '{ default : 0};
 					else if (en) begin
-						Breg[0] <= b_in_i[j][i];
+						Breg[0] <= b_in_i[i][j];
 						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
 					end
 				end
@@ -217,27 +235,36 @@ module mvu_8sx9 #(
 					end
 					else	assign Preg = Mreg;
 				end
-				else if (LAST) begin : genLast
+				else if (FIRST && LAST) begin : genSingle
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
+					end
+				end
+				else if (!FIRST && LAST) begin : genLast
 					always_ff @(posedge clk) begin
 						if (rst)		Opmode <= 0;
 						else if (en)	Opmode <= L[1];
 					end
 					always_ff @(posedge clk) begin
 						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[j][i-1];
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
 					end
 				end
 				else begin : genMid
 					if (PREG) begin : genPregBehav
 						always_ff @(posedge clk) begin
 							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[j][i-1];
+							else if (en)	Preg <= Mreg + pcout[i][j-1];
 						end
 					end
-					else	assign Preg = Mreg + pcout[j][i-1];
+					else	assign Preg = Mreg + pcout[i][j-1];
 				end
-				assign pp = Preg;
-				assign pcout[j][i] = pp;
+				assign pcout[i][j] = Preg;
 			end : genBehav
 `ifndef VERILATOR
 			else begin: genDSP
@@ -307,7 +334,7 @@ module mvu_8sx9 #(
 					.BCOUT(),                           // 24-bit output: B cascade
 					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
 					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
 					// Control outputs: Control Inputs/Status Bits
 					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
 					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
@@ -322,7 +349,7 @@ module mvu_8sx9 #(
 					.BCIN('x),                          // 24-bit input: B cascade
 					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
 					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
 					// Control inputs: Control Inputs/Status Bits
 					.ALUMODE(4'h0),                     // 4-bit input: ALU control
 					.CARRYINSEL('0),                    // 3-bit input: Carry select
@@ -339,8 +366,8 @@ module mvu_8sx9 #(
 							7'b000_0000
 					}), // 9-bit input: Operation mode
 					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-					.B(b_in_i[j][i]),                   // 24-bit input: B data
+					.A({ 7'bx, a_in_i[j] }),            // 34-bit input: A data
+					.B(b_in_i[i][j]),                   // 24-bit input: B data
 					.C('x),                             // 58-bit input: C data
 					.CARRYIN('0),                       // 1-bit input: Carry-in
 					.D('x),                             // 27-bit input: D data

From ad63673cda849ecf0df993bc83d00e676998ab03 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 30 Jun 2023 15:45:26 +0100
Subject: [PATCH 049/123] [rtl mvu]: bug fix for SIMD=1 init_leave_loads

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 2 +-
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 4674576d23..ac95b5f8a9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -296,7 +296,7 @@ module mvu_4sx4u #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so fill with a placeholder value; otherwise init_leave_loads ends up in an infinite loop
 
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 3cd9cef560..416c12c1cc 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -297,7 +297,7 @@ module mvu_8sx8u_dsp48 #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
 
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;

From 79e8a5ef208f7bcdeafa231a5a3dff74177008c9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 13 Jul 2023 18:34:05 +0100
Subject: [PATCH 050/123] [mvu rtl]: restrict index i to be less than 3 (within
 bounds of hi4)

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index ac95b5f8a9..88985312c9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -324,7 +324,7 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
-			else begin : genHiZero
+			else if (i < 3) begin : genHiZero
 				assign hi4[i] = '0;
 			end : genHiZero
 

From e3493c30529949a77a3f384fd75c030c551cd2cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 2 Jun 2023 12:47:53 +0100
Subject: [PATCH 051/123] Rewrite replay_buffer for input elasticity.

---
 finn-rtllib/mvu/replay_buffer.sv       | 153 ++++++++++++++++++-------
 finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 +++++++++++++++++++++
 2 files changed, 242 insertions(+), 41 deletions(-)
 create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 89bbbdb88f..3dfe72d6c6 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,60 +51,131 @@ module replay_buffer #(
 	input	logic  ordy
 );
 
-	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
-	count_t  Count = 0;
-	uwire  done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
-	uwire  done_rep;
-	uwire  done_all = done_len && done_rep;
+	if(LEN == 0)  initial begin
+		$error("%m: Illegal zero sequence LEN.");
+		$finish;
+	end
+	if(REP == 0) initial begin
+		$error("%m: Illegal zero REP count.");
+		$finish;
+	end
 
+	// Track position in Sequence
+	uwire  last_item;
 	uwire  shift;
-	uwire  clr = rst || (done_all && shift);
-	always_ff @(posedge clk) begin
-		if(clr)         Count <= 0;
-		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
+	if(LEN == 1)  assign  last_item = 1;
+	else begin
+		typedef logic [$clog2(LEN)-1:0]  count_t;
+		count_t  Count = 0;
+		logic    Last  = 0;
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				Count <= 0;
+				Last  <= 0;
+			end
+			else if(shift) begin
+				Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1);
+				Last  <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last);
+			end
+		end
+		assign	last_item = Last;
 	end
 
-	typedef logic [W-1:0]  data_t;
-	uwire data_t  rdat;
-	uwire  first_rep;
 	if(REP == 1) begin
-		assign	done_rep  = 1;
-		assign	first_rep = 1;
-		assign	rdat = 'x;
+		assign	shift = ivld && ordy;
+
+		assign	irdy  = ordy;
+		assign	odat  = idat;
+		assign	olast = last_item;
+		assign	ofin  = last_item;
+		assign	ovld  = ivld;
 	end
 	else begin
-		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;
 
-		logic  FirstRep = 1;
+		// Track Repetitions
+		uwire  last_rep;
+		if(1) begin : blkRep
+			typedef logic [$clog2(REP)-1:0]  rep_t;
+			rep_t  RepCnt = 0;
+			logic  RepLst = 0;
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					RepCnt <= 0;
+					RepLst <= 0;
+				end
+				else if(last_item && shift) begin
+					RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1);
+					RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst);
+				end
+			end
+			assign	last_rep = RepLst;
+		end : blkRep
+
+		localparam int unsigned  AWIDTH = $clog2(LEN);
+		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
+		typedef logic [W     -1:0]  data_t;
+
+		// Output Registers
+		data_t  ODat;
+		logic   OVld =  0;
+		logic   OLst = 'x;
+		logic   OFin = 'x;
+		assign	odat  = ODat;
+		assign	olast = OLst;
+		assign	ofin  = OFin;
+		assign	ovld  = OVld;
+
+		// Buffer Memory Management
+		data_t  Mem[2**AWIDTH];
+		ptr_t  WP = 0;	// Write Pointer
+		ptr_t  RP = 0;	// Read Pointer
+		ptr_t  FP = 0;	// Free Pointer
+
+		// Operational Guards
+		//	Occupancy:    WP-FP
+		//	  WP-FP < 2**AWIDTH -> writing allowed
+		//		- increments WP
+		//	Availability: WP-RP
+		//	  WP-RP > 0         -> reading allowed
+		//		- increments RP, last in sequence rewinds to FP for non-final repetition
+		//		- increments FP in last repetition
+		assign	irdy = !((WP-FP) >> AWIDTH);
+
+		uwire  wr = irdy && ivld;
+		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(clr)         FirstRep <= 1;
-			else if(shift)  FirstRep <= FirstRep && !done_len;
+			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
 		end
-		assign	first_rep = FirstRep;
 
-		data_t  Buf[LEN];
-		if(LEN == 1) begin : genTrivial
-			always_ff @(posedge clk) begin
-				if(shift && FirstRep)  Buf[0] <= idat;
+		uwire  vld = (RP != WP);
+		assign	shift = rd && vld;
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				WP <= 0;
+				RP <= 0;
+				FP <= 0;
+
+				OVld <=  0;
+				OLst <= 'x;
+				OFin <= 'x;
 			end
-		end : genTrivial
-		else begin : genShift
-			always_ff @(posedge clk) begin
-				if(shift) begin
-					Buf[0] <= odat;
-					Buf[1:LEN-1] <= Buf[0:LEN-2];
+			else begin
+				if(wr)  WP <= WP + 1;
+				if(rd) begin
+					if(vld) begin
+						automatic logic  rewind = last_item && !last_rep;
+						RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1);
+						FP <= FP + last_rep;
+					end
+
+					OVld <= vld;
+					OLst <= last_item;
+					OFin <= last_rep && last_item;
 				end
 			end
-		end : genShift
+		end
 
-		assign	rdat = Buf[LEN-1];
 	end
 
-	assign  irdy  = ordy && first_rep;
-	assign	odat  = first_rep? idat : rdat;
-	assign	olast = done_len;
-	assign	ofin  = done_all;
-	assign	ovld  = first_rep? ivld : 1;
-	assign	shift = ovld && ordy;
-
-endmodule : replay_buffer
\ No newline at end of file
+endmodule : replay_buffer
diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
new file mode 100644
index 0000000000..5581354e0e
--- /dev/null
+++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
@@ -0,0 +1,130 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for replay_buffer module.
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ *****************************************************************************/
+
+module replay_buffer_tb;
+
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	uwire  rst = 0;
+
+	// DUT Geometries
+	localparam int unsigned  DIMS[3] = '{ 7, 8, 10 };
+	localparam int unsigned  W = 8;
+	typedef logic [W-1:0]  data_t;
+
+	bit [2**$size(DIMS)-1:0]  done = 0;
+	always_comb begin
+		if(&done) begin
+			$display("Test completed.");
+			$finish;
+		end
+	end
+
+	// Parallel DUT Instantiations
+	for(genvar  r = 0; r < $size(DIMS); r++) begin
+		for(genvar  l = 0; l < $size(DIMS); l++) begin
+			localparam int unsigned  REP = DIMS[r];
+			localparam int unsigned  LEN = DIMS[l];
+
+			data_t  idat;
+			logic  ivld;
+			uwire  irdy;
+
+			uwire data_t  odat;
+			uwire  olast;
+			uwire  ofin;
+			uwire  ovld;
+			logic  ordy;
+
+			replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut (
+				.clk, .rst,
+				.idat, .ivld, .irdy,
+				.odat, .olast, .ofin, .ovld, .ordy
+			);
+
+			// Input Feed: 0, 1, ..., 10*LEN-1
+			initial begin
+				idat = 'x;
+				ivld =  0;
+				@(posedge clk iff !rst);
+
+				for(int unsigned  i = 0; i < 10*LEN; i++) begin
+					idat <= i;
+					ivld <= 1;
+					@(posedge clk iff irdy);
+					idat <= 'x;
+					ivld <=  0;
+					while($urandom()%(REP-1) != 0) @(posedge clk);
+				end
+			end
+
+			// Output Check
+			initial begin
+				automatic int unsigned  base = 0;
+
+				ordy = 0;
+				@(posedge clk iff !rst);
+
+				for(int unsigned  k = 0; k < 10; k++) begin
+					for(int unsigned  j = 0; j < REP; j++) begin
+						for(int unsigned  i = 0; i < LEN; i++) begin
+							ordy <= 1;
+							@(posedge clk iff ovld);
+							assert(odat == base+i) else begin
+								$error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i);
+								$stop;
+							end
+							assert(olast == (i == LEN-1)) else begin
+								$error("#%0d.%0d: Last mismatch.", r, l);
+								$stop;
+							end
+							assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin
+								$error("#%0d.%0d: Fin mismatch.", r, l);
+								$stop;
+							end
+
+							ordy <= 0;
+							while($urandom()%13 == 0) @(posedge clk);
+						end
+					end
+					base += LEN;
+				end
+
+				done[$size(DIMS)*r + l] <= 1;
+			end
+		end
+	end
+
+endmodule : replay_buffer_tb

From 2efba6854267873c84d58f6d8fe6b64f649eaa99 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 5 Sep 2023 13:53:01 +0100
Subject: [PATCH 052/123] [to-rtl]: Infer unique node names after
 transformation is applied

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 23b6e59abe..47ed5ce863 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -32,6 +32,7 @@
 from onnx import helper
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.general import GiveUniqueNodeNames
 from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
 
 class InferRTLMatrixVectorActivation(Transformation):
@@ -105,5 +106,6 @@ def apply(self, model):
             model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
         
         return (model, graph_modified)
\ No newline at end of file

From 114ea1bfed2dd2f14196f98aea97d6cac9d1d57e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 14:56:07 +0100
Subject: [PATCH 053/123] [mvu rtl]: add synthesis directive to handle 'X in
 simulation

---
 finn-rtllib/mvu/mvu_8sx9.sv | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 439fbc44f9..34aa856b1b 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -110,13 +110,17 @@ module mvu_8sx9 #(
 			always_ff @(posedge clk) begin
 				if (rst)     A <= '{default: 0};
 				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED];
+					A[EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+						zero ? '1 : 
+// synthesis translate_on						
+						a[3*i +: LANES_OCCUPIED];
 					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
 				end
 			end
 			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-											: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+			assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+												: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
 			end : genAin
 			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
 				assign a_in_i[i][9*j +: 9] = 9'b0;
@@ -124,8 +128,12 @@ module mvu_8sx9 #(
 		end : genExternalPregAct
 		else begin : genInpDSPAct
 			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
-											: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
+				assign a_in_i[i][9*j +: 9] = 
+// synthesis translate_off
+					zero ? '1 : 				
+// synthesis translate_on
+					SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
+												: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
 			end : genAin
 			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
 				assign a_in_i[i][9*j +: 9] = 9'b0;
@@ -148,7 +156,11 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)    B <= '{default: 0};
 					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED];
+						B[i][EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+							zero ? '1 : 						
+// synthesis translate_on							
+							w[i][3*j +: LANES_OCCUPIED];
 						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
@@ -161,7 +173,11 @@ module mvu_8sx9 #(
 			end : genExternalPregWeight
 			else begin : genInpDSPWeight
 				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+					assign b_in_i[i][j][8*k +: 8] = 
+// synthesis translate_off					
+						zero ? '1 : 
+// synthesis translate_on					
+						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
 				end : genBin
 				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
 					assign b_in_i[i][j][8*k +: 8] = 8'b0;
@@ -178,9 +194,10 @@ module mvu_8sx9 #(
 			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
 			localparam bit FIRST = j == 0;
 			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
 
 			if (LAST) begin : genPOUT
-				assign p[i] = pcout[i][j][ACCU_WIDTH-1:0];
+				assign p[i] = pp[ACCU_WIDTH-1:0];
 			end
 
 			// Note: Since the product B * AD is computed,
@@ -264,6 +281,7 @@ module mvu_8sx9 #(
 					end
 					else	assign Preg = Mreg + pcout[i][j-1];
 				end
+				assign pp = Preg;
 				assign pcout[i][j] = Preg;
 			end : genBehav
 `ifndef VERILATOR

From 79fafdb25a8707f740a0a7e21aa4f55ef7101882 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 15:06:36 +0100
Subject: [PATCH 054/123] [replay buffer rtl]: minor fix for LEN=1 (i.e.
 AWIDTH=0)

---
 finn-rtllib/mvu/replay_buffer.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 3dfe72d6c6..942f1823ca 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -144,8 +144,8 @@ module replay_buffer #(
 		uwire  wr = irdy && ivld;
 		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
+			if(wr)  Mem[WP[AWIDTH:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH:0]];
 		end
 
 		uwire  vld = (RP != WP);

From 619d9db0d5872d1afd72b1d1df841e1f87a9f33a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 15:09:45 +0100
Subject: [PATCH 055/123] [mvu lut]: LUT-based MVU compute core

---
 finn-rtllib/mvu/mvu_lut.sv | 102 +++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
new file mode 100644
index 0000000000..b100a589e8
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_lut.sv
@@ -0,0 +1,102 @@
+module mvu_lut #(
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+    int unsigned  ACTIVATION_WIDTH,
+    int unsigned  WEIGHT_WIDTH,
+    bit  SIGNED_ACTIVATIONS,
+    bit  M_REG = 1,
+
+    localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]      w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+
+	// Ouput
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+    typedef int unsigned  leave_load_t[2*SIMD-1];
+    function leave_load_t init_leave_loads();
+        automatic leave_load_t  res;
+        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+        return res;
+    endfunction : init_leave_loads
+
+    // Pipeline for last indicator flag
+    uwire last_i;
+    generate if (M_REG) begin
+        logic [0:1] L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= {last, L[0]};
+        end
+        assign  last_i = L[1];
+    end
+    else begin 
+        logic L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= last;
+        end
+        assign  last_i = L;
+    end
+    endgenerate
+
+    // For each PE generate
+    for (genvar  i = 0; i < PE; i++)  begin : genPE
+        // Stage #1: SIMD multipliers in parallel
+        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
+        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
+            if (M_REG) begin : genMreg
+                logic [MULT_WIDTH-1 : 0] M [SIMD];
+                always_ff @(posedge clk) begin
+                    if(rst)         M[j] = '{ default : 0 };
+                    else if (en)    M[j] = zero ? 0 :
+                                            SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[j]}) * $signed(w[i][j]); 
+                    // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                end
+                assign  m1[j] = M[j];
+            end : genMreg
+            else begin : genNoMreg 
+                assign m1[j] = zero ? 0 :
+                               SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[j]}) * $signed(w[i][j]);
+            end : genNoMreg
+        end : genSIMD
+
+        // Stage #2: Adder tree to reduce SIMD products
+        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
+        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
+        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
+        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
+        for(genvar n = 0; n < SIMD-1; n++) begin
+            // Sum truncated to actual maximum bit width at this node
+            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
+            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+            assign tree[n] = s;
+        end
+
+        // Stage #3: Buffer output
+        logic [ACCU_WIDTH-1:0] P2 [PE];
+        always_ff @(posedge clk) begin
+            if(rst)         P2[i] = '{ default : 0};
+            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
+        end
+
+        assign  vld = last_i;
+        assign  p[i] = P2[i];
+    end : genPE
+
+endmodule : mvu_lut

From 090f2ac4adf4b0523b23b27fce05f7422269d72a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 19 Sep 2023 12:23:55 +0100
Subject: [PATCH 056/123] [custom op]: add preferred_backend attribute

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 73d39ce642..4f24d71ccc 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -123,7 +123,7 @@ def get_nodeattr_types(self):
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
             # Flag to specify whether RTL-based or HLS-based implementation is preferred
-            "impl": ("s", False, "rtl", {"hls", "rtl"})
+            "preferred_backend": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From ac5e82d9944f5b7475eb13546affd1bc03d57f4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 21 Sep 2023 13:03:27 +0100
Subject: [PATCH 057/123] Ensure a minimum of two buffer slots even for
 length-1 sequences.

---
 finn-rtllib/mvu/replay_buffer.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 942f1823ca..d4342f705c 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -111,7 +111,7 @@ module replay_buffer #(
 			assign	last_rep = RepLst;
 		end : blkRep
 
-		localparam int unsigned  AWIDTH = $clog2(LEN);
+		localparam int unsigned  AWIDTH = LEN < 2? 1 : $clog2(LEN);
 		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
 		typedef logic [W     -1:0]  data_t;
 

From 85156935163fc803d453db5ce2c1c5163808bc9f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 15:07:12 +0100
Subject: [PATCH 058/123] [rtl mvu wrapper]: support for vvu layer and rename

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 +++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
new file mode 100644
index 0000000000..6dbf82cb7b
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-lite wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter	IS_MVU = "$IS_MVU$",
+	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
+	parameter	MW = $MW$,
+	parameter	MH = $MH$,
+	parameter	PE = $PE$,
+	parameter	SIMD = $SIMD$,
+	parameter	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter	SEGMENTLEN = $SEGMENTLEN$,
+	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
+
+	// Safely deducible parameters
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)(
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
+
+	// Weight Stream
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
+	input   weights_V_TVALID,
+	output  weights_V_TREADY,
+	// Input Stream
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
+	input	in0_V_TVALID,
+	output	in0_V_TREADY,
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
+	output	out_V_TVALID,
+	input	out_V_TREADY
+);
+
+mvu_vvu_axi #(
+	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(weights_V_TDATA),
+	.s_axis_weights_tvalid(weights_V_TVALID),
+	.s_axis_weights_tready(weights_V_TREADY),
+	.s_axis_input_tdata(in0_V_TDATA),
+	.s_axis_input_tvalid(in0_V_TVALID),
+	.s_axis_input_tready(in0_V_TREADY),
+	.m_axis_output_tdata(out_V_TDATA),
+	.m_axis_output_tvalid(out_V_TVALID),
+	.m_axis_output_tready(out_V_TREADY)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$

From cf28d780041fec1effdf743e62390eebc5c81f98 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:24:18 +0100
Subject: [PATCH 059/123] [mvu vvu tb]: modified testbench to also support
 testing VVU on DSP58

---
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 222 +++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
new file mode 100644
index 0000000000..82c2e8e7b0
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-lite interface wrapper.
+ *****************************************************************************/
+
+module mvu_vvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 1;
+	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+	localparam int unsigned MW = 1500;
+	localparam int unsigned MH = 256;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned PE = 16;
+	localparam int unsigned SEGMENTLEN = 2.0;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW)
+	localparam bit SIGNED_ACTIVATIONS = 0;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin
+			for (int i=0; i<SF; i++) begin
+				activations.dat <= ACTIVATIONS[SF*j+i];
+				do begin
+					activations.vld <= $urandom()%7 >= 0;
+					@(posedge clk);
+				end while (!(activations.vld === 1 && activations.rdy === 1));
+			end
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output
+	// a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				if (SIGNED_ACTIVATIONS)
+					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+				else
+					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 0;
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;
+	end
+
+	// Instantiate DUT
+	mvu_vvu_axi #(
+		.IS_MVU(IS_MVU),
+		.COMPUTE_CORE(COMPUTE_CORE),
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
+		.M_REG_LUT(M_REG_LUT)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : mvu_vvu_axi_tb

From 2617c391e1d2c9b19fb881acb6012fc56df35eae Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:25:22 +0100
Subject: [PATCH 060/123] [axi wrapper]: minor modification to comment
 description

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 6dbf82cb7b..788e49a71b 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -28,7 +28,7 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Verilog AXI-lite wrapper for MVU.
+ * @brief	Verilog AXI-lite wrapper for MVU & VVU.
  *****************************************************************************/
 
 module $MODULE_NAME_AXI_WRAPPER$ #(

From 8ca5fe73c003aec3e7998d83e233102c012dd531 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:34:12 +0100
Subject: [PATCH 061/123] [mvu axi]: add support for VVU on DSP58

---
 finn-rtllib/mvu/mvu_axi.sv | 105 ++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index 46167af95b..07ad32e6c8 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -28,19 +28,25 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-Stream interface wrapper.
  * @details
+ *	 The following compute cores are supported:
+ *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
+ *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ *     'unconstrained' LUT-based MVU and VVU.
  *  Folding hints:
- *	 - 4-bit MVU:          PE scaling should divide MH.
- *	 - 8-bit MVU - DSP48:  PE scaling should divide MH.
- *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3 and divide MW.
+ *	 - PE scaling should divide MH.
+ *   - SIMD scaling should divide MW.
  *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
  *	   impact critical paths more than PE scaling. PE scaling implies a
  *	   bigger fanout on the input activations.
  *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
  *****************************************************************************/
 
-module mvu_axi #(
+module mvu_vvu_axi #(
+	bit IS_MVU,
+	parameter COMPUTE_CORE, // string type causes error in Vivado
 	int unsigned MW,
 	int unsigned MH,
 	int unsigned PE,
@@ -51,16 +57,16 @@ module mvu_axi #(
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
 	bit FORCE_BEHAVIORAL = 0,
-	parameter MVU_IMPL_STYLE, // string type causes error in Vivado
+	bit M_REG_LUT = 1,
 
+	// Safely deducible parameters
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_LANES = PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -93,27 +99,31 @@ module mvu_axi #(
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
-		if (ACTIVATION_WIDTH > 9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-			$finish;
-		end
 		if (WEIGHT_WIDTH > 8) begin
 			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
 			$finish;
 		end
-		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-			$finish;
+		if (ACTIVATION_WIDTH > 8) begin
+			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
+				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
+				$finish;
+			end
 		end
-		if (MVU_IMPL_STYLE == "mvu_8sx9") begin
+		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
 			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
 			end
 			if (SEGMENTLEN > (SIMD+2)/3) begin
 				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
 				$finish;
 			end
 		end
+		if (!IS_MVU) begin
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
 	end
 
 	uwire clk = ap_clk;
@@ -127,10 +137,10 @@ module mvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
-		.clk, .rst,
-		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	.clk, .rst,
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
 //-------------------- Input control --------------------\\
@@ -139,37 +149,60 @@ module mvu_axi #(
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU --------------------\\
+//-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	case(MVU_IMPL_STYLE)
-	"mvu_8sx9_dsp58":
-		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau; // MVU: activations are forwarded to the core unchanged
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; // activation elements per input word
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire // target element i = p*SIMD+s reads source element s*PE+p
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
+									amvau[(i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH +: ACTIVATION_WIDTH] // indexed part-select: exactly ACTIVATION_WIDTH bits, never past the MSB of amvau
+									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
+
+	case(COMPUTE_CORE)
+	"mvu_vvu_8sx9_dsp58":
+		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_vvu_lut":
+		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	default: initial begin
-		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
+		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 		$finish;
 	end
 	endcase
@@ -203,7 +236,7 @@ module mvu_axi #(
 
 	assign	b_load = !B.vld || m_axis_output_tready;
 	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ default: 'x };
+		if(rst)		B <= '{ vld: 0, default: 'x };
 		else begin
 			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
 		end
@@ -212,4 +245,4 @@ module mvu_axi #(
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 
-endmodule : mvu_axi
+endmodule : mvu_vvu_axi

From 32d6338c626b26d2e48cdb21cde438d557cc9bcd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:34:36 +0100
Subject: [PATCH 062/123] [mvu vvu axi]: renamed file for consistency purposes

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 248 +++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
new file mode 100644
index 0000000000..07ad32e6c8
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -0,0 +1,248 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-Stream interface wrapper.
+ * @details
+ *	 The following compute cores are supported:
+ *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
+ *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ *     'unconstrained' LUT-based MVU and VVU.
+ *  Folding hints:
+ *	 - PE scaling should divide MH.
+ *   - SIMD scaling should divide MW.
+ *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ *	   impact critical paths more than PE scaling. PE scaling implies a
+ *	   bigger fanout on the input activations.
+ *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
+ *****************************************************************************/
+
+module mvu_vvu_axi #(
+	bit IS_MVU,
+	parameter COMPUTE_CORE, // string type causes error in Vivado
+	int unsigned MW,
+	int unsigned MH,
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,
+	bit FORCE_BEHAVIORAL = 0,
+	bit M_REG_LUT = 1,
+
+	// Safely deducible parameters
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned NF = MH/PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin
+		if (MW % SIMD != 0) begin
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if (MH % PE != 0) begin
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (ACTIVATION_WIDTH > 8) begin
+			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
+				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
+				$finish;
+			end
+		end
+		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
+			if (SEGMENTLEN == 0) begin
+				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			end
+			if (SEGMENTLEN > (SIMD+2)/3) begin
+				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+				$finish;
+			end
+		end
+		if (!IS_MVU) begin
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	.clk, .rst,
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;
+	assign ardy = en && s_axis_weights_tvalid;
+	assign s_axis_weights_tready = en && avld;
+
+//-------------------- Core MVU/VVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau; // MVU: activations are forwarded to the core unchanged
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; // activation elements per input word
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire // target element i = p*SIMD+s reads source element s*PE+p
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
+									amvau[(i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH +: ACTIVATION_WIDTH] // indexed part-select: exactly ACTIVATION_WIDTH bits, never past the MSB of amvau
+									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
+
+	case(COMPUTE_CORE)
+	"mvu_vvu_8sx9_dsp58":
+		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_4sx4u":
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_8sx8u_dsp48":
+		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_vvu_lut":
+		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	default: initial begin
+		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
+		$finish;
+	end
+	endcase
+
+//-------------------- Output register slice --------------------\\
+	struct packed {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};
+
+	assign en = !A.vld || !ovld;
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)		A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+
+	struct packed {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};
+
+	assign	b_load = !B.vld || m_axis_output_tready;
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x };
+		else begin
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;
+
+endmodule : mvu_vvu_axi

From 031406d73fa36a02638a94affd6a0bef36956c3c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:39:22 +0100
Subject: [PATCH 063/123] [mvu 8sx9]: added support for VVU on DSP58, resolved
 PyVerilator-caused error and added synthesis directive to handle 'X in input
 data

---
 finn-rtllib/mvu/mvu_8sx9.sv | 100 +++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 34aa856b1b..52a93739d6 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -31,7 +31,8 @@
  * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
  *****************************************************************************/
 
-module mvu_8sx9 #(
+module mvu_vvu_8sx9 #(
+	parameter IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
@@ -39,7 +40,9 @@ module mvu_8sx9 #(
 	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0
+	bit FORCE_BEHAVIORAL = 0,
+
+	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
   )
   (
     // Global Control
@@ -51,7 +54,7 @@ module mvu_8sx9 #(
     input   logic last,
     input   logic zero, // ignore current inputs and force this partial product to zero
     input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
 
 	// Ouput
 	output  logic vld,
@@ -67,9 +70,10 @@ module mvu_8sx9 #(
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
 	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	uwire [26:0] a_in_i [CHAINLEN];
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
 	uwire [23:0] b_in_i [PE][CHAINLEN];
-	uwire [57:0] pcout [PE][CHAINLEN];
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
 
 //-------------------- Shift register for opmode select signal --------------------\\
 	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
@@ -99,48 +103,48 @@ module mvu_8sx9 #(
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
-	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-		localparam int TOTAL_PREGS = i/SEGLEN;
-		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-		localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
-
-		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-			logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-			always_ff @(posedge clk) begin
-				if (rst)     A <= '{default: 0};
-				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-						zero ? '1 : 
-// synthesis translate_on						
-						a[3*i +: LANES_OCCUPIED];
-					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)     A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <= 
+	// synthesis translate_off
+							zero ? '1 : 
+	// synthesis translate_on						
+							a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
 				end
-			end
-			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-			assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-												: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
-			end : genAin
-			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-				assign a_in_i[i][9*j +: 9] = 9'b0;
-			end : genAinZero
-		end : genExternalPregAct
-		else begin : genInpDSPAct
-			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = 
-// synthesis translate_off
-					zero ? '1 : 				
-// synthesis translate_on
-					SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
-												: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
-			end : genAin
-			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-				assign a_in_i[i][9*j +: 9] = 9'b0;
-			end : genAinZero
-		end : genInpDSPAct
-
-	end : genActSIMD
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
+	// synthesis translate_off
+						zero ? '1 : 				
+	// synthesis translate_on
+						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
+													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
 
 //-------------------- Buffer for weights --------------------\\
 	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
@@ -209,7 +213,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Areg <= '{ default : 0};
 					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[j] };
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
 						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
 					end
 				end
@@ -384,7 +388,7 @@ module mvu_8sx9 #(
 							7'b000_0000
 					}), // 9-bit input: Operation mode
 					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[j] }),            // 34-bit input: A data
+					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
 					.B(b_in_i[i][j]),                   // 24-bit input: B data
 					.C('x),                             // 58-bit input: C data
 					.CARRYIN('0),                       // 1-bit input: Carry-in
@@ -420,4 +424,4 @@ module mvu_8sx9 #(
 		end : genDSPChain
 	end : genDSPPE
 
-endmodule : mvu_8sx9
+endmodule : mvu_vvu_8sx9

From e2c1f1589c374a2fd7d0eb17621568621ea88bda Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:39:52 +0100
Subject: [PATCH 064/123] [mvu vvu 8sx9]: renamed compute core for consistency

---
 finn-rtllib/mvu/mvu_vvu_8sx9.sv | 427 ++++++++++++++++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
new file mode 100644
index 0000000000..52a93739d6
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
+ *****************************************************************************/
+
+module mvu_vvu_8sx9 #(
+	parameter IS_MVU,
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,
+    int unsigned WEIGHT_WIDTH, // weights are sign-extended to 8 bits per DSP lane
+	int unsigned ACCU_WIDTH, // number of low-order accumulator bits exposed per PE on p
+    bit SIGNED_ACTIVATIONS = 0, // 1: sign-extend activations to 9 bits, 0: zero-extend them
+    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0,
+
+	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+  )
+  (
+    // Global Control
+	input   logic clk,
+    input   logic rst,
+    input   logic en,
+
+	// Input
+    input   logic last, // delayed through shift register L to produce vld and to restart the accumulation
+    input   logic zero, // ignore current inputs and force this partial product to zero
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+
+	// Output
+	output  logic vld, // last flag after it has travelled through shift register L (L[0])
+    output  logic [PE-1:0][ACCU_WIDTH-1:0] p // per PE: low ACCU_WIDTH bits of the last chain element's P value
+  );
+	// Always use the behavioral model under Verilator (the DSP58 branch is compiled out there)
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
+
+//-------------------- Declare global signals --------------------\\
+	localparam int unsigned CHAINLEN = (SIMD+2)/3; // chain elements per PE: each one takes up to 3 SIMD lanes
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; // IS_MVU: one activation vector shared by all PEs, otherwise one per PE
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; // per chain element: 3 lanes x 9-bit extended activations
+	uwire [23:0] b_in_i [PE][CHAINLEN]; // per chain element: 3 lanes x 8-bit sign-extended weights
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
+
+//-------------------- Shift register for opmode select signal --------------------\\
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+	always_ff @(posedge clk) begin
+		if(rst)     L <= '{default: 0};
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last;
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+		end
+	end
+	assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+		always_ff @(posedge clk) begin
+			if (rst)      Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
+			end
+		end
+	end;
+
+//-------------------- Buffer for input activations --------------------\\
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; // extension bits needed to widen an activation to a 9-bit lane
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN; // number of full segments preceding chain position i
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0; // stages built as explicit registers in this block
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; // the last chain element may use fewer than 3 lanes
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)     A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <= 
+	// synthesis translate_off
+							zero ? '1 : 
+	// synthesis translate_on						
+							a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
+	// synthesis translate_off
+						zero ? '1 : 				
+	// synthesis translate_on
+						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
+													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
+
+//-------------------- Buffer for weights --------------------\\
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; // sign-extension bits needed to widen a weight to an 8-bit lane
+
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN; // number of full segments preceding chain position j
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0; // stages built as explicit registers in this block
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; // the last chain element may use fewer than 3 lanes
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; // NOTE(review): declared per PE iteration, but only B[i] is written and read here
+				always_ff @(posedge clk) begin
+					if (rst)    B <= '{default: 0};
+					else if (en) begin
+						B[i][EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+							zero ? '1 : 						
+// synthesis translate_on							
+							w[i][3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = 
+// synthesis translate_off					
+						zero ? '1 : 
+// synthesis translate_on					
+						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
+			end : genInpDSPWeight
+		end : genWeightSIMD
+	end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN; // number of full segments preceding chain position j
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // A/B register stages of this chain element (AREG/BREG); original note: 1 : 0
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; // register P at segment ends and at the end of the chain
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT
+				assign p[i] = pp[ACCU_WIDTH-1:0];
+			end
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if(BEHAVIORAL) begin : genBehav
+				// Stage #1: Input A/B
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[i][j];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply-Accumulate
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0; // registered zero flag; forces the lane products below to 0 when set
+				always_ff @(posedge clk) begin
+					if (rst)		InmodeZero <= 0;
+					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst)	Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin
+							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+						end
+						Mreg <= m;
+					end
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0] Preg;
+				logic Opmode = 0; // registered L[1]; when set, the accumulator restarts from 0 instead of adding to Preg
+				if (FIRST && !LAST) begin : genFirst
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg;
+						end
+					end
+					else	assign Preg = Mreg;
+				end
+				else if (FIRST && LAST) begin : genSingle
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
+					end
+				end
+				else if (!FIRST && LAST) begin : genLast
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
+					end
+				end
+				else begin : genMid
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg + pcout[i][j-1];
+						end
+					end
+					else	assign Preg = Mreg + pcout[i][j-1];
+				end
+				assign pp = Preg;
+				assign pcout[i][j] = Preg;
+			end : genBehav
+`ifndef VERILATOR
+			else begin: genDSP
+				DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+														// legacy mode.
+					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+					.RND(58'h000000000000000),          // Rounding Constant
+					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
+										2'b01, // Y : M
+										2'b01  // X: M
+					}), // Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
+					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                           // Pipeline stages for C (0-1)
+					.DREG(0),                           // Pipeline stages for D (0-1)
+					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+					.MREG(1),                           // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+				)
+				DSP58_inst (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),                           // 34-bit output: A port cascade
+					.BCOUT(),                           // 24-bit output: B cascade
+					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+					// Data outputs: Data Ports
+					.CARRYOUT(),                        // 4-bit output: Carry
+					.P(pp),                             // 58-bit output: Primary data
+					.XOROUT(),                          // 8-bit output: XOR data
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),                          // 34-bit input: A cascade data
+					.BCIN('x),                          // 24-bit input: B cascade
+					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
+					// Control inputs: Control Inputs/Status Bits
+					.ALUMODE(4'h0),                     // 4-bit input: ALU control
+					.CARRYINSEL('0),                    // 3-bit input: Carry select
+					.CLK(clk),                          // 1-bit input: Clock
+					.INMODE({
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+							2'b00,
+							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
+					}),                                 // 5-bit input: INMODE control
+					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+					.OPMODE({
+							LAST ? {1'b0, L[1]} : 2'b00,
+							7'b000_0000
+					}), // 9-bit input: Operation mode
+					// Data inputs: Data Ports
+					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
+					.B(b_in_i[i][j]),                   // 24-bit input: B data
+					.C('x),                             // 58-bit input: C data
+					.CARRYIN('0),                       // 1-bit input: Carry-in
+					.D('x),                             // 27-bit input: D data
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),                           // 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),                           // 1-bit input: Clock enable for DREG
+					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+					.CEM(en),                           // 1-bit input: Clock enable for MREG
+					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+					.RSTA(rst),                         // 1-bit input: Reset for AREG
+					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+					.RSTB(rst),                         // 1-bit input: Reset for BREG
+					.RSTC('0),                          // 1-bit input: Reset for CREG
+					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+					.RSTM(rst),                         // 1-bit input: Reset for MREG
+					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+				);
+			end : genDSP
+`endif
+		end : genDSPChain
+	end : genDSPPE
+
+endmodule : mvu_vvu_8sx9

From adb58694be36bd0fa2e8558f760d1642f14a2a38 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:58:20 +0100
Subject: [PATCH 065/123] [axi wrapper]: changed parameter to localparam

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 788e49a71b..270fe7351f 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)

From f54d438f78fe4ce78c84fdd7bcbc514048bd2fe0 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:59:32 +0100
Subject: [PATCH 066/123] [axi]: added support for LUT-based VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 07ad32e6c8..ff677fc244 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -195,8 +195,8 @@ module mvu_vvu_axi #(
 			.vld(ovld), .p(odat)
 		);
 	"mvu_vvu_lut":
-		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)

From a4e2ac7146afeab4271344785f638c88cf78da73 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:00:07 +0100
Subject: [PATCH 067/123] [mvu vvu 8sx9]: minor change to list of generics

---
 finn-rtllib/mvu/mvu_vvu_8sx9.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
index 52a93739d6..2aa9d71b6c 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
@@ -32,7 +32,7 @@
  *****************************************************************************/
 
 module mvu_vvu_8sx9 #(
-	parameter IS_MVU,
+	bit IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
@@ -42,7 +42,7 @@ module mvu_vvu_8sx9 #(
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
 	bit FORCE_BEHAVIORAL = 0,
 
-	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
   )
   (
     // Global Control

From 40ad0b46c03b10b47ec4d72dd04a4ad96149fa89 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:00:51 +0100
Subject: [PATCH 068/123] [mvu lut]: added support for VVU

---
 finn-rtllib/mvu/mvu_lut.sv | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
index b100a589e8..c100910d75 100644
--- a/finn-rtllib/mvu/mvu_lut.sv
+++ b/finn-rtllib/mvu/mvu_lut.sv
@@ -1,13 +1,15 @@
-module mvu_lut #(
-	int unsigned  PE,
-	int unsigned  SIMD,
+module mvu_vvu_lut #(
+    bit IS_MVU,
+    int unsigned  PE,
+    int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
     int unsigned  ACTIVATION_WIDTH,
     int unsigned  WEIGHT_WIDTH,
     bit  SIGNED_ACTIVATIONS,
     bit  M_REG = 1,
 
-    localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
+    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
 )(
 	// Global Control
 	input	logic  clk,
@@ -17,8 +19,8 @@ module mvu_lut #(
 	// Input
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]      w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
+	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
 
 	// Ouput
 	output	logic  vld,
@@ -63,16 +65,16 @@ module mvu_lut #(
                 always_ff @(posedge clk) begin
                     if(rst)         M[j] = '{ default : 0 };
                     else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
+                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
                 end
                 assign  m1[j] = M[j];
             end : genMreg
             else begin : genNoMreg 
                 assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[j]}) * $signed(w[i][j]);
+                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
             end : genNoMreg
         end : genSIMD
 
@@ -99,4 +101,4 @@ module mvu_lut #(
         assign  p[i] = P2[i];
     end : genPE
 
-endmodule : mvu_lut
+endmodule : mvu_vvu_lut

From 30fcb5b734f86d0032549a4efe29d96b13ee5451 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:01:10 +0100
Subject: [PATCH 069/123] [mvu vvu lut]: renamed file for consistency

---
 finn-rtllib/mvu/mvu_vvu_lut.sv | 104 +++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv
new file mode 100644
index 0000000000..c100910d75
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_lut.sv
@@ -0,0 +1,104 @@
+// LUT-based MVU/VVU compute core: PE x SIMD multipliers, one adder tree per PE and an output accumulator.
+module mvu_vvu_lut #(
+    bit IS_MVU,	// 1: all PEs read the same SIMD activations, 0: PE i reads a[SIMD*i +: SIMD]
+    int unsigned  PE,
+    int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+    int unsigned  ACTIVATION_WIDTH,
+    int unsigned  WEIGHT_WIDTH,
+    bit  SIGNED_ACTIVATIONS,	// 1: activations are multiplied as signed values, 0: zero-extended by one bit first
+    bit  M_REG = 1,	// 1: register the products ahead of the adder tree (one extra cycle of latency)
+
+    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
+    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
+	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+
+	// Output
+	output	logic  vld,	// delayed last flag (last_i)
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+    typedef int unsigned  leave_load_t[2*SIMD-1];
+    function leave_load_t init_leave_loads();
+        automatic leave_load_t  res;
+        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+        return res;
+    endfunction : init_leave_loads
+
+    // Pipeline for last indicator flag: two register stages with M_REG, one without
+    uwire last_i;
+    if (M_REG) begin : genLastMreg
+        logic [0:1] L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= {last, L[0]};
+        end
+        assign  last_i = L[1];
+    end : genLastMreg
+    else begin : genLastNoMreg
+        logic L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= last;
+        end
+        assign  last_i = L;
+    end : genLastNoMreg
+    assign  vld = last_i;	// driven exactly once, outside the per-PE loop (vld is a variable-typed output)
+
+    // For each PE generate
+    for (genvar  i = 0; i < PE; i++)  begin : genPE
+        // Stage #1: SIMD multipliers in parallel
+        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
+        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
+            if (M_REG) begin : genMreg
+                logic [MULT_WIDTH-1 : 0] M;	// product register of lane j
+                always_ff @(posedge clk) begin
+                    if(rst)         M <= '0;
+                    else if (en)    M <= zero ? 0 :
+                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
+                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                end
+                assign  m1[j] = M;
+            end : genMreg
+            else begin : genNoMreg
+                assign m1[j] = zero ? 0 :
+                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
+            end : genNoMreg
+        end : genSIMD
+
+        // Stage #2: Adder tree to reduce SIMD products (node n sums nodes 2n+1 and 2n+2; leaves are the products)
+        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
+        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
+        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
+        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
+        for(genvar n = 0; n < SIMD-1; n++) begin
+            // Sum truncated to actual maximum bit width at this node
+            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
+            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+            assign tree[n] = s;
+        end
+
+        // Stage #3: Buffer output; the accumulator restarts from the current sum while last_i is set
+        logic [ACCU_WIDTH-1:0] P2;
+        always_ff @(posedge clk) begin
+            if(rst)         P2 <= '0;
+            else if (en)    P2 <= (last_i ? 0 : $signed(P2)) + $signed(tree[0]);
+        end
+
+        assign  p[i] = P2;
+    end : genPE
+
+endmodule : mvu_vvu_lut

From cb434386fa8bf6f63964dd889c8025c3e9616a6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 21 Sep 2023 15:58:34 +0100
Subject: [PATCH 070/123] Revert to proper address truncation without
 generation bit.

---
 finn-rtllib/mvu/replay_buffer.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index d4342f705c..3e2766f63d 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -144,8 +144,8 @@ module replay_buffer #(
 		uwire  wr = irdy && ivld;
 		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH:0]];
+			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
 		end
 
 		uwire  vld = (RP != WP);

From b4b69f3fa7caae4be9357abf596aff4a66561228 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:04:05 +0100
Subject: [PATCH 071/123] remove deleted/renamed files

---
 finn-rtllib/mvu/mvu_8sx9.sv            | 427 -------------------------
 finn-rtllib/mvu/mvu_8sx9_axi.sv        | 179 -----------
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv     | 208 ------------
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v |  93 ------
 finn-rtllib/mvu/mvu_8sx9_tb.sv         | 165 ----------
 finn-rtllib/mvu/mvu_axi.sv             | 248 --------------
 finn-rtllib/mvu/mvu_axi_wrapper.v      |  92 ------
 finn-rtllib/mvu/mvu_lut.sv             | 104 ------
 finn-rtllib/mvu/tb/mvu_axi_tb.sv       | 215 -------------
 9 files changed, 1731 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv
 delete mode 100644 finn-rtllib/mvu/mvu_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/mvu_lut.sv
 delete mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
deleted file mode 100644
index 52a93739d6..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ /dev/null
@@ -1,427 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
- *****************************************************************************/
-
-module mvu_vvu_8sx9 #(
-	parameter IS_MVU,
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0,
-
-	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-  )
-  (
-    // Global Control
-	input   logic clk,
-    input   logic rst,
-    input   logic en,
-
-	// Input
-    input   logic last,
-    input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-
-	// Ouput
-	output  logic vld,
-    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
-  );
-	// for verilator always use behavioral code
-	localparam bit  BEHAVIORAL =
-`ifdef VERILATOR
-		1 ||
-`endif
-		FORCE_BEHAVIORAL;
-
-//-------------------- Declare global signals --------------------\\
-	localparam int unsigned CHAINLEN = (SIMD+2)/3;
-	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
-	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
-	uwire [23:0] b_in_i [PE][CHAINLEN];
-	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
-
-//-------------------- Shift register for opmode select signal --------------------\\
-	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
-
-	always_ff @(posedge clk) begin
-		if(rst)     L <= '{default: 0};
-		else if(en) begin
-			L[1+MAX_PIPELINE_STAGES] <= last;
-			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
-		end
-	end
-	assign vld = L[0];
-
-//-------------------- Shift register for ZERO flag --------------------\\
-	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
-
-	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-		always_ff @(posedge clk) begin
-			if (rst)      Z <= '{default: 0};
-			else if(en) begin
-				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
-			end
-		end
-	end;
-
-//-------------------- Buffer for input activations --------------------\\
-	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)     A <= '{default: 0};
-					else if(en) begin
-						A[EXTERNAL_PREGS-1] <= 
-	// synthesis translate_off
-							zero ? '1 : 
-	// synthesis translate_on						
-							a[SIMD*k + 3*i +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genExternalPregAct
-			else begin : genInpDSPAct
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
-	// synthesis translate_off
-						zero ? '1 : 				
-	// synthesis translate_on
-						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
-													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genInpDSPAct
-		end : genActSIMD
-	end : genActPE
-
-//-------------------- Buffer for weights --------------------\\
-	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-
-	for (genvar i=0; i<PE; i++) begin : genWeightPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)    B <= '{default: 0};
-					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-							zero ? '1 : 						
-// synthesis translate_on							
-							w[i][3*j +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genExternalPregWeight
-			else begin : genInpDSPWeight
-				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = 
-// synthesis translate_off					
-						zero ? '1 : 
-// synthesis translate_on					
-						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genInpDSPWeight
-		end : genWeightSIMD
-	end : genWeightPE
-
-//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar i=0; i<PE; i++) begin : genDSPPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
-			localparam bit FIRST = j == 0;
-			localparam bit LAST = j == CHAINLEN-1;
-			uwire [57:0] pp;
-
-			if (LAST) begin : genPOUT
-				assign p[i] = pp[ACCU_WIDTH-1:0];
-			end
-
-			// Note: Since the product B * AD is computed,
-			//       rst can be only applied to AD and zero only to B
-			//       with the same effect as zeroing both.
-			if(BEHAVIORAL) begin : genBehav
-				// Stage #1: Input A/B
-				logic signed [33:0] Areg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Areg <= '{ default : 0};
-					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
-						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
-					end
-				end
-				logic signed [23:0] Breg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Breg <= '{ default : 0};
-					else if (en) begin
-						Breg[0] <= b_in_i[i][j];
-						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
-					end
-				end
-
-				// Stage #2: Multiply-Accumulate
-				logic signed [57:0] Mreg;
-				logic InmodeZero = 0;
-				always_ff @(posedge clk) begin
-					if (rst)		InmodeZero <= 0;
-					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
-				end
-				always_ff @(posedge clk) begin
-					if (rst)	Mreg <= 0;
-					else if (en) begin
-						automatic logic signed [57:0] m = 0;
-						for (int k = 0; k < 3; k++) begin
-							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
-						end
-						Mreg <= m;
-					end
-				end
-
-				// Stage #3: Accumulate
-				logic signed [57:0] Preg;
-				logic Opmode = 0;
-				if (FIRST && !LAST) begin : genFirst
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg;
-						end
-					end
-					else	assign Preg = Mreg;
-				end
-				else if (FIRST && LAST) begin : genSingle
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
-					end
-				end
-				else if (!FIRST && LAST) begin : genLast
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
-					end
-				end
-				else begin : genMid
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[i][j-1];
-						end
-					end
-					else	assign Preg = Mreg + pcout[i][j-1];
-				end
-				assign pp = Preg;
-				assign pcout[i][j] = Preg;
-			end : genBehav
-`ifndef VERILATOR
-			else begin: genDSP
-				DSP58 #(
-					// Feature Control Attributes: Data Path Selection
-					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-														// legacy mode.
-					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-					.RND(58'h000000000000000),          // Rounding Constant
-					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-					// Pattern Detector Attributes: Pattern Detection Configuration
-					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
-										2'b01, // Y : M
-										2'b01  // X: M
-					}), // Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-					// Register Control Attributes: Pipeline Register Configuration
-					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-					.CREG(0),                           // Pipeline stages for C (0-1)
-					.DREG(0),                           // Pipeline stages for D (0-1)
-					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-					.MREG(1),                           // Multiplier pipeline stages (0-1)
-					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-				)
-				DSP58_inst (
-					// Cascade outputs: Cascade Ports
-					.ACOUT(),                           // 34-bit output: A port cascade
-					.BCOUT(),                           // 24-bit output: B cascade
-					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
-					// Control outputs: Control Inputs/Status Bits
-					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-					// Data outputs: Data Ports
-					.CARRYOUT(),                        // 4-bit output: Carry
-					.P(pp),                             // 58-bit output: Primary data
-					.XOROUT(),                          // 8-bit output: XOR data
-					// Cascade inputs: Cascade Ports
-					.ACIN('x),                          // 34-bit input: A cascade data
-					.BCIN('x),                          // 24-bit input: B cascade
-					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
-					// Control inputs: Control Inputs/Status Bits
-					.ALUMODE(4'h0),                     // 4-bit input: ALU control
-					.CARRYINSEL('0),                    // 3-bit input: Carry select
-					.CLK(clk),                          // 1-bit input: Clock
-					.INMODE({
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-							2'b00,
-							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
-					}),                                 // 5-bit input: INMODE control
-					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-					.OPMODE({
-							LAST ? {1'b0, L[1]} : 2'b00,
-							7'b000_0000
-					}), // 9-bit input: Operation mode
-					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
-					.B(b_in_i[i][j]),                   // 24-bit input: B data
-					.C('x),                             // 58-bit input: C data
-					.CARRYIN('0),                       // 1-bit input: Carry-in
-					.D('x),                             // 27-bit input: D data
-					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-					.CEC('0),                           // 1-bit input: Clock enable for CREG
-					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-					.CED('0),                           // 1-bit input: Clock enable for DREG
-					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-					.CEM(en),                           // 1-bit input: Clock enable for MREG
-					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-					.RSTA(rst),                         // 1-bit input: Reset for AREG
-					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-					.RSTB(rst),                         // 1-bit input: Reset for BREG
-					.RSTC('0),                          // 1-bit input: Reset for CREG
-					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-					.RSTM(rst),                         // 1-bit input: Reset for MREG
-					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-				);
-			end : genDSP
-`endif
-		end : genDSPChain
-	end : genDSPPE
-
-endmodule : mvu_vvu_8sx9
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
deleted file mode 100644
index 5f215927d8..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ /dev/null
@@ -1,179 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_8sx9_axi #(
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	parameter RAM_STYLE = "auto",
-
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_LANES = PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-			$finish;
-		end
-		if (SEGMENTLEN == 0) begin
-			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
-		end
-		if (SEGMENTLEN > (SIMD+2)/3) begin
-			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			$finish;
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
-
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
-	uwire alast;
-	uwire afin;
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
-		.clk, .rst,
-		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-//-------------------- Core MVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][57:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
-		.clk, .rst, .en,
-		.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
-		.vld(ovld), .p(odat)
-	);
-
-//-------------------- Output register slice --------------------\\
-	struct {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [57:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-	
-	struct {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ default: 'x };
-		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end	
-	end
-
-	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
-
-endmodule
\ No newline at end of file
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
deleted file mode 100644
index 70ffa096ef..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
+++ /dev/null
@@ -1,208 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_8sx9_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MW = 600;
-	localparam int unsigned MH = 256;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned PE = 16;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants  
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal   
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations  
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin 
-				activations.vld = $urandom()%7 > 1;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights   
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF]; 
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output  
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 1;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin 
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end  
-			end
-			
-			NF_CNT += 1;
-		end
-
-		$finish;  
-	end
-
-	// Instantiate DUT
-	mvu_8sx9_axi #(
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-  
-endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
deleted file mode 100644
index e15f77fbae..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ /dev/null
@@ -1,93 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter 	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter 	PE = $PE$,
-	parameter 	SIMD = $SIMD$,
-	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
-
-	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter 	OUTPUT_LANES = PE,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	s_axis_weights_tvalid,
-	output	s_axis_weights_tready,
-
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	s_axis_input_tvalid,
-	output	s_axis_input_tready,
-
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	m_axis_output_tvalid,
-	input	m_axis_output_tready
-);
-
-mvu_8sx9_axi #(
-	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(s_axis_weights_tdata),
-	.s_axis_weights_tvalid(s_axis_weights_tvalid),
-	.s_axis_weights_tready(s_axis_weights_tready),
-	.s_axis_input_tdata(s_axis_input_tdata),
-	.s_axis_input_tvalid(s_axis_input_tvalid),
-	.s_axis_input_tready(s_axis_input_tready),
-	.m_axis_output_tdata(m_axis_output_tdata),
-	.m_axis_output_tvalid(m_axis_output_tvalid),
-	.m_axis_output_tready(m_axis_output_tready)
-);
-
-endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
deleted file mode 100644
index adf6a8f9c2..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_tb.sv
+++ /dev/null
@@ -1,165 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU core compute kernel.
- *****************************************************************************/
-
-module mvu_8sx9_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MH = 256;
-	localparam int unsigned PE = 16;
-	localparam int unsigned MW = 600;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	typedef logic signed [PE-1:0][57:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic rst;
-	initial begin
-		rst = 1;
-		repeat(16) @(posedge clk);
-		rst <= 0;
-	end
-
-	logic last;
-	logic zero;
-	logic vld;
-	activation_t a;
-	weight_t w;
-	output_t p;
-	// Reference signals
-	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-	// Counter for number of outputs (NF dimension) that are produced
-	int NF_CNT = 0;
-
-	initial begin
-		ACTIVATIONS = init_ACTIVATIONS();
-		WEIGHTS = init_WEIGHTS();
-		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-		last = 0;
-		zero = 0;
-		a = 'x;
-		w = 'x;
-
-		@(posedge clk iff !rst);
-
-		for (int j=0; j<NF; j++) begin
-			for (int i=0; i<SF; i++) begin
-				last <= (i==SF-1) ? 1 : 0;
-				a <= ACTIVATIONS[i];
-				w <= WEIGHTS[j][i];
-				@(posedge clk iff en);
-			end
-		end
-
-		last <= 0;
-		zero <= 1;  
-
-		// Continue until all NF outputs are produced & compared
-		@(posedge clk && (NF_CNT==NF));
-
-		$finish;
-	end
-
-	logic en = 0;
-	always_ff @(posedge clk) begin
-		en <= ($urandom()%7 > 1) && !rst;
-	end
-
-	// Compare computed output against golden output when vld flag is raised by DUT
-	always_ff @(posedge clk iff (vld && en)) begin
-		foreach(p[i]) begin
-			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-			else begin 
-				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				$stop;
-			end  
-		end
-		NF_CNT += 1;
-	end
-
-	// Instantiate DUT
-	mvu_8sx9 #(
-		.PE(PE),
-		.SIMD(SIMD),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-	);
-
-endmodule
diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
deleted file mode 100644
index 07ad32e6c8..0000000000
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ /dev/null
@@ -1,248 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper.
- * @details
- *	 The following compute cores are supported:
- *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
- *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
- *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
- *     'unconstrained' LUT-based MVU and VVU.
- *  Folding hints:
- *	 - PE scaling should divide MH.
- *   - SIMD scaling should divide MW.
- *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
- *	   impact critical paths more than PE scaling. PE scaling implies a
- *	   bigger fanout on the input activations.
- *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
- *****************************************************************************/
-
-module mvu_vvu_axi #(
-	bit IS_MVU, // string type causes error in Vivado
-	parameter COMPUTE_CORE,
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	bit FORCE_BEHAVIORAL = 0,
-	bit M_REG_LUT = 1,
-
-	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 8) begin
-			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
-				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
-				$finish;
-			end
-		end
-		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
-			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			end
-			if (SEGMENTLEN > (SIMD+2)/3) begin
-				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-				$finish;
-			end
-		end
-		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
-				$error("VVU only supported on DSP58 or LUT-based implementation");
-				$finish;
-			end
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
-
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
-	uwire alast;
-	uwire afin;
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
-	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	uwire mvauin_t amvau_i;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
-
-	case(COMPUTE_CORE)
-	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_8sx8u_dsp48":
-		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	default: initial begin
-		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
-		$finish;
-	end
-	endcase
-
-//-------------------- Output register slice --------------------\\
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
-		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end
-	end
-
-	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
-
-endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
deleted file mode 100644
index 239c5bbacd..0000000000
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ /dev/null
@@ -1,92 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter 	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter 	PE = $PE$,
-	parameter 	SIMD = $SIMD$,
-	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter	MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$",
-	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
-
-	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter 	OUTPUT_LANES = PE,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
-	input   weights_V_TVALID,
-	output  weights_V_TREADY,
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
-	input	in0_V_TVALID,
-	output	in0_V_TREADY,
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
-	output	out_V_TVALID,
-	input	out_V_TREADY
-);
-
-mvu_axi #(
-	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(weights_V_TDATA),
-	.s_axis_weights_tvalid(weights_V_TVALID),
-	.s_axis_weights_tready(weights_V_TREADY),
-	.s_axis_input_tdata(in0_V_TDATA),
-	.s_axis_input_tvalid(in0_V_TVALID),
-	.s_axis_input_tready(in0_V_TREADY),
-	.m_axis_output_tdata(out_V_TDATA),
-	.m_axis_output_tvalid(out_V_TVALID),
-	.m_axis_output_tready(out_V_TREADY)
-);
-
-endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
deleted file mode 100644
index c100910d75..0000000000
--- a/finn-rtllib/mvu/mvu_lut.sv
+++ /dev/null
@@ -1,104 +0,0 @@
-module mvu_vvu_lut #(
-    bit IS_MVU,
-    int unsigned  PE,
-    int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-    int unsigned  ACTIVATION_WIDTH,
-    int unsigned  WEIGHT_WIDTH,
-    bit  SIGNED_ACTIVATIONS,
-    bit  M_REG = 1,
-
-    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
-    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
-	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-
-    typedef int unsigned  leave_load_t[2*SIMD-1];
-    function leave_load_t init_leave_loads();
-        automatic leave_load_t  res;
-        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-        return res;
-    endfunction : init_leave_loads
-
-    // Pipeline for last indicator flag
-    uwire last_i;
-    generate if (M_REG) begin
-        logic [0:1] L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= {last, L[0]};
-        end
-        assign  last_i = L[1];
-    end
-    else begin 
-        logic L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= last;
-        end
-        assign  last_i = L;
-    end
-    endgenerate
-
-    // For each PE generate
-    for (genvar  i = 0; i < PE; i++)  begin : genPE
-        // Stage #1: SIMD multipliers in parallel
-        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
-        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
-            if (M_REG) begin : genMreg
-                logic [MULT_WIDTH-1 : 0] M [SIMD];
-                always_ff @(posedge clk) begin
-                    if(rst)         M[j] = '{ default : 0 };
-                    else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
-                end
-                assign  m1[j] = M[j];
-            end : genMreg
-            else begin : genNoMreg 
-                assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
-            end : genNoMreg
-        end : genSIMD
-
-        // Stage #2: Adder tree to reduce SIMD products
-        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
-        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
-        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
-        for(genvar n = 0; n < SIMD-1; n++) begin
-            // Sum truncated to actual maximum bit width at this node
-            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
-            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-            assign tree[n] = s;
-        end
-
-        // Stage #3: Buffer output
-        logic [ACCU_WIDTH-1:0] P2 [PE];
-        always_ff @(posedge clk) begin
-            if(rst)         P2[i] = '{ default : 0};
-            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
-        end
-
-        assign  vld = last_i;
-        assign  p[i] = P2[i];
-    end : genPE
-
-endmodule : mvu_vvu_lut
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
deleted file mode 100644
index b89b58f55b..0000000000
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ /dev/null
@@ -1,215 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MW = 50;
-	localparam int unsigned MH = 8;
-	localparam int unsigned SIMD = 10;
-	localparam int unsigned PE = 2;
-	localparam int unsigned SEGMENTLEN = 2;
-	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
-	localparam bit FORCE_BEHAVIORAL = 1;
-	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 8;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 0;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin
-				activations.vld <= $urandom()%7 >= 1;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS)
-					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-				else
-					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 1;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end
-			end
-
-			NF_CNT += 1;
-		end
-
-		$finish;
-	end
-
-	// Instantiate DUT
-	mvu_axi #(
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
-		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-
-endmodule : mvu_axi_tb

From 14c5fa902820396e3489a244dc4d705fd1ebe532 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:12:47 +0100
Subject: [PATCH 072/123] [mvu vvu 8sx9]: renamed for consistency

---
 finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} (99%)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
similarity index 99%
rename from finn-rtllib/mvu/mvu_vvu_8sx9.sv
rename to finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 2aa9d71b6c..6ae117e3ab 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -31,7 +31,7 @@
  * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
  *****************************************************************************/
 
-module mvu_vvu_8sx9 #(
+module mvu_vvu_8sx9_dsp58 #(
 	bit IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
@@ -424,4 +424,4 @@ module mvu_vvu_8sx9 #(
 		end : genDSPChain
 	end : genDSPPE
 
-endmodule : mvu_vvu_8sx9
+endmodule : mvu_vvu_8sx9_dsp58

From 3a3758826512fd3d5ed0bcdd23358d5fd5b724cd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:13:25 +0100
Subject: [PATCH 073/123] [mvu vvu axi]: changes for renamed module

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ff677fc244..416480da79 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -174,7 +174,7 @@ module mvu_vvu_axi #(
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,

From afe36baa134b947718db34d140c8d6500b91cb2a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:44:17 +0100
Subject: [PATCH 074/123] [mvu vvu wrapper]: convert localparam to param

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 270fe7351f..9c65dbc06e 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
-	localparam	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	parameter	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)

From e4f2f9e0e4f1cb0bae2bf7e439c57356b3670620 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:45:48 +0100
Subject: [PATCH 075/123] [mvau-rtl custom-op]: bugfix to instantiate
 memstreamer, modified renamed files and axi wrapper template fill-out

---
 .../matrixvectoractivation_rtl.py             | 92 ++++++++++---------
 1 file changed, 51 insertions(+), 41 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 9f8130806b..c7fb855884 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -61,8 +61,7 @@
 
 
 class MatrixVectorActivation_rtl(HLSCustomOp):
-    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
-    function."""
+    """Class that corresponds to finn-rtl Matrix Vector Unit."""
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
@@ -73,8 +72,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
-            "ActVal": ("i", False, 0),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -165,7 +163,6 @@ def verify_node(self):
         # verify that all necessary attributes exist
         # TODO collect automatically from get_nodeattr_types
         try:
-            self.get_nodeattr("code_gen_dir_cppsim")
             self.get_nodeattr("executable_path")
             self.get_nodeattr("resType")
             self.get_nodeattr("MW")
@@ -199,7 +196,6 @@ def verify_node(self):
 
         return info_messages
 
-    # TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -213,7 +209,6 @@ def uram_estimation(self):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle != "ultra")
-            or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
@@ -221,7 +216,6 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
-    # TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -243,7 +237,6 @@ def bram_estimation(self):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
-            or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
@@ -262,7 +255,6 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
-    # TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -275,7 +267,6 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-    # TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -290,7 +281,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
+# TODO: fix lut estimations 
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -333,9 +324,13 @@ def lut_estimation(self):
 
         return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
+# TODO: fix DSP estimations --> depends on fpga_part
     def dsp_estimation(self):
         # multiplication
+        # mvu_8sx9 (DSP58): ceil(SIMD/3)
+        # mvu_4sx4u (DSP48/DSP58): ceil(PE/4)
+        # mvu_8sx8u (DSP48): ceil(PE/2)
+        # mvu_lut: 0
         P = self.get_nodeattr("PE")
         res_type = self.get_nodeattr("resType")
         Q = self.get_nodeattr("SIMD")
@@ -349,18 +344,24 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point
+# TODO: fix exp_cycles estimations --> depends on fpga_part and clk
     def get_exp_cycles(self):
+        # mvu_8sx9 (DSP58):
+        # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice)
+        # + MW/SIMD * MH/PE
+        # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): 
+        # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane)
+        # + MW/SIMD * MH/PE
+        # mvu_lut:
+        # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) 
+        # + MW/SIMD * MH/PE
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         num_inp_vec = self.get_nodeattr("numInputVectors")
         mh = self.get_nodeattr("MH")
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1
-        # Actual exp_cycles is probably slightly larger (say 3 cycles
-        # (DSP A/B, M, P - reg) + additional pipeline buffer cycles.
-        # Most probably <10)
+        mmv = 1     
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -711,7 +712,7 @@ def execute_node(self, context, graph):
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+            has to be set to "rtlsim" """.format(
                     mode
                 )
             )
@@ -795,11 +796,12 @@ def code_generation_ipi(self):
                 os.path.join(
                     code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
                 ),
-                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -813,7 +815,7 @@ def code_generation_ipi(self):
             )
 
             # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "amd.com:FINN:memstream:1.0"
+            strm_vlnv = "amd.com:finn:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"
@@ -890,11 +892,12 @@ def code_generation_ipi(self):
                 os.path.join(
                     code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
                 ),
-                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -959,27 +962,32 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-    # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
-        # Insert pipeline registers in the DSP chain to meet target clock frequency
-        return 4 # default to 4 for now
+        # Insert pipeline registers in the DSP58 chain to meet target clock frequency
+        # 0.741 ns seems the worst-case delay through first DSP
+        # 0.605 ns seems to be (on average) delay for all subsequent DSPs
+        dsp_chain_len = np.floor((clk - 0.741) / 0.605)
+        return max(1, dsp_chain_len)
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
-        # supported RTL module
-        act_width = self.get_input_datatype(0).bitwidth()
-        weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = (
-            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-            or fpgapart[0:5] == "xqrvc"
-        )
-        if act_width == 4 and weight_width == 4:
-            return "mvu_4sx4u"
+        # supported RTL compute core
+        if self.get_nodeattr("resType") == "lut":
+            return "mvu_vvu_lut"
         else:
-            if is_versal:
-                return "mvu_8sx9_dsp58"
+            act_width = self.get_input_datatype(0).bitwidth()
+            weight_width = self.get_input_datatype(1).bitwidth()
+            is_versal = (
+                fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+                or fpgapart[0:5] == "xqrvc"
+            )
+            if act_width == 4 and weight_width == 4:
+                return "mvu_4sx4u"
             else:
-                return "mvu_8sx8u_dsp48"
+                if is_versal:
+                    return "mvu_vvu_8sx9_dsp58"
+                else:
+                    return "mvu_8sx8u_dsp48"
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
@@ -1023,9 +1031,11 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("ip_path", code_gen_dir)
 
     def prepare_codegen_default(self, fpgapart, clk):
-        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v"
 
         code_gen_dict = {}
+        code_gen_dict["$IS_MVU$"] = [str(1)]
+        code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
@@ -1039,7 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
+        code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From b49b79a0a669caad9355e59e1ee877ca59b65d27 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:47:50 +0100
Subject: [PATCH 076/123] [specialize to rtl]: fix to changed attribute name
 and added support for converting HLS-based VVU custom-ops to RTL-based
 custom-ops

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 82 ++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 47ed5ce863..5061282695 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
 from qonnx.transformation.base import Transformation
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.core.datatype import DataType
@@ -60,7 +61,7 @@ def apply(self, model):
         for n in graph.node:
             node_ind += 1
             if n.op_type == "MatrixVectorActivation":
-                preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp"
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
                 supported_in_rtl = self._is_rtl_variant_compatible(n)
                 if (preferred_in_rtl and supported_in_rtl):
                     mvau_input = n.input[0]
@@ -76,6 +77,7 @@ def apply(self, model):
                     pe = getCustomOp(n).get_nodeattr("PE")
                     mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
                     ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    resType = getCustomOp(n).get_nodeattr("resType")
                     runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
 
                     new_node = helper.make_node(
@@ -93,6 +95,7 @@ def apply(self, model):
                         outputDataType=outputDataType,
                         numInputVectors=numInputVectors,
                         mem_mode=mem_mode,
+                        resType=resType,
                         name=n.name + "_rtl",
                         ram_style=ram_style,
                         runtime_writeable_weights=runtime_writeable_weights
@@ -108,4 +111,81 @@ def apply(self, model):
             model = model.transform(InferDataTypes())
             model = model.transform(GiveUniqueNodeNames())
         
+        return (model, graph_modified)
+
+class InferRTLVectorVectorActivation(Transformation):
+    """Convert (HLS-based) VectorVectorActivation layers to specialized RTL
+    layers, if the RTL variant is preferred and supported for the node."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        """Return True if node n passes all checks for the RTL variant."""
+        inst = getCustomOp(n)
+        no_activation = inst.get_nodeattr("noActivation") == 1
+        idt = DataType[inst.get_nodeattr("inputDataType")]
+        wdt = DataType[inst.get_nodeattr("weightDataType")]
+        # inputs: at most 8 bits, or 9 bits when the type can be negative
+        act_width_in_range = idt.bitwidth() <= 8 or (idt.bitwidth() == 9 and idt.min() < 0)
+        weight_width_in_range = wdt.bitwidth() <= 8
+        # Channels must be a multiple of PE, prod(Kernel) a multiple of SIMD
+        folding_supported = (
+            inst.get_nodeattr("Channels") % inst.get_nodeattr("PE") == 0
+            and np.prod(inst.get_nodeattr("Kernel")) % inst.get_nodeattr("SIMD") == 0
+        )
+        return bool(
+            no_activation
+            and act_width_in_range
+            and weight_width_in_range
+            and folding_supported
+        )
+
+    def apply(self, model):
+        """Replace every eligible VectorVectorActivation node by its RTL
+        counterpart; return the model and whether the graph was modified."""
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type != "VectorVectorActivation":
+                continue
+            inst = getCustomOp(n)
+            preferred_in_rtl = inst.get_nodeattr("preferred_backend") == "rtl"
+            supported_in_rtl = self._is_rtl_variant_compatible(n)
+            if not (preferred_in_rtl and supported_in_rtl):
+                continue
+            new_node = helper.make_node(
+                "VectorVectorActivation_rtl",
+                [n.input[0], n.input[1]],
+                [n.output[0]],
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+                name=n.name + "_rtl",
+                PE=inst.get_nodeattr("PE"),
+                SIMD=inst.get_nodeattr("SIMD"),
+                Dim=inst.get_nodeattr("Dim"),
+                Channels=inst.get_nodeattr("Channels"),
+                Kernel=inst.get_nodeattr("Kernel"),
+                resType=inst.get_nodeattr("resType"),
+                inputDataType=inst.get_nodeattr("inputDataType"),
+                weightDataType=inst.get_nodeattr("weightDataType"),
+                outputDataType=inst.get_nodeattr("outputDataType"),
+                mem_mode=inst.get_nodeattr("mem_mode"),
+                runtime_writeable_weights=inst.get_nodeattr("runtime_writeable_weights"),
+                ram_style=inst.get_nodeattr("ram_style"),
+            )
+            # insert the RTL node at position node_ind ...
+            graph.node.insert(node_ind, new_node)
+            # ... and remove the old node
+            graph.node.remove(n)
+            graph_modified = True
+
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
+
         return (model, graph_modified)
\ No newline at end of file

From 9bdba031df228a2afbe99b8ea2fb576b678bba86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 19 Sep 2023 15:27:28 +0100
Subject: [PATCH 077/123] Adding core for DSP48 backport.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 416c12c1cc..07c44cf89a 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -4,7 +4,9 @@ module mvu_8sx8u_dsp48 #(
 	int unsigned  ACCU_WIDTH,
 	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  WEIGHT_WIDTH,
-	bit FORCE_BEHAVIORAL = 0,
+
+	bit  SIGNED_ACTIVATIONS = 0,
+	bit  FORCE_BEHAVIORAL = 0,
 
 	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
 )(
@@ -16,8 +18,8 @@ module mvu_8sx8u_dsp48 #(
 	// Input
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations (override by SIGNED_ACTIVATIONS)
 
 	// Ouput
 	output	logic  vld,
@@ -47,7 +49,7 @@ module mvu_8sx8u_dsp48 #(
 	assign	vld = L[5];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
-    localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
+	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
 
 	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
 	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
@@ -61,7 +63,7 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = a[s];
+			uwire [23:0]  bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
 			logic [33:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx;

From 2cf1ef70306339b1409ed61d8e18eda243bf56ad Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 14:48:34 +0100
Subject: [PATCH 078/123] [mvu rtl core]: added support for signed activations
 for DSP48-based MVUs

---
 finn-rtllib/mvu/mvu_4sx4u.sv   | 3 ++-
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 88985312c9..706347d700 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -2,6 +2,7 @@ module mvu_4sx4u #(
 	int unsigned  PE,
 	int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
 	bit FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
@@ -57,7 +58,7 @@ module mvu_4sx4u #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = a[s];
+			uwire [23:0]  bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
 			logic [33:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx[3:1];
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 416480da79..da7e00cc55 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -182,14 +182,14 @@ module mvu_vvu_axi #(
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)

From ab8d4a8e075ac9b3ccf78d2a08907d5dcc116fdb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 16:17:38 +0100
Subject: [PATCH 079/123] [rtl mvu custom-op]: add upper bound to SEGMENTLEN
 equal to number of DSP58s chained together

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index c7fb855884..d0a638475a 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -966,7 +966,9 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        dsp_chain_len = np.floor((clk - 0.741) / 0.605)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
+        dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
         return max(1, dsp_chain_len)
 
     def _resolve_impl_style(self, fpgapart):

From 5a429fcbe14ca6177082fab472549407f47f97d6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:29:39 +0100
Subject: [PATCH 080/123] [mvu_vvu dsp58]: change weight input to 2D instead of
 3D array

---
 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 6ae117e3ab..53cf71fd5f 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -42,7 +42,8 @@ module mvu_vvu_8sx9_dsp58 #(
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
 	bit FORCE_BEHAVIORAL = 0,
 
-	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
+	localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
   )
   (
     // Global Control
@@ -53,7 +54,7 @@ module mvu_vvu_8sx9_dsp58 #(
 	// Input
     input   logic last,
     input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+    input   logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights
 	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
 
 	// Ouput
@@ -164,7 +165,8 @@ module mvu_vvu_8sx9_dsp58 #(
 // synthesis translate_off
 							zero ? '1 : 						
 // synthesis translate_on							
-							w[i][3*j +: LANES_OCCUPIED];
+							//w[i][3*j +: LANES_OCCUPIED];
+							w[SIMD*i+3*j +: LANES_OCCUPIED];
 						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
@@ -181,7 +183,8 @@ module mvu_vvu_8sx9_dsp58 #(
 // synthesis translate_off					
 						zero ? '1 : 
 // synthesis translate_on					
-						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+						//PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+						PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] };
 				end : genBin
 				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
 					assign b_in_i[i][j][8*k +: 8] = 8'b0;

From a4a18bb08cef96bb52c02096d54b573b421bcd12 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:30:55 +0100
Subject: [PATCH 081/123] [mvu_vvu axi]: re-wire weights appropriately for VVU
 DSP58

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index da7e00cc55..f0f75c633a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -45,7 +45,7 @@
  *****************************************************************************/
 
 module mvu_vvu_axi #(
-	bit IS_MVU, // string type causes error in Vivado
+	bit IS_MVU,
 	parameter COMPUTE_CORE,
 	int unsigned MW,
 	int unsigned MH,
@@ -64,8 +64,8 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
+	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if (MW % SIMD != 0) begin
+		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0) begin
+		if (MH % PE != 0 && IS_MVU) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -137,7 +137,7 @@ module mvu_vvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
 	.clk, .rst,
 	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
@@ -154,9 +154,11 @@ module mvu_vvu_axi #(
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
+	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
+		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -164,11 +166,14 @@ module mvu_vvu_axi #(
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
 		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
 		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
 									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
+									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
+									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -178,7 +183,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From cc0737bcd00cdd6df6e3d4ff38215ac5d9eb42e6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:31:35 +0100
Subject: [PATCH 082/123] [mvu_vvu axi wrapper]: fix to IS_MVU parameter

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 9c65dbc06e..01deb23840 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -32,7 +32,7 @@
  *****************************************************************************/
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter	IS_MVU = "$IS_MVU$",
+	parameter	IS_MVU = $IS_MVU$,
 	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
 	parameter	MW = $MW$,
 	parameter	MH = $MH$,

From c0eff0b819828a5e1d1ef80815f63be0042ce742 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:32:47 +0100
Subject: [PATCH 083/123] [mvu_vvu tb]: WIP -- changes to self-checker and
 shape of input data

---
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 79 +++++++++++++++++-----------
 1 file changed, 49 insertions(+), 30 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
index 82c2e8e7b0..b46fc588c9 100644
--- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
@@ -35,23 +35,23 @@ module mvu_vvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam bit IS_MVU = 1;
+	localparam bit IS_MVU = 0;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
-	localparam int unsigned MW = 1500;
-	localparam int unsigned MH = 256;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned PE = 16;
-	localparam int unsigned SEGMENTLEN = 2.0;
+	localparam int unsigned MW = 36;
+	localparam int unsigned MH = 1;
+	localparam int unsigned SIMD = 3;
+	localparam int unsigned PE = 4;
+	localparam int unsigned SEGMENTLEN = 1.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 4;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW)
-	localparam bit SIGNED_ACTIVATIONS = 0;
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 6;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
 	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1;
+	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE);
 	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
 	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;
 	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
@@ -72,7 +72,7 @@ module mvu_vvu_axi_tb();
 
 	// Generate activations
 	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF];
+	typedef activation_t activation_vector_t[SF];
 
 	function activation_vector_t init_ACTIVATIONS;
 		automatic activation_vector_t res;
@@ -93,14 +93,12 @@ module mvu_vvu_axi_tb();
 		activations.dat = 'X;
 		@(posedge clk iff ap_rst_n);
 
-		for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin
-			for (int i=0; i<SF; i++) begin
-				activations.dat <= ACTIVATIONS[SF*j+i];
-				do begin
-					activations.vld <= $urandom()%7 >= 0;
-					@(posedge clk);
-				end while (!(activations.vld === 1 && activations.rdy === 1));
-			end
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 >= 0;
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
 
 		activations.vld <= 0;
@@ -143,7 +141,9 @@ module mvu_vvu_axi_tb();
 	end
 
 	// Function to compute golden output
-	// a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0]
 	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
 	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
 	typedef output_t output_vector_t [NF];
@@ -156,14 +156,33 @@ module mvu_vvu_axi_tb();
 
 	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
 		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS)
-					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-				else
-					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+		// for (int j = 0; j<MH; j++) begin
+		// 	for (int i = 0; i<MW; i++) begin
+		// 		if (SIGNED_ACTIVATIONS)
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]);
+		// 		else
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]);
+		// 	end
+		// end
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)
+							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) :
+												 $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]);
+						else
+							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) :
+												 $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]);
+					end
+				end
 			end
 		end
 		return res;

From cf7f4946dc44f264de665e8a23893bd858277796 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:20:07 +0000
Subject: [PATCH 084/123] [mvu vvu axi]: minor bugfixes to enable VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index f0f75c633a..ddedec1e8a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -64,7 +64,7 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
+		if (MW % SIMD != 0) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0 && IS_MVU) begin
+		if (MH % PE != 0) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -152,13 +152,10 @@ module mvu_vvu_axi #(
 //-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
-	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
-		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -169,11 +166,8 @@ module mvu_vvu_axi #(
 		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
-									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
-									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -183,7 +177,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From 5ffc221eaa07828001e423551ad05f8207178656 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:35:45 +0000
Subject: [PATCH 085/123] [mvu vvu axi]: minor fix -- define mvauin_weight_t

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ddedec1e8a..8eb92a93e6 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -153,6 +153,7 @@ module mvu_vvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	uwire mvauin_t amvau_i;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;

From 40d652ccb817295e5668ed765f8e348346584465 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 29 Nov 2023 14:02:33 +0000
Subject: [PATCH 086/123] [rtl mvu op]: minor fix to chain length estimation
 and enabled behavioral mode for rtl sim

---
 .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index d0a638475a..da560d73fd 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -966,10 +966,12 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
+        assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
         max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
         dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
-        return max(1, dsp_chain_len)
+        return dsp_chain_len
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1051,7 +1053,6 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From 6e98bac42f225e7ed8629e0cb67211e78db61d15 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 13 Dec 2023 09:36:25 +0000
Subject: [PATCH 087/123] [rtlsim]: use pyverilator util functions

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 4fed8ed4b5..01b94c20ca 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -31,7 +31,7 @@
 import subprocess
 import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self):
     def reset_rtlsim(self, sim):
         """Sets reset input in pyverilator to zero, toggles the clock and set it
         back to one"""
-        sim.io.ap_rst_n = 0
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
-        sim.io.ap_rst_n = 1
+        reset_rtlsim(sim)  # the function imported from pyverilator.util.axi_utils, not this method
 
     def toggle_clk(self, sim):
         """Toggles the clock input in pyverilator once."""
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
+        toggle_clk(sim)  # the function imported from pyverilator.util.axi_utils, not this method
 
     def hls_sname(self):
         """Get the naming convention used by Vitis HLS for stream signals

From 5dd74ad1dede3bf2a0405de8c803a4adfb2e65d3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 8 Dec 2023 17:12:42 +0000
Subject: [PATCH 088/123] [mvu vvu axi]: sign extend output tdata
 (byte-aligned)

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 8eb92a93e6..699662bd72 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -243,6 +243,6 @@ module mvu_vvu_axi #(
 	end
 
 	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
+	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
 endmodule : mvu_vvu_axi

From b20410bfd968c27395537b60bba11849b599a33a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:55:56 +0000
Subject: [PATCH 089/123] [mvu core]: dsp48 convert unpacked array to packed
 array to work around limitation on max array indices in Verilator

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 4 ++--
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 706347d700..7a2af35742 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -309,7 +309,7 @@ module mvu_4sx4u #(
 			// Conclusive high part accumulation
 			if(i >= PE_REM && i < 3) begin : genHi
 				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
@@ -333,7 +333,7 @@ module mvu_4sx4u #(
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 07c44cf89a..1e6855f779 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -335,7 +335,7 @@ module mvu_8sx8u_dsp48 #(
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node

From 1c2cc0c2c1d98d7cde569f65eb20873a10e1f12f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:57:19 +0000
Subject: [PATCH 090/123] [mvu axi]: update list of deduced parameters

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 699662bd72..dd357c94bb 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -60,13 +60,14 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  SF = MW / SIMD,
+	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
 )
 (
 	// Global Control

From eeb3cea623865a13d8da78acb5a9c7fc621caf0e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:58:02 +0000
Subject: [PATCH 091/123] [mvu custom-op]: remove lut-based implementation and
 update compute core selection

---
 .../matrixvectoractivation_rtl.py             | 39 ++++++++++---------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index da560d73fd..fcab06658c 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -191,7 +191,12 @@ def verify_node(self):
 
         if mem_mode not in ["decoupled", "external"]:
             info_messages.append(
-                "RTL-based MVAU supports only decoupled or external weights."
+                "RTL-based MVU only supports decoupled or external weights."
+            )
+
+        if self.get_nodeattr("resType") == "lut":
+            info_messages.append(
+                "RTL-based MVU only supports DSP-based implementation"
             )
 
         return info_messages
@@ -635,7 +640,6 @@ def execute_node(self, context, graph):
         mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
-        # TODO ensure codegen dir exists
         if mode == "cppsim":
             raise Exception(
                 "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
@@ -801,7 +805,6 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
-                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -897,7 +900,6 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
-                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -964,8 +966,8 @@ def derive_characteristic_fxns(self, period):
 
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
-        # 0.741 ns seems the worst-case delay through first DSP
-        # 0.605 ns seems to be (on average) delay for all subsequent DSPs
+        # ~0.741 ns seems the worst-case delay through first DSP
+        # ~0.605 ns seems to be (on average) delay for all subsequent DSPs
         # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
         assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
         critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
@@ -976,22 +978,23 @@ def _resolve_segment_len(self, clk):
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
         # supported RTL compute core
-        if self.get_nodeattr("resType") == "lut":
-            return "mvu_vvu_lut"
+        
+        assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name)
+
+        act_width = self.get_input_datatype(0).bitwidth()
+        weight_width = self.get_input_datatype(1).bitwidth()
+        is_versal = (
+            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+            or fpgapart[0:5] == "xqrvc"
+        )
+        
+        if is_versal:
+            return "mvu_vvu_8sx9_dsp58"
         else:
-            act_width = self.get_input_datatype(0).bitwidth()
-            weight_width = self.get_input_datatype(1).bitwidth()
-            is_versal = (
-                fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-                or fpgapart[0:5] == "xqrvc"
-            )
             if act_width == 4 and weight_width == 4:
                 return "mvu_4sx4u"
             else:
-                if is_versal:
-                    return "mvu_vvu_8sx9_dsp58"
-                else:
-                    return "mvu_8sx8u_dsp48"
+                return "mvu_8sx8u_dsp48"
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation

From 0813d1463a219384b4666fad2db93a4f7dee1a0f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:59:30 +0000
Subject: [PATCH 092/123] [mvu axi]: remove LUT-based compute core

---
 finn-rtllib/mvu/mvu_vvu_axi.sv |  11 +---
 finn-rtllib/mvu/mvu_vvu_lut.sv | 104 ---------------------------------
 2 files changed, 2 insertions(+), 113 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index dd357c94bb..a3b051c9a1 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -120,8 +120,8 @@ module mvu_vvu_axi #(
 			end
 		end
 		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
-				$error("VVU only supported on DSP58 or LUT-based implementation");
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin
+				$error("VVU only supported on DSP58");
 				$finish;
 			end
 		end
@@ -195,13 +195,6 @@ module mvu_vvu_axi #(
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
 	default: initial begin
 		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 		$finish;
diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv
deleted file mode 100644
index c100910d75..0000000000
--- a/finn-rtllib/mvu/mvu_vvu_lut.sv
+++ /dev/null
@@ -1,104 +0,0 @@
-module mvu_vvu_lut #(
-    bit IS_MVU,
-    int unsigned  PE,
-    int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-    int unsigned  ACTIVATION_WIDTH,
-    int unsigned  WEIGHT_WIDTH,
-    bit  SIGNED_ACTIVATIONS,
-    bit  M_REG = 1,
-
-    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
-    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
-	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-
-    typedef int unsigned  leave_load_t[2*SIMD-1];
-    function leave_load_t init_leave_loads();
-        automatic leave_load_t  res;
-        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-        return res;
-    endfunction : init_leave_loads
-
-    // Pipeline for last indicator flag
-    uwire last_i;
-    generate if (M_REG) begin
-        logic [0:1] L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= {last, L[0]};
-        end
-        assign  last_i = L[1];
-    end
-    else begin 
-        logic L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= last;
-        end
-        assign  last_i = L;
-    end
-    endgenerate
-
-    // For each PE generate
-    for (genvar  i = 0; i < PE; i++)  begin : genPE
-        // Stage #1: SIMD multipliers in parallel
-        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
-        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
-            if (M_REG) begin : genMreg
-                logic [MULT_WIDTH-1 : 0] M [SIMD];
-                always_ff @(posedge clk) begin
-                    if(rst)         M[j] = '{ default : 0 };
-                    else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
-                end
-                assign  m1[j] = M[j];
-            end : genMreg
-            else begin : genNoMreg 
-                assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
-            end : genNoMreg
-        end : genSIMD
-
-        // Stage #2: Adder tree to reduce SIMD products
-        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
-        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
-        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
-        for(genvar n = 0; n < SIMD-1; n++) begin
-            // Sum truncated to actual maximum bit width at this node
-            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
-            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-            assign tree[n] = s;
-        end
-
-        // Stage #3: Buffer output
-        logic [ACCU_WIDTH-1:0] P2 [PE];
-        always_ff @(posedge clk) begin
-            if(rst)         P2[i] = '{ default : 0};
-            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
-        end
-
-        assign  vld = last_i;
-        assign  p[i] = P2[i];
-    end : genPE
-
-endmodule : mvu_vvu_lut

From 4892d6614b734a08315062b86ec6d5e1f1af0dc1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 12:02:38 +0000
Subject: [PATCH 093/123] [hls custom-op]: enable reset in sim

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 01b94c20ca..bc59c69192 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -600,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict):
             trace_file=trace_file,
             sname=sname,
             liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+            do_reset=True,
         )
         self.set_nodeattr("cycles_rtlsim", total_cycle_count)
 

From 44f6e0f3e70eea06408b94a31e555f0f6b9ea358 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 12:21:00 +0000
Subject: [PATCH 094/123] [test mvu rtl]: updated test flow (DSP58 only)

---
 .../test_fpgadataflow_mvau_rtl.py             | 167 +++++++++---------
 1 file changed, 87 insertions(+), 80 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 3db7a718f5..1e9de44fb2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -27,141 +27,148 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
 
 import numpy as np
+import os
+import pickle
 from onnx import TensorProto, helper
-from qonnx.util.basic import (
-    qonnx_make_model,
-    gen_finn_dt_tensor
-)
-from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.core.datatype import DataType
-from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+
+
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from qonnx.transformation.general import ApplyConfig
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
-#import qonnx.core.data_layout as DataLayout
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
-def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
-    (ofm_h, ofm_w) = ofm_shape
-    ofm = helper.make_tensor_value_info(
-        "ofm",
-        TensorProto.FLOAT,
-        (1, ofm_h, ofm_w, mh)
-    )
-
-    matmul_node = helper.make_node(
-        "MatMul",
-        ["ifm", "weights"],
-        ["ofm"]
-    )
-    graph = helper.make_graph(
-        nodes=[matmul_node],
-        name="matmul_graph",
-        inputs=[ifm],
-        outputs=[ofm]
-    )
+
+def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W):
+    matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"])
+    graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm])
 
     model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("ifm", idt)
     model.set_tensor_datatype("weights", wdt)
-    model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype
+    model.set_tensor_datatype(
+        "ofm", DataType["INT32"]
+    )  # At this step, the MatMul layer does not optimize the bit-width of the output datatype
     model.set_initializer("weights", W)
-
     # model.set_tensor_layout("ifm", DataLayout.NHWC)
 
     return model
 
+
 def prepare_inputs(input_tensor):
-    return {"inp": input_tensor}
+    return {"global_in": input_tensor}
+
 
-@pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [32])
-@pytest.mark.parametrize("pe", [1, 4, 16])
-#@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [1, 4, 32])
+# @pytest.mark.parametrize("mh", [36])
+# @pytest.mark.parametrize("mw", [256])
+@pytest.mark.parametrize("mh", [9])
+@pytest.mark.parametrize("mw", [36])
+# @pytest.mark.parametrize("pe", [1, 4, 9, 36])
+# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256])
+@pytest.mark.parametrize("pe", [1, 3, 9])
+@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36])
 @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
-@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
-#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
-@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
-@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]])
+# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"])
+@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
+@pytest.mark.parametrize("clk_ns", [1.66, 4])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+def test_fpgadataflow_mvau_rtl(
+    mh, mw, pe, simd, idt, wdt, part, clk_ns
+):
+    if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66:
+        pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test")
+
     # Create test input vector (produced by SWG)
     ofm_shape = (5, 5)
     ofm_h, ofm_w = ofm_shape
-    ifm = helper.make_tensor_value_info(
-        "ifm",
-        TensorProto.FLOAT,
-        [1, ofm_h, ofm_w, mw]
-    )
-    weights = helper.make_tensor_value_info(
-        "weights",
-        TensorProto.FLOAT,
-        [mw, mh]
-    )
+    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
+    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
-    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
+    model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
     model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
 
-    model.save(build_dir+"/matmul.onnx")
+    model.save(build_dir + "/matmul.onnx")
 
     # Create MatMul & obtain golden reference output
-    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
     input_dict = prepare_inputs(A)
 
-    ## Execute ONNX model
-    output_matmul = oxe.execute_onnx(model, input_dict)
+    # Execute ONNX model
+    output_matmul = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/onnx_output.pkl", "wb") as f:
+        pickle.dump(output_matmul, f)
 
     # Create MVAU (HLS)
     model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
     model = model.transform(GiveUniqueNodeNames())
-    
+
     # Apply folding (i.e. specify to use DSPs)
     folding_config = {
         "Defaults": {},
         "MatrixVectorActivation_0": {
-            "PE" : pe,
-            "SIMD" : simd,
-            "mem_mode" : "decoupled",
-            "ram_style" : "auto",
-            "resType" : "dsp",
-            "impl" : "rtl"
-        }
+            "PE": pe,
+            "SIMD": simd,
+            "mem_mode": "decoupled",
+            "ram_style": "auto",
+            "resType": "dsp",
+            "preferred_backend" : "rtl"
+        },
     }
     model = model.transform(ApplyConfig(folding_config))
-    model.save(build_dir+"/mvau_hls.onnx")
-
-    model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP(part, 5))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(PrepareRTLSim())
-    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+    model.save(build_dir + "/mvau_hls.onnx")
 
     # Apply convert-to-rtl step
     model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
     model = model.transform(GiveUniqueNodeNames())
-    model.save(build_dir+"/mvau_rtl.onnx")
+    model.save(build_dir + "/mvau_rtl.onnx")
 
+    # Set the rtlsim trace (VCD) output path on every node for the node-by-node simulation
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd")
+    
     model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5))
+    model = model.transform(PrepareIP(part, clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(PrepareRTLSim())
-    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
+
+    model.save(build_dir + "/mvau_rtl_sim.onnx")
+    assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!"
+
+    model = model.transform(InsertAndSetFIFODepths(part, clk_ns))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(part, clk_ns))
 
-    model.save(build_dir+"/mvau_rtl_sim.onnx")
+    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+    model.set_metadata_prop("rtlsim_so", "")
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd")
+    model.save(build_dir + "/stitched_ip.onnx")
+    output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"]
 
-    assert (output_mvau_hls == output_mvau_rtl).all()
-    assert (output_mvau_hls.size > 0)
+    assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
\ No newline at end of file

From 9b2ccebba2c3689d6a1e55b6df027f461244d216 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 14:43:46 +0000
Subject: [PATCH 095/123] [mvu vvu axi]: reworked flow control and backpressure
 handling by tpreusser

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 130 ++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 69 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index a3b051c9a1..0168f20563 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -62,12 +62,12 @@ module mvu_vvu_axi #(
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  SF = MW / SIMD,
-	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
+	localparam int unsigned  NF = MH / PE
 )
 (
 	// Global Control
@@ -119,81 +119,73 @@ module mvu_vvu_axi #(
 				$finish;
 			end
 		end
-		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin
-				$error("VVU only supported on DSP58");
-				$finish;
-			end
-		end
 	end
 
 	uwire clk = ap_clk;
 	uwire rst = !ap_rst_n;
 
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
+	//- Replay to Accommodate Neuron Fold -----------------------------------
+	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t;
+	uwire mvu_flatin_t amvau;
 	uwire alast;
 	uwire afin;
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay (
 	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-//-------------------- Input control --------------------\\
+	//- Unflatten inputs into structured matrices ---------------------------
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
+	typedef logic         [SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
+
+	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
+	uwire  mvu_a_t  mvu_a = amvau;
+
+	//- Flow Control Bracket around Compute Core ----------------------------
 	uwire en;
 	uwire istb = avld && s_axis_weights_tvalid;
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	uwire mvauin_t amvau_i;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = PE*SIMD;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
+	//- Instantiate compute core ----------------------------
+	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
+	uwire dsp_vld;
+	uwire dsp_p_t  dsp_p;
+
+	uwire dsp_clk = ap_clk;
+	uwire dsp_en = en;
+	uwire dsp_last = alast && avld;
+	uwire dsp_zero = !istb;
+	uwire mvu_w_t dsp_w = mvu_w;
+	uwire mvu_a_t dsp_a = mvu_a;
+	uwire ovld = dsp_vld;
+	uwire dsp_p_t  odat = dsp_p;
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
 		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	default: initial begin
 		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
@@ -202,41 +194,41 @@ module mvu_vvu_axi #(
 	endcase
 
 //-------------------- Output register slice --------------------\\
+	// Make `en` computation independent from external inputs.
+	// Drive all outputs from registers.
 	struct packed {
-		logic vld;
+		logic rdy;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-
+	}  A = '{ rdy: 1, default: 'x };	// side-step register used when encountering backpressure
 	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
+	}  B = '{ vld: 0, default: 'x };	// ultimate output register
+
+	assign	en = A.rdy;
+	uwire  b_load = !B.vld || m_axis_output_tready;
 
-	assign	b_load = !B.vld || m_axis_output_tready;
 	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
+		if(rst) begin
+			A <= '{ rdy: 1, default: 'x };
+			B <= '{ vld: 0, default: 'x };
+		end
 		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+			if(A.rdy)  A.dat <= odat;
+			A.rdy <= (A.rdy && !ovld) || b_load;
+
+			if(b_load) begin
+				B <= '{
+					vld: ovld || !A.rdy,
+					dat: A.rdy? odat : A.dat
+				};
+			end
 		end
 	end
-
 	assign	m_axis_output_tvalid = B.vld;
+	// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
+	// These extra bits should never be used. Why not 'x them out?
 	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
+
 endmodule : mvu_vvu_axi

From ee9f027592e0f28deeab5cbe8d008f3be6076c92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 31 Jan 2024 09:59:17 +0000
Subject: [PATCH 096/123] Adding DSP48E1 support for 8-bit compute. Todo: finer
 core differentiation to select DSP48E2 explicitly again.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 165 ++++++++++++++++++++++++-----
 1 file changed, 139 insertions(+), 26 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 1e6855f779..f3cde9dea9 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -5,10 +5,9 @@ module mvu_8sx8u_dsp48 #(
 	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  WEIGHT_WIDTH,
 
+	int unsigned  VERSION = 1,
 	bit  SIGNED_ACTIVATIONS = 0,
-	bit  FORCE_BEHAVIORAL = 0,
-
-	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+	bit  FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
 	input	logic  clk,
@@ -49,6 +48,7 @@ module mvu_8sx8u_dsp48 #(
 	assign	vld = L[5];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+	localparam int unsigned  SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH;
 	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
 
 	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
@@ -63,8 +63,8 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
-			logic [33:0]  aa;
+			uwire [17:0]  bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
+			logic [29:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx;
 			if(1) begin : blkVectorize
@@ -99,14 +99,14 @@ module mvu_8sx8u_dsp48 #(
 				end
 			end : blkVectorize
 
-			uwire [57:0]  pp;
+			uwire [47:0]  pp;
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
 			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
-				logic signed [23:0]  B1  = 0;
+				logic signed [17:0]  B1  = 0;
 				always_ff @(posedge clk) begin
 					if(zero)     B1  <= 0;
 					else if(en)  B1  <= bb;
@@ -119,7 +119,7 @@ module mvu_8sx8u_dsp48 #(
 				end
 
 				// Stage #2: Multiply
-				logic signed [50:0]  M2 = 0;
+				logic signed [45:0]  M2 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      M2 <= 0;
 					else if(en)  M2 <=
@@ -130,7 +130,7 @@ module mvu_8sx8u_dsp48 #(
 				end
 
 				// Stage #3: Accumulate
-				logic signed [57:0]  P3 = 0;
+				logic signed [47:0]  P3 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      P3 <= 0;
 					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
@@ -140,7 +140,115 @@ module mvu_8sx8u_dsp48 #(
 			end : genBehav
 `ifndef VERILATOR
 			else begin : genDSP
-				DSP48E2 #(
+				localparam logic [6:0]  OPMODE_INVERSION = 7'b010_01_01;
+				uwire [6:0]  opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 };
+				case(VERSION)
+				1: DSP48E1 #(
+					// Feature Control Attributes: Data Path Selection
+					.A_INPUT("DIRECT"),		// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.B_INPUT("DIRECT"),		// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.USE_DPORT("TRUE"),		// Select D port usage (TRUE or FALSE)
+					.USE_MULT("MULTIPLY"),	// Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE")
+					.USE_SIMD("ONE48"),		// SIMD selection ("ONE48", "TWO24", "FOUR12")
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),		// "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH"
+					.MASK('1),							// 48-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),						// 48-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),					// "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2"
+					.SEL_PATTERN("PATTERN"),			// Select pattern value ("PATTERN" or "C")
+					.USE_PATTERN_DETECT("NO_PATDET"),	// Enable pattern detect ("PATDET" or "NO_PATDET")
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),		// Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2)
+					.ADREG(1),			// Number of pipeline stages for pre-adder (0 or 1)
+					.ALUMODEREG(0),		// Number of pipeline stages for ALUMODE (0 or 1)
+					.AREG(0),			// Number of pipeline stages for A (0, 1 or 2)
+					.BCASCREG(1),		// Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2)
+					.BREG(1),			// Number of pipeline stages for B (0, 1 or 2)
+					.CARRYINREG(0),		// Number of pipeline stages for CARRYIN (0 or 1)
+					.CARRYINSELREG(0),	// Number of pipeline stages for CARRYINSEL (0 or 1)
+					.CREG(0),			// Number of pipeline stages for C (0 or 1)
+					.DREG(0),			// Number of pipeline stages for D (0 or 1)
+					.INMODEREG(0),		// Number of pipeline stages for INMODE (0 or 1)
+					.MREG(1),			// Number of multiplier pipeline stages (0 or 1)
+					.OPMODEREG(1),		// Number of pipeline stages for OPMODE (0 or 1)
+					.PREG(1)			// Number of pipeline stages for P (0 or 1)
+				) dsp (
+					// Cascade: 30-bit (each) output: Cascade Ports
+					.ACOUT(),			// 30-bit output: A port cascade output
+					.BCOUT(),			// 18-bit output: B port cascade output
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry output
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade output
+					.PCOUT(),			// 48-bit output: Cascade output
+
+					// Control: 1-bit (each) output: Control Inputs/Status Bits
+					.OVERFLOW(),		 // 1-bit output: Overflow in add/acc output
+					.PATTERNBDETECT(),	 // 1-bit output: Pattern bar detect output
+					.PATTERNDETECT(),	 // 1-bit output: Pattern detect output
+					.UNDERFLOW(),		 // 1-bit output: Underflow in add/acc output
+
+					// Data: 4-bit (each) output: Data Ports
+					.CARRYOUT(),	// 4-bit output: Carry output
+					.P(pp),			// 48-bit output: Primary data output
+
+					// Cascade: 30-bit (each) input: Cascade Ports
+					.ACIN('x),			 // 30-bit input: A cascade data input
+					.BCIN('x),			 // 18-bit input: B cascade input
+					.CARRYCASCIN('x),	 // 1-bit input: Cascade carry input
+					.MULTSIGNIN('x),	 // 1-bit input: Multiplier sign input
+					.PCIN('x),			 // 48-bit input: P cascade input
+
+					// Control: 4-bit (each) input: Control Inputs/Status Bits
+					.CLK(clk),				// 1-bit input: Clock input
+					.ALUMODE('0),			// 4-bit input: ALU control input
+					.CARRYINSEL('0),		// 3-bit input: Carry select input
+					.INMODE(5'b01100),		// 5-bit input: INMODE control input
+					.OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input
+
+					// Data: 30-bit (each) input: Data Ports
+					.A(aa),			// 30-bit input: A data input
+					.B(bb),			// 18-bit input: B data input
+					.C('x),			// 48-bit input: C data input
+					.CARRYIN('0),	// 1-bit input: Carry input signal
+					.D(dd),			// 25-bit input: D data input
+
+					// Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable input for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable input for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable input for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable input for ALUMODEREG
+					.CEB1('0),			// 1-bit input: Clock enable input for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable input for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable input for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable input for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable input for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable input for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable input for MREG
+					.CEP(en),			// 1-bit input: Clock enable input for PREG
+					.RSTA('0),			// 1-bit input: Reset input for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+				2: DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
 					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
 					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
@@ -163,21 +271,21 @@ module mvu_8sx8u_dsp48 #(
 					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
 
 					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
-					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
-					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
-					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
-					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
-					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
-					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+					.IS_ALUMODE_INVERTED('0),							// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),							// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),								// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),							// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),						// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),						// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),								// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),								// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),							// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),								// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),								// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),							// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),								// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),								// Optional inversion for RSTP
 
 					// Register Control Attributes: Pipeline Register Configuration
 					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
@@ -225,7 +333,7 @@ module mvu_8sx8u_dsp48 #(
 					.ALUMODE(4'h0),				// 4-bit input: ALU control
 					.CARRYINSEL('0),			// 3-bit input: Carry select
 					.INMODE(5'b01100),			// 5-bit input: INMODE control
-					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+					.OPMODE({ 2'b00, opmode }),	// 9-bit input: Operation mode
 
 					// Data inputs: Data Ports
 					.A(aa),						// 34-bit input: A data
@@ -269,6 +377,11 @@ module mvu_8sx8u_dsp48 #(
 					.RSTM(rst),			// 1-bit input: Reset for MREG
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
+				default: initial begin
+					$error("Unknown version DSP48E%0d.", VERSION);
+					$finish;
+				end
+				endcase
 			end : genDSP
 `endif
 

From 3ab82966e1af64aa6ddb75f88561c5e6c86196b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 31 Jan 2024 10:15:32 +0000
Subject: [PATCH 097/123] Adding DSP48E1 support for 4-bit compute. Todo: finer
 core differentiation to select DSP48E2 explicitly again.

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 169 +++++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 27 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 7a2af35742..b49315637f 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -2,8 +2,10 @@ module mvu_4sx4u #(
 	int unsigned  PE,
 	int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	bit FORCE_BEHAVIORAL = 0
+
+	int unsigned  VERSION = 1,
+	bit  SIGNED_ACTIVATIONS = 0,
+	bit  FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
 	input	logic  clk,
@@ -14,7 +16,7 @@ module mvu_4sx4u #(
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
 	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations
+	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations (override by SIGNED_ACTIVATIONS)
 
 	// Ouput
 	output	logic  vld,
@@ -58,8 +60,8 @@ module mvu_4sx4u #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
-			logic [33:0]  aa;
+			uwire [17:0]  bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
+			logic [29:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx[3:1];
 			if(1) begin : blkVectorize
@@ -94,14 +96,14 @@ module mvu_4sx4u #(
 				end
 			end : blkVectorize
 
-			uwire [57:0]  pp;
+			uwire [47:0]  pp;
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
-				logic signed [23:0]  B1  = 0;
+				logic signed [17:0]  B1  = 0;
 				always_ff @(posedge clk) begin
 					if(zero)     B1  <= 0;
 					else if(en)  B1  <= bb;
@@ -114,7 +116,7 @@ module mvu_4sx4u #(
 				end
 
 				// Stage #2: Multiply
-				logic signed [50:0]  M2 = 0;
+				logic signed [45:0]  M2 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      M2 <= 0;
 					else if(en)  M2 <=
@@ -125,7 +127,7 @@ module mvu_4sx4u #(
 				end
 
 				// Stage #3: Accumulate
-				logic signed [57:0]  P3 = 0;
+				logic signed [47:0]  P3 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      P3 <= 0;
 					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
@@ -135,7 +137,115 @@ module mvu_4sx4u #(
 			end : genBehav
 `ifndef VERILATOR
 			else begin : genDSP
-				DSP48E2 #(
+				localparam logic [6:0]  OPMODE_INVERSION = 7'b010_01_01;
+				uwire [6:0]  opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 };
+				case(VERSION)
+				1: DSP48E1 #(
+					// Feature Control Attributes: Data Path Selection
+					.A_INPUT("DIRECT"),		// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.B_INPUT("DIRECT"),		// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.USE_DPORT("TRUE"),		// Select D port usage (TRUE or FALSE)
+					.USE_MULT("MULTIPLY"),	// Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE")
+					.USE_SIMD("ONE48"),		// SIMD selection ("ONE48", "TWO24", "FOUR12")
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),		// "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH"
+					.MASK('1),							// 48-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),						// 48-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),					// "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2"
+					.SEL_PATTERN("PATTERN"),			// Select pattern value ("PATTERN" or "C")
+					.USE_PATTERN_DETECT("NO_PATDET"),	// Enable pattern detect ("PATDET" or "NO_PATDET")
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),		// Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2)
+					.ADREG(1),			// Number of pipeline stages for pre-adder (0 or 1)
+					.ALUMODEREG(0),		// Number of pipeline stages for ALUMODE (0 or 1)
+					.AREG(0),			// Number of pipeline stages for A (0, 1 or 2)
+					.BCASCREG(1),		// Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2)
+					.BREG(1),			// Number of pipeline stages for B (0, 1 or 2)
+					.CARRYINREG(0),		// Number of pipeline stages for CARRYIN (0 or 1)
+					.CARRYINSELREG(0),	// Number of pipeline stages for CARRYINSEL (0 or 1)
+					.CREG(0),			// Number of pipeline stages for C (0 or 1)
+					.DREG(0),			// Number of pipeline stages for D (0 or 1)
+					.INMODEREG(0),		// Number of pipeline stages for INMODE (0 or 1)
+					.MREG(1),			// Number of multiplier pipeline stages (0 or 1)
+					.OPMODEREG(1),		// Number of pipeline stages for OPMODE (0 or 1)
+					.PREG(1)			// Number of pipeline stages for P (0 or 1)
+				) dsp (
+					// Cascade: 30-bit (each) output: Cascade Ports
+					.ACOUT(),			// 30-bit output: A port cascade output
+					.BCOUT(),			// 18-bit output: B port cascade output
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry output
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade output
+					.PCOUT(),			// 48-bit output: Cascade output
+
+					// Control: 1-bit (each) output: Control Inputs/Status Bits
+					.OVERFLOW(),		 // 1-bit output: Overflow in add/acc output
+					.PATTERNBDETECT(),	 // 1-bit output: Pattern bar detect output
+					.PATTERNDETECT(),	 // 1-bit output: Pattern detect output
+					.UNDERFLOW(),		 // 1-bit output: Underflow in add/acc output
+
+					// Data: 4-bit (each) output: Data Ports
+					.CARRYOUT(),	// 4-bit output: Carry output
+					.P(pp),			// 48-bit output: Primary data output
+
+					// Cascade: 30-bit (each) input: Cascade Ports
+					.ACIN('x),			 // 30-bit input: A cascade data input
+					.BCIN('x),			 // 18-bit input: B cascade input
+					.CARRYCASCIN('x),	 // 1-bit input: Cascade carry input
+					.MULTSIGNIN('x),	 // 1-bit input: Multiplier sign input
+					.PCIN('x),			 // 48-bit input: P cascade input
+
+					// Control: 4-bit (each) input: Control Inputs/Status Bits
+					.CLK(clk),				// 1-bit input: Clock input
+					.ALUMODE('0),			// 4-bit input: ALU control input
+					.CARRYINSEL('0),		// 3-bit input: Carry select input
+					.INMODE(5'b01100),		// 5-bit input: INMODE control input
+					.OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input
+
+					// Data: 30-bit (each) input: Data Ports
+					.A(aa),			// 30-bit input: A data input
+					.B(bb),			// 18-bit input: B data input
+					.C('x),			// 48-bit input: C data input
+					.CARRYIN('0),	// 1-bit input: Carry input signal
+					.D(dd),			// 25-bit input: D data input
+
+					// Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable input for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable input for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable input for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable input for ALUMODEREG
+					.CEB1('0),			// 1-bit input: Clock enable input for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable input for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable input for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable input for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable input for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable input for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable input for MREG
+					.CEP(en),			// 1-bit input: Clock enable input for PREG
+					.RSTA('0),			// 1-bit input: Reset input for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+				2: DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
 					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
 					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
@@ -158,21 +268,21 @@ module mvu_4sx4u #(
 					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
 
 					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
-					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
-					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
-					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
-					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
-					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
-					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+					.IS_ALUMODE_INVERTED('0),							// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),							// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),								// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),							// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),						// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),						// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),								// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),								// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),							// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),								// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),								// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),							// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),								// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),								// Optional inversion for RSTP
 
 					// Register Control Attributes: Pipeline Register Configuration
 					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
@@ -220,7 +330,7 @@ module mvu_4sx4u #(
 					.ALUMODE(4'h0),				// 4-bit input: ALU control
 					.CARRYINSEL('0),			// 3-bit input: Carry select
 					.INMODE(5'b01100),			// 5-bit input: INMODE control
-					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+					.OPMODE({ 2'b00, opmode }),	// 9-bit input: Operation mode
 
 					// Data inputs: Data Ports
 					.A(aa),						// 34-bit input: A data
@@ -264,6 +374,11 @@ module mvu_4sx4u #(
 					.RSTM(rst),			// 1-bit input: Reset for MREG
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
+				default: initial begin
+					$error("Unknown version DSP48E%0d.", VERSION);
+					$finish;
+				end
+				endcase
 			end : genDSP
 `endif
 

From 23c3f82a87a405d996ad6e3b096ca9352314adf1 Mon Sep 17 00:00:00 2001
From: johnnoel <johnnoel@xilinx.com>
Date: Wed, 31 Jan 2024 10:36:52 +0000
Subject: [PATCH 098/123] [Tests] Temporarily re-enable SWG exception for
 bnn_w2_a2_cnv_Pynq-Z1 test

---
 tests/end2end/test_end2end_bnn_pynq.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index b296dad827..9fb41ec78e 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -653,7 +653,13 @@ def test_set_fifo_depths(self, topology, wbits, abits, board):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(board, target_clk_ns)["part"]
-        model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
+        if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1":
+            # Enabling swg_exception for this single test case. Disabling the exception results in a design
+            # that exceeds the resources of the Pynq-Z1 board. In future this should be revisited and handled
+            # correctly as the swg_exception is poorly justified.
+            model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True))
+        else:
+            model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
         fifo_layers = model.get_nodes_by_op_type("StreamingFIFO")
         assert len(fifo_layers) > 0
         model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board))

From 562d153b96c96ac28968d01a9f09b2be9471ea17 Mon Sep 17 00:00:00 2001
From: johnnoel <johnnoel@xilinx.com>
Date: Wed, 31 Jan 2024 13:37:50 +0000
Subject: [PATCH 099/123] [Tests] Fix fpgadataflow split large fifos test

---
 tests/fpgadataflow/test_split_large_fifos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py
index 3061696a68..653e1e7896 100644
--- a/tests/fpgadataflow/test_split_large_fifos.py
+++ b/tests/fpgadataflow/test_split_large_fifos.py
@@ -54,7 +54,7 @@ def fetch_test_model(topology, wbits=2, abits=2):
 def get_folding_cfg(depth=65536):
     cfg = dict()
     cfg["Defaults"] = dict()
-    for i in range(3):
+    for i in range(4):
         key = "StreamingFIFO_" + str(i)
         cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"}
     return cfg

From a884e11ff52023e68a0f798c47bf777bacb873df Mon Sep 17 00:00:00 2001
From: johnnoel <johnnoel@xilinx.com>
Date: Wed, 31 Jan 2024 13:48:05 +0000
Subject: [PATCH 100/123] Fix linting

---
 tests/end2end/test_end2end_bnn_pynq.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 9fb41ec78e..db065fec42 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -654,10 +654,12 @@ def test_set_fifo_depths(self, topology, wbits, abits, board):
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(board, target_clk_ns)["part"]
         if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1":
-            # Enabling swg_exception for this single test case. Disabling the exception results in a design
-            # that exceeds the resources of the Pynq-Z1 board. In future this should be revisited and handled
-            # correctly as the swg_exception is poorly justified.
-            model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True))
+            # Enabling swg_exception for this single test case. Disabling the exception results in
+            # a design that exceeds the resources of the Pynq-Z1 board. In future this should be
+            # revisited and handled correctly as the swg_exception is poorly justified.
+            model = model.transform(
+                InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True)
+            )
         else:
             model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
         fifo_layers = model.get_nodes_by_op_type("StreamingFIFO")

From bcd72ad90f066ffab173dd0c132e553a1f4b2cd6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:20:07 +0000
Subject: [PATCH 101/123] [mvu vvu axi]: minor bugfixes to enable VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 55 +++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 0168f20563..014481b29a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -60,14 +60,13 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
-	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  SF = MW / SIMD,
-	localparam int unsigned  NF = MH / PE
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -151,28 +150,36 @@ module mvu_vvu_axi #(
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-	//- Instantiate compute core ----------------------------
-	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
-	uwire dsp_vld;
-	uwire dsp_p_t  dsp_p;
-
-	uwire dsp_clk = ap_clk;
-	uwire dsp_en = en;
-	uwire dsp_last = alast && avld;
-	uwire dsp_zero = !istb;
-	uwire mvu_w_t dsp_w = mvu_w;
-	uwire mvu_a_t dsp_a = mvu_a;
-	uwire ovld = dsp_vld;
-	uwire dsp_p_t  odat = dsp_p;
+//-------------------- Core MVU/VVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau;
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
+		localparam int num_of_elements = PE*SIMD;
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
 		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
+			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (

From b1167334cf206f8cc550018594e989e9798768ce Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:26:30 +0000
Subject: [PATCH 102/123] [mvu tb]: created separate vvu testbench and renamed
 mvu_vvu_axi tb

---
 .../tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv}   |  16 +-
 finn-rtllib/mvu/tb/vvu_axi_tb.sv              | 227 ++++++++++++++++++
 2 files changed, 235 insertions(+), 8 deletions(-)
 rename finn-rtllib/mvu/tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv} (96%)
 create mode 100644 finn-rtllib/mvu/tb/vvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
similarity index 96%
rename from finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
rename to finn-rtllib/mvu/tb/mvu_axi_tb.sv
index b46fc588c9..8614e9f811 100644
--- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -31,24 +31,24 @@
  * @brief	Testbench for MVU AXI-lite interface wrapper.
  *****************************************************************************/
 
-module mvu_vvu_axi_tb();
+module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
 	localparam bit IS_MVU = 0;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
 	localparam int unsigned MW = 36;
-	localparam int unsigned MH = 1;
-	localparam int unsigned SIMD = 3;
+	localparam int unsigned MH = 4;
+	localparam int unsigned SIMD = 36;
 	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 1.0;
+	localparam int unsigned SEGMENTLEN = 2.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 6;
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
+	localparam bit SIGNED_ACTIVATIONS = 0;
 	// Simulation constants
 	localparam int unsigned NF = IS_MVU ? MH/PE : 1;
 	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE);
@@ -238,4 +238,4 @@ module mvu_vvu_axi_tb();
 		.m_axis_output_tready(outputs.rdy)
 	);
 
-endmodule : mvu_vvu_axi_tb
+endmodule : mvu_axi_tb
diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv
new file mode 100644
index 0000000000..fbb45845e1
--- /dev/null
+++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv
@@ -0,0 +1,227 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for the mvu_vvu_axi wrapper in VVU mode (IS_MVU = 0).
+ *****************************************************************************/
+
+module vvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 0;
+	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+	localparam int unsigned MW = 25; // Kernel*Kernel
+	localparam int unsigned MH = 4; // Channels
+	localparam int unsigned SIMD = 25; // MW%SIMD == 0
+	localparam int unsigned PE = 2; // MH%PE == 0
+	localparam int unsigned SEGMENTLEN = 3.0;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[NF*SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin  // Drive the activation stream: one ACTIVATIONS entry per completed handshake
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);  // hold off until reset is released
+
+		for (int i=0; i<SF*NF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 >= 0;  // NOTE(review): unsigned result >= 0 is always true, so vld is never randomly deasserted -- confirm intent
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));  // repeat until vld && rdy are both seen at a clock edge
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin  // Drive the weight stream: vld stays asserted, data advances on each ready edge
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);  // hold off until reset is released
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);  // advance only on a clock edge where weights.rdy is set
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output
+	// a: [NF*SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);  // golden model: per-PE sum of a*w over all SF folds and SIMD lanes
+		automatic output_vector_t res = '{default: 0};
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)  // activation for (PE k, SIMD l) is read from flat index k + l*PE
+							res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]);
+						else  // unsigned activations: a 0 bit is prepended before the $signed() cast
+							res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]);
+					end
+				end
+			end
+		end
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin  // Consume NF output words and compare each PE lane against GOLDEN_OUTPUT
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 0;  // NOTE(review): unsigned result >= 0 is always true, so rdy is never randomly deasserted -- confirm intent
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;  // halt the simulation on the first mismatch
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;  // all NF output words have been checked
+	end
+
+	// Instantiate DUT
+	mvu_vvu_axi #(
+		.IS_MVU(IS_MVU),
+		.COMPUTE_CORE(COMPUTE_CORE),
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
+		.M_REG_LUT(M_REG_LUT)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : vvu_axi_tb

From e1f8db14faf969c422b6f362c0b9329a8be6269e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:35:45 +0000
Subject: [PATCH 103/123] [mvu vvu axi]: minor fix -- define mvauin_weight_t

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 014481b29a..20be83910a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -154,6 +154,7 @@ module mvu_vvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	uwire mvauin_t amvau_i;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;

From 88da6965ee560b53f672229012eccca2c343111a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:43:58 +0000
Subject: [PATCH 104/123] [folding]: first attempt to extend folding
 transformation to parallelize multi-packed DSPs in MVU/VVU more efficiently

---
 .../fpgadataflow/set_folding.py               | 75 +++++++++++++++----
 1 file changed, 60 insertions(+), 15 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index eca1053f8f..871919f3f2 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -31,6 +31,7 @@
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.core.datatype import DataType
 
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
@@ -80,11 +81,12 @@ class SetFolding(Transformation):
       unfolded before SIMD is increased
     """
 
-    def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True):
+    def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True, fpga_part=None):
         super().__init__()
         self.target_cycles_per_frame = target_cycles_per_frame
         self.mvau_wwidth_max = mvau_wwidth_max
         self.two_pass_relaxation = two_pass_relaxation
+        self.fpga_part = fpga_part
 
     def optimize_attribute_val(self, node_inst, max_val, attr_name):
         node_inst.set_nodeattr(attr_name, 1)
@@ -95,6 +97,10 @@ def optimize_attribute_val(self, node_inst, max_val, attr_name):
                 # finish if target met
                 break
 
+    def _is_versal(self, fpga_part):  # True if the part-name prefix is one of the Versal prefixes listed below
+        assert fpga_part is not None, "Please specify a target board before setting the folding configuration for a more efficient folding configuration for RTL-based MVU/VVU"
+        return fpga_part[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpga_part[0:5] == "xqrvc"
+
     def apply(self, model):
         graph = model.graph
         # these ops use PE parallelism, up to a max value of NumChannels
@@ -112,13 +118,14 @@ def apply(self, model):
         simd_ops = [
             "DownSampler",
             "FMPadding_Batch",
+            "FMPadding_Batch_rtl",
             "ConvolutionInputGenerator",
             "ConvolutionInputGenerator1D",
             "ConvolutionInputGenerator_rtl",
         ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
-        depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"]
+        depthwise_op_exceptions = ["VectorVectorActivation", "VectorVectorActivation_rtl", "Pool_Batch"]
         for node in graph.node:
             if not is_fpgadataflow_node(node):
                 continue
@@ -148,6 +155,37 @@ def apply(self, model):
                         break
                 # increase PE until target met or reached max_pe
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
+            if op_type == "MatrixVectorActivation_rtl":
+                max_simd = node_inst.get_nodeattr("MW")
+                max_pe = node_inst.get_nodeattr("MH")
+                node_inst.set_nodeattr("PE", 1)
+                node_inst.set_nodeattr("SIMD", 1)
+                # Depending on the board and the layer's config, either the
+                # SIMD or PE folding dimension would be preferred to enable efficient DSP-packing
+                act_width = DataType[node_inst.get_nodeattr("inputDataType")].bitwidth()
+                weight_width = DataType[node_inst.get_nodeattr("weightDataType")].bitwidth()
+                is_versal = self._is_versal(self.fpga_part)
+                is_dsp48 = act_width < 5 and weight_width < 5 or not(is_versal)
+                preferred_folding_dimension = "PE" if is_dsp48 else "SIMD"
+                preferred_folding_max = max_pe if is_dsp48 else max_simd
+                second_folding_dimension = "SIMD" if is_dsp48 else "PE"
+                second_folding_max = max_simd if is_dsp48 else max_pe
+                for fold_val in divisors(preferred_folding_max):
+                    prev_fold_val = node_inst.get_nodeattr(preferred_folding_dimension)
+                    node_inst.set_nodeattr(preferred_folding_dimension, fold_val)
+                    cyc = node_inst.get_exp_cycles()
+                    if cyc < self.target_cycles_per_frame:
+                        # finish if target met
+                        break
+                    if (
+                        node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr(preferred_folding_dimension)
+                        > self.mvau_wwidth_max
+                    ):
+                        # revert if we've gone above width threshold
+                        node_inst.set_nodeattr(preferred_folding_dimension, prev_fold_val)
+                        break
+                # increase SIMD until target met or reached max_simd
+                self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension)
             elif op_type in pe_ops:
                 max_pe = node_inst.get_nodeattr("NumChannels")
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
@@ -156,37 +194,44 @@ def apply(self, model):
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
             elif op_type in depthwise_op_exceptions:
                 # init/reset SIMD of VVAU
-                if op_type == "VectorVectorActivation":
-                    node_inst.set_nodeattr("SIMD", 1)
+                is_hls_vvu_or_pool = op_type in ["VectorVectorActivation", "Pool_Batch"]
                 max_pe = node_inst.get_nodeattr("Channels")
-                self.optimize_attribute_val(node_inst, max_pe, "PE")
-                # increase SIMD for VVAU once PE is exhausted
-                pe = node_inst.get_nodeattr("PE")
+                max_simd = np.prod(node_inst.get_nodeattr("Kernel")) if op_type.startswith("VectorVectorActivation") else 0
+                preferred_folding_dimension = "PE" if is_hls_vvu_or_pool else "SIMD"
+                preferred_folding_max = max_pe if is_hls_vvu_or_pool else max_simd
+                second_folding_dimension = "SIMD" if is_hls_vvu_or_pool else "PE"
+                second_folding_max = max_simd if is_hls_vvu_or_pool else max_pe
+                if op_type.startswith("VectorVectorActivation"):
+                    node_inst.set_nodeattr(second_folding_dimension, 1)
+                self.optimize_attribute_val(node_inst, preferred_folding_max, preferred_folding_dimension)
+                # increase SIMD(/PE) for VVAU once PE(/SIMD) is exhausted
+                fold_val = node_inst.get_nodeattr(preferred_folding_dimension)
                 cyc = node_inst.get_exp_cycles()
                 if (
-                    op_type == "VectorVectorActivation"
-                    and pe == max_pe
+                    op_type.startswith("VectorVectorActivation")
+                    and fold_val == preferred_folding_max
                     and cyc > self.target_cycles_per_frame
                 ):
-                    max_simd = np.prod(node_inst.get_nodeattr("Kernel"))
-                    self.optimize_attribute_val(node_inst, max_simd, "SIMD")
-                # also set the folding of the upsteam DW SWU
+                    self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension)
+                # also set the folding of the upsteam DW SWU (in case of HLS-based VVU)
                 # which must be identical to this node
                 swu_node = model.find_producer(node.input[0])
                 if swu_node.op_type.startswith("ConvolutionInputGenerator"):
                     swu_node_inst = getCustomOp(swu_node)
-                    swu_node_inst.set_nodeattr("SIMD", pe)
                     # enable parallel_window mode of RTL SWG if needed
                     if swu_node.op_type == "ConvolutionInputGenerator_rtl":
                         if (
-                            op_type == "VectorVectorActivation"
+                            op_type.startswith("VectorVectorActivation")
                             and node_inst.get_nodeattr("SIMD") > 1
                         ):
                             swu_node_inst.set_nodeattr("parallel_window", 1)
+                            swu_node_inst.set_nodeattr("SIMD", max_pe)
                         else:
                             swu_node_inst.set_nodeattr("parallel_window", 0)
+                            pe = node_inst.get_nodeattr("PE")
+                            swu_node_inst.set_nodeattr("SIMD", pe)
                 else:
-                    if op_type == "VectorVectorActivation":
+                    if op_type.startswith("VectorVectorActivation"):
                         ksize = np.prod(node_inst.get_nodeattr("Kernel"))
                     elif op_type == "Pool_Batch":
                         ksize = node_inst.get_nodeattr("KernelSize")

From 1814ea08ccdb995107faf54000a0ecdb52c292b1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:57:19 +0000
Subject: [PATCH 105/123] [mvu axi]: update list of deduced parameters

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 20be83910a..f2b030342b 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -60,13 +60,14 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  SF = MW / SIMD,
+	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
 )
 (
 	// Global Control

From f939c3e845b75bd940f4f2b2453b416c07a28457 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 14:43:46 +0000
Subject: [PATCH 106/123] [mvu vvu axi]: reworked flow control and backpressure
 handling by tpreusser

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 45 ++++++++++++++--------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index f2b030342b..0168f20563 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -62,12 +62,12 @@ module mvu_vvu_axi #(
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  SF = MW / SIMD,
-	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
+	localparam int unsigned  NF = MH / PE
 )
 (
 	// Global Control
@@ -151,37 +151,28 @@ module mvu_vvu_axi #(
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	uwire mvauin_t amvau_i;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = PE*SIMD;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
+	//- Instantiate compute core ----------------------------
+	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
+	uwire dsp_vld;
+	uwire dsp_p_t  dsp_p;
+
+	uwire dsp_clk = ap_clk;
+	uwire dsp_en = en;
+	uwire dsp_last = alast && avld;
+	uwire dsp_zero = !istb;
+	uwire mvu_w_t dsp_w = mvu_w;
+	uwire mvu_a_t dsp_a = mvu_a;
+	uwire ovld = dsp_vld;
+	uwire dsp_p_t  odat = dsp_p;
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
 		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (

From ef12de1a86111cfab783640cd3a2a835de2791fe Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 6 Feb 2024 14:13:19 +0000
Subject: [PATCH 107/123] [mvu/vvu axi]: picked out modifications from another
 branch to enable VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 0168f20563..3affe4bb7b 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -62,7 +62,7 @@ module mvu_vvu_axi #(
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
@@ -125,25 +125,44 @@ module mvu_vvu_axi #(
 	uwire rst = !ap_rst_n;
 
 	//- Replay to Accommodate Neuron Fold -----------------------------------
-	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t;
+	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_flatin_t;
 	uwire mvu_flatin_t amvau;
 	uwire alast;
 	uwire afin;
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay (
 	.clk, .rst,
 	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
 	//- Unflatten inputs into structured matrices ---------------------------
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
-	typedef logic         [SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
-
+	localparam int unsigned  ACT_PE = IS_MVU? 1 : PE;
+	typedef logic [PE    -1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
+	typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
+
+	//- Conditional Activations Layout Adjustment for VVU
+	uwire mvu_a_t  amvau_i;
+	if (IS_MVU || (PE == 1)) begin : genMVUInput
+		assign  amvau_i = amvau;
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		for(genvar  pe = 0; pe < ACT_PE; pe++) begin
+			for(genvar  simd = 0; simd < SIMD; simd++) begin
+				assign	amvau_i[pe][simd] = amvau[simd*ACT_PE+pe];
+			end
+		end
+	end : genVVUInput
+	
 	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
-	uwire  mvu_a_t  mvu_a = amvau;
+	uwire  mvu_a_t  mvu_a = amvau_i;
 
 	//- Flow Control Bracket around Compute Core ----------------------------
 	uwire en;

From 3d49ab5a204b0428420ca171ff7aba3b89b52cb9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 6 Feb 2024 14:28:40 +0000
Subject: [PATCH 108/123] [mvu test]: cleaned up test

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index 8614e9f811..62aa0919f4 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -35,13 +35,13 @@ module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam bit IS_MVU = 0;
+	localparam bit IS_MVU = 1;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
 	localparam int unsigned MW = 36;
 	localparam int unsigned MH = 4;
-	localparam int unsigned SIMD = 36;
-	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 2.0;
+	localparam int unsigned SIMD = 9;
+	localparam int unsigned PE = 2;
+	localparam int unsigned SEGMENTLEN = 1.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
@@ -156,16 +156,6 @@ module mvu_axi_tb();
 
 	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
 		automatic output_vector_t res = '{default: 0};
-		// for (int j = 0; j<MH; j++) begin
-		// 	for (int i = 0; i<MW; i++) begin
-		// 		if (SIGNED_ACTIVATIONS)
-		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]);
-		// 		else
-		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]);
-		// 	end
-		// end
 		// The input stream will have the channels interleaved for VVU when PE>1
 		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:

From 105ae6fde79d9b7eca17f23fc3f7c80b0db51f6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 07:58:41 +0100
Subject: [PATCH 109/123] Revised control interface attributes.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 93 ++++++++++++++++++++++++++
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v  |  2 +-
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
new file mode 100644
index 0000000000..e15f77fbae
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -0,0 +1,93 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(	// Thin wrapper around mvu_8sx9_axi; the dollar-delimited tokens look like template placeholders (presumably substituted by the code generator)
+	parameter 	MW = $MW$,	// MW through RAM_STYLE are forwarded unchanged to the wrapped core
+	parameter	MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+
+	// Safely deducible parameters: derived from the ones above; the *_BA widths are rounded up to a multiple of 8 bits and only size the ports below
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+	// Global Control: the attributes tie all three stream interfaces and the reset to ap_clk and mark the reset as active-low
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
+
+	// Weight Stream (data and valid in, ready out)
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
+
+	// Input Stream (data and valid in, ready out)
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
+
+	// Output Stream (data and valid out, ready in)
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
+);
+// Every port is connected 1:1 by name to the core instance; the wrapper adds no logic of its own.
+mvu_8sx9_axi #(	// NOTE(review): confirm a core of this name is still provided; the sibling wrapper instantiates mvu_vvu_axi
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 01deb23840..99178880f7 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -89,4 +89,4 @@ mvu_vvu_axi #(
 	.m_axis_output_tready(out_V_TREADY)
 );
 
-endmodule // $MODULE_NAME_AXI_WRAPPER$
+endmodule : $MODULE_NAME_AXI_WRAPPER$

From 936ef69bb868d2702472d7c1a6c3767a11263cf4 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 15:49:19 +0100
Subject: [PATCH 110/123] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 4

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index b49315637f..304637dd31 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -440,7 +440,7 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
-			else if (i < 3) begin : genHiZero
+			else begin : genHiZero
 				assign hi4[i] = '0;
 			end : genHiZero
 

From 9bf7e33408b84a7facaf0c0785eef5c5f053bfea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 29 Sep 2023 15:24:28 +0100
Subject: [PATCH 111/123] Starting on pumped DSP compute.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 224 ++++++++++++++++++++++++++-------
 1 file changed, 182 insertions(+), 42 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 3affe4bb7b..1b690195f3 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -51,27 +51,27 @@ module mvu_vvu_axi #(
 	int unsigned MH,
 	int unsigned PE,
 	int unsigned SIMD,
+	int unsigned SEGMENTLEN = 0,
+
 	int unsigned ACTIVATION_WIDTH,
 	int unsigned WEIGHT_WIDTH,
 	int unsigned ACCU_WIDTH,
 	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
+
+	bit PUMPED_COMPUTE = 0,	// requires an even SIMD % 2 == 0
 	bit FORCE_BEHAVIORAL = 0,
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
-	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  SF = MW / SIMD,
-	localparam int unsigned  NF = MH / PE
-)
-(
+	localparam int unsigned  WEIGHT_STREAM_WIDTH    = PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH     = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA  = (INPUT_STREAM_WIDTH  + 7)/8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)(
 	// Global Control
 	input	logic  ap_clk,
+	input	logic  ap_clk2x,	// only used when PUMPED_COMPUTE
 	input	logic  ap_rst_n,
 
 	// Weight Stream
@@ -119,23 +119,39 @@ module mvu_vvu_axi #(
 				$finish;
 			end
 		end
+		if (!IS_MVU) begin
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
+
+		//- Pumping Constraints ---------
+		if(PUMPED_COMPUTE) begin
+			if(SIMD % 2 != 0) begin
+				$error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD);
+				$finish;
+			end
+		end
 	end
 
 	uwire clk = ap_clk;
 	uwire rst = !ap_rst_n;
 
 	//- Replay to Accommodate Neuron Fold -----------------------------------
-	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_flatin_t;
+	typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_flatin_t;
 	uwire mvu_flatin_t amvau;
 	uwire alast;
 	uwire afin;
 	uwire avld;
 	uwire ardy;
 
+	localparam int unsigned  SF = MW/SIMD;
+	localparam int unsigned  NF = MH/PE;
 	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay (
-	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
-	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
 	//- Unflatten inputs into structured matrices ---------------------------
@@ -143,6 +159,8 @@ module mvu_vvu_axi #(
 	typedef logic [PE    -1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
 	typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
 
+	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
+
 	//- Conditional Activations Layout Adjustment for VVU
 	uwire mvu_a_t  amvau_i;
 	if (IS_MVU || (PE == 1)) begin : genMVUInput
@@ -184,33 +202,155 @@ module mvu_vvu_axi #(
 	uwire ovld = dsp_vld;
 	uwire dsp_p_t  odat = dsp_p;
 
-	case(COMPUTE_CORE)
-	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
-		);
-	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
-		);
-	"mvu_8sx8u_dsp48":
-		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
-		);
-	default: initial begin
-		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
-		$finish;
-	end
-	endcase
+	//- Flow Control Bracket around Compute Core ----------------------------
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;
+	assign ardy = en && s_axis_weights_tvalid;
+	assign s_axis_weights_tready = en && avld;
+
+	//- Conditionally Pumped DSP Compute ------------------------------------
+	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
+	uwire  ovld;
+	uwire dsp_p_t  odat;
+	if(1) begin : blkDsp
+		localparam int unsigned  DSP_SIMD = SIMD/(PUMPED_COMPUTE+1);
+		typedef logic [PE    -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH    -1:0]  dsp_w_t;
+		typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  dsp_a_t;
+
+		uwire  dsp_clk;
+		uwire  dsp_en;
+
+		uwire  dsp_last;
+		uwire  dsp_zero;
+		uwire dsp_w_t  dsp_w;
+		uwire dsp_a_t  dsp_a;
+
+		uwire  dsp_vld;
+		uwire dsp_p_t  dsp_p;
+
+		if(!PUMPED_COMPUTE) begin : genUnpumpedCompute
+			assign	dsp_clk = clk;
+			assign	dsp_en  = en;
+
+			assign	dsp_last = alast && avld;
+			assign	dsp_zero = !istb;
+			assign	dsp_w = mvu_w;
+			assign	dsp_a = amvau_i;
+
+			assign	ovld = dsp_vld;
+			assign	odat = dsp_p;
+		end : genUnpumpedCompute
+		else begin : genPumpedCompute
+			assign	dsp_clk = clk2x;
+
+			// Identify second fast cycle before active slow clock edge
+			logic  Active = 0;
+			always_ff @(posedge clk2x)  Active <= clk;
+
+			// The input for a slow cycle is split across two fast cycles along the SIMD dimension.
+			//	- Both fast cycles are controlled by the same enable state.
+			//	- A zero cycle is duplicated across both fast cycles.
+			//	- The last flag must be restricted to the second fast cycle.
+			logic  En = 0;
+			logic  Last[1:0] = '{ default: 1'b0 };
+			logic  Zero = 1;
+			dsp_w_t  W[1:0] = '{ default: 'x };
+			dsp_a_t  A[1:0] = '{ default: 'x };
+			always_ff @(posedge clk2x) begin
+				if(rst) begin
+					En   <= 0;
+					Last <= '{ default: 1'b0 };
+					Zero <=  1;
+					W <= '{ default: 'x };
+					A <= '{ default: 'x };
+				end
+				else begin
+					if(Active) begin
+						En <= en;
+						if(en) begin
+							Last <= '{ alast && avld, 1'b0 };
+							Zero <= !istb;
+							for(int unsigned  simd = 0; simd < SIMD; simd++) begin
+								for(int unsigned  pe = 0; pe < PE; pe++) begin
+									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd];
+								end
+								for(int unsigned  pe = 0; pe < ACT_PE; pe++) begin
+									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd];
+								end
+							end
+						end
+					end
+					else if(En) begin
+						Last <= '{ 'x, Last[1] };
+						W    <= '{ 'x, W[1] };
+						A    <= '{ 'x, A[1] };
+					end
+				end
+			end
+			assign	dsp_en = En;
+
+			assign	dsp_last = Last[0];
+			assign	dsp_zero = Zero;
+			assign	dsp_w = W[0];
+			assign	dsp_a = A[0];
+
+			// Since no two consecutive last cycles will ever be asserted on the input,
+			// valid outputs will also always be spaced by, at least, one other cycle.
+			// We can always hold a captured output for two cycles to allow the slow
+			// clock to pick it up.
+			logic    Vld = 0;
+			dsp_p_t  P = 'x;
+			always_ff @(posedge clk2x) begin
+				if(rst) begin
+					Vld <= 0;
+					P   <= 'x;
+				end
+				else begin
+					if(dsp_vld)  P <= dsp_p;
+					Vld <= dsp_vld || (Vld && !Active);
+				end
+			end
+			assign	ovld = Vld;
+			assign	odat = P;
+
+		end : genPumpedCompute
+
+		case(COMPUTE_CORE)
+		"mvu_vvu_8sx9_dsp58":
+			mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+			.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+			.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		"mvu_4sx4u":
+			mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		"mvu_8sx8u_dsp48":
+			mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+			.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		"mvu_vvu_lut":
+			mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+			.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		default: initial begin
+			$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
+			$finish;
+		end
+		endcase
+
+	end : blkDsp
 
 //-------------------- Output register slice --------------------\\
 	// Make `en`computation independent from external inputs.

From 80a5510cdb06b443c9d71cc5180f1c8bc6569886 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 14:00:19 +0000
Subject: [PATCH 112/123] pulled latest changes related to double-pumping

---
 finn-rtllib/mvu/mvu_4sx4u.sv          |   2 +-
 finn-rtllib/mvu/mvu_vvu_axi.sv        | 136 ++++++++++++------------
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v |   9 +-
 finn-rtllib/mvu/tb/mvu_axi_tb.sv      |  18 +++-
 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv    | 142 ++++++++++++++++++++++++++
 5 files changed, 227 insertions(+), 80 deletions(-)
 create mode 100644 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 304637dd31..b49315637f 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -440,7 +440,7 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
-			else begin : genHiZero
+			else if (i < 3) begin : genHiZero
 				assign hi4[i] = '0;
 			end : genHiZero
 
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 1b690195f3..d40c5e1b10 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -67,11 +67,13 @@ module mvu_vvu_axi #(
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
 	localparam int unsigned  INPUT_STREAM_WIDTH     = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA  = (INPUT_STREAM_WIDTH  + 7)/8 * 8,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned  OUTPUT_STREAM_WIDTH    = PE*ACCU_WIDTH,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8,
+	localparam bit  		 SIMD_UNEVEN = SIMD % 2
 )(
 	// Global Control
 	input	logic  ap_clk,
-	input	logic  ap_clk2x,	// only used when PUMPED_COMPUTE
+	input	logic  ap_clk2x,	// synchronous, double-speed clock; only used for PUMPED_COMPUTE
 	input	logic  ap_rst_n,
 
 	// Weight Stream
@@ -126,17 +128,18 @@ module mvu_vvu_axi #(
 			end
 		end
 
-		//- Pumping Constraints ---------
-		if(PUMPED_COMPUTE) begin
-			if(SIMD % 2 != 0) begin
-				$error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD);
-				$finish;
-			end
-		end
+		// //- Pumping Constraints ---------
+		// if(PUMPED_COMPUTE) begin
+		// 	if(SIMD % 2 != 0) begin
+		// 		$error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD);
+		// 		$finish;
+		// 	end
+		// end
 	end
 
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
+	uwire  clk = ap_clk;
+	uwire  clk2x = ap_clk2x;
+	uwire  rst = !ap_rst_n;
 
 	//- Replay to Accommodate Neuron Fold -----------------------------------
 	typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_flatin_t;
@@ -178,29 +181,6 @@ module mvu_vvu_axi #(
 			end
 		end
 	end : genVVUInput
-	
-	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
-	uwire  mvu_a_t  mvu_a = amvau_i;
-
-	//- Flow Control Bracket around Compute Core ----------------------------
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-	//- Instantiate compute core ----------------------------
-	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
-	uwire dsp_vld;
-	uwire dsp_p_t  dsp_p;
-
-	uwire dsp_clk = ap_clk;
-	uwire dsp_en = en;
-	uwire dsp_last = alast && avld;
-	uwire dsp_zero = !istb;
-	uwire mvu_w_t dsp_w = mvu_w;
-	uwire mvu_a_t dsp_a = mvu_a;
-	uwire ovld = dsp_vld;
-	uwire dsp_p_t  odat = dsp_p;
 
 	//- Flow Control Bracket around Compute Core ----------------------------
 	uwire en;
@@ -213,7 +193,8 @@ module mvu_vvu_axi #(
 	uwire  ovld;
 	uwire dsp_p_t  odat;
 	if(1) begin : blkDsp
-		localparam int unsigned  DSP_SIMD = SIMD/(PUMPED_COMPUTE+1);
+		localparam int unsigned  EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; 
+		localparam int unsigned  DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1);
 		typedef logic [PE    -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH    -1:0]  dsp_w_t;
 		typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  dsp_a_t;
 
@@ -243,56 +224,66 @@ module mvu_vvu_axi #(
 		else begin : genPumpedCompute
 			assign	dsp_clk = clk2x;
 
-			// Identify second fast cycle before active slow clock edge
+			// Identify second fast cycle just before active slow clock edge
 			logic  Active = 0;
-			always_ff @(posedge clk2x)  Active <= clk;
+			if(1) begin : blkActive
+				uwire  clk_lut[2];	// Put some LUT delay on the input from the fast clock net
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk));
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0]));
+				always_ff @(posedge clk2x)  Active <= clk_lut[1];
+			end : blkActive
 
 			// The input for a slow cycle is split across two fast cycles along the SIMD dimension.
 			//	- Both fast cycles are controlled by the same enable state.
 			//	- A zero cycle is duplicated across both fast cycles.
 			//	- The last flag must be restricted to the second fast cycle.
-			logic  En = 0;
-			logic  Last[1:0] = '{ default: 1'b0 };
+
+			dsp_w_t  W = 'x;
+			for(genvar  pe = 0; pe < PE; pe++) begin : genPERegW
+
+				uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0]  w;
+				for(genvar  i =    0; i <       SIMD; i++)  assign  w[i] = mvu_w[pe][i];
+				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  w[i] = 0;
+
+				always_ff @(posedge clk2x) begin
+					if(rst)      W[pe] <= 'x;
+					else if(en)  W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD];
+				end
+
+			end : genPERegW
+
+			dsp_a_t  A = 'x;
+			for(genvar  pe = 0; pe < ACT_PE; pe++) begin : genPERegA
+
+				uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  a;
+				for(genvar  i =    0; i <       SIMD; i++)  assign  a[i] = amvau_i[pe][i];
+				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  a[i] = 0;
+
+				always_ff @(posedge clk2x) begin
+					if(rst)      A[pe] <= 'x;
+					else if(en)  A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD];
+				end
+
+			end : genPERegA
+
 			logic  Zero = 1;
-			dsp_w_t  W[1:0] = '{ default: 'x };
-			dsp_a_t  A[1:0] = '{ default: 'x };
+			logic  Last = 0;
 			always_ff @(posedge clk2x) begin
 				if(rst) begin
-					En   <= 0;
-					Last <= '{ default: 1'b0 };
-					Zero <=  1;
-					W <= '{ default: 'x };
-					A <= '{ default: 'x };
+					Zero <= 1;
+					Last <= 0;
 				end
-				else begin
-					if(Active) begin
-						En <= en;
-						if(en) begin
-							Last <= '{ alast && avld, 1'b0 };
-							Zero <= !istb;
-							for(int unsigned  simd = 0; simd < SIMD; simd++) begin
-								for(int unsigned  pe = 0; pe < PE; pe++) begin
-									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd];
-								end
-								for(int unsigned  pe = 0; pe < ACT_PE; pe++) begin
-									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd];
-								end
-							end
-						end
-					end
-					else if(En) begin
-						Last <= '{ 'x, Last[1] };
-						W    <= '{ 'x, W[1] };
-						A    <= '{ 'x, A[1] };
-					end
+				else if(en) begin
+					Zero <= !istb;
+					Last <= alast && avld && Active;
 				end
 			end
-			assign	dsp_en = En;
 
-			assign	dsp_last = Last[0];
+			assign	dsp_en = en;
+			assign	dsp_last = Last;
 			assign	dsp_zero = Zero;
-			assign	dsp_w = W[0];
-			assign	dsp_a = A[0];
+			assign	dsp_w = W;
+			assign	dsp_a = A;
 
 			// Since no two consecutive last cycles will ever be asserted on the input,
 			// valid outputs will also always be spaced by, at least, one other cycle.
@@ -305,7 +296,7 @@ module mvu_vvu_axi #(
 					Vld <= 0;
 					P   <= 'x;
 				end
-				else begin
+				else if(en) begin
 					if(dsp_vld)  P <= dsp_p;
 					Vld <= dsp_vld || (Vld && !Active);
 				end
@@ -389,5 +380,4 @@ module mvu_vvu_axi #(
 	// These extra bits should never be used. Why not 'x them out?
 	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
-
 endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 99178880f7..11949dec24 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -34,6 +34,7 @@
 module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	IS_MVU = $IS_MVU$,
 	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
+	parameter	PUMPED_COMPUTE = $PUMPED_COMPUTE$,
 	parameter	MW = $MW$,
 	parameter	MH = $MH$,
 	parameter	PE = $PE$,
@@ -54,6 +55,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
 	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
 	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
+	input   ap_clk2x,
 	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
 	input	ap_rst_n,
 
@@ -72,11 +76,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 );
 
 mvu_vvu_axi #(
-	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
 	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 	) inst (
 	.ap_clk(ap_clk),
+	.ap_clk2x(ap_clk2x),
 	.ap_rst_n(ap_rst_n),
 	.s_axis_weights_tdata(weights_V_TDATA),
 	.s_axis_weights_tvalid(weights_V_TVALID),
@@ -89,4 +94,4 @@ mvu_vvu_axi #(
 	.m_axis_output_tready(out_V_TREADY)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index 62aa0919f4..8614e9f811 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -35,13 +35,13 @@ module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam bit IS_MVU = 1;
+	localparam bit IS_MVU = 0;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
 	localparam int unsigned MW = 36;
 	localparam int unsigned MH = 4;
-	localparam int unsigned SIMD = 9;
-	localparam int unsigned PE = 2;
-	localparam int unsigned SEGMENTLEN = 1.0;
+	localparam int unsigned SIMD = 36;
+	localparam int unsigned PE = 4;
+	localparam int unsigned SEGMENTLEN = 2.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
@@ -156,6 +156,16 @@ module mvu_axi_tb();
 
 	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
 		automatic output_vector_t res = '{default: 0};
+		// for (int j = 0; j<MH; j++) begin
+		// 	for (int i = 0; i<MW; i++) begin
+		// 		if (SIGNED_ACTIVATIONS)
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]);
+		// 		else
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]);
+		// 	end
+		// end
 		// The input stream will have the channels interleaved for VVU when PE>1
 		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
new file mode 100644
index 0000000000..108980c497
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
@@ -0,0 +1,142 @@
+module mvu_dsp58_tb;
+
+	localparam int unsigned  N = 1000;
+
+	localparam int unsigned  MW = 12;
+	localparam int unsigned  MH = 4;
+	localparam int unsigned  PE = 2;
+	localparam int unsigned  SIMD = 6;
+	localparam int unsigned  ACTIVATION_WIDTH = 8;
+	localparam int unsigned  WEIGHT_WIDTH = 8;
+	localparam int unsigned  ACCU_WIDTH = 24;
+
+	//- Global Control ------------------
+	logic  clk = 1;
+	logic  clk2x = 1;
+	always #5ns clk = !clk;
+	always #2.5ns clk2x = !clk2x;
+
+	logic  rst = 1;
+	initial begin
+		repeat(8) @(posedge clk);
+		rst <= 0;
+	end
+
+	//- DUTs ----------------------------
+
+	// Weight Stream
+	logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  s_axis_weights_tdata;
+	logic  s_axis_weights_tvalid[2];
+	uwire  s_axis_weights_tready[2];
+
+	// Input Stream
+	logic [SIMD-1:0][ACTIVATION_WIDTH-1:0]  s_axis_input_tdata;
+	logic  s_axis_input_tvalid[2];
+	uwire  s_axis_input_tready[2];
+
+	// Output Stream
+	uwire [PE-1:0][ACCU_WIDTH-1:0]  m_axis_output_tdata[2];
+	uwire  m_axis_output_tvalid[2];
+	logic  m_axis_output_tready[2];
+
+	for(genvar  i = 0; i < 2; i++) begin : genDUTs
+		mvu_vvu_axi #(
+			.IS_MVU(1),
+			.COMPUTE_CORE("mvu_vvu_8sx9_dsp58"),
+			.MW(MW), .MH(MH),
+			.PE(PE), .SIMD(SIMD),
+			.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+			.WEIGHT_WIDTH(WEIGHT_WIDTH),
+			.ACCU_WIDTH(ACCU_WIDTH),
+			.PUMPED_COMPUTE(i)
+		) dut (
+			.ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst),
+			.s_axis_weights_tdata,                        .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]),
+			.s_axis_input_tdata,                          .s_axis_input_tvalid  (s_axis_input_tvalid  [i]), .s_axis_input_tready  (s_axis_input_tready  [i]),
+			.m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i])
+		);
+	end : genDUTs
+
+
+	//- Stimuli -------------------------
+
+	// Weight Feed
+	initial begin
+		s_axis_weights_tvalid = '{ default: 0 };
+		s_axis_weights_tdata  = 'x;
+		@(posedge clk iff !rst);
+
+		repeat(N * (MH/PE)*(MW/SIMD)) begin
+			automatic type(s_axis_weights_tdata)  weights;
+			std::randomize(weights);
+			s_axis_weights_tdata <= weights;
+			s_axis_weights_tvalid <= '{ default: 1 };
+			fork
+				begin
+					@(posedge clk iff s_axis_weights_tready[0]);
+					s_axis_weights_tvalid[0] <= 0;
+				end
+				begin
+					@(posedge clk iff s_axis_weights_tready[1]);
+					s_axis_weights_tvalid[1] <= 0;
+				end
+			join
+		end
+	end
+
+	// Input Feed
+	initial begin
+		s_axis_input_tvalid = '{ default: 0 };
+		s_axis_input_tdata  = 'x;
+		@(posedge clk iff !rst);
+
+		repeat(N * (MW/SIMD)) begin
+			automatic type(s_axis_input_tdata)  in;
+			std::randomize(in);
+			s_axis_input_tdata <= in;
+			s_axis_input_tvalid <= '{ default: 1 };
+			fork
+				begin
+					@(posedge clk iff s_axis_input_tready[0]);
+					s_axis_input_tvalid[0] <= 0;
+				end
+				begin
+					@(posedge clk iff s_axis_input_tready[1]);
+					s_axis_input_tvalid[1] <= 0;
+				end
+			join
+		end
+	end
+
+	// Output Capture and Comparison
+	initial begin
+		m_axis_output_tready = '{ default: 0 };
+		@(posedge clk iff !rst);
+
+		repeat(N * (MH/PE)) begin
+			automatic type(m_axis_output_tdata)  res;
+			m_axis_output_tready <= '{ default: 1 };
+			fork
+				begin
+					@(posedge clk iff m_axis_output_tvalid[0]);
+					m_axis_output_tready[0] <= 0;
+					res[0] = m_axis_output_tdata[0];
+				end
+				begin
+					@(posedge clk iff m_axis_output_tvalid[1]);
+					m_axis_output_tready[1] <= 0;
+					res[1] = m_axis_output_tdata[1];
+				end
+			join
+			assert(res[0] == res[1]) else begin
+				$error("Output mismatch: %0x <=> %0x", res[0], res[1]);
+				$stop;
+			end
+			while($urandom()%7 < MW/SIMD) @(posedge clk);	// Occasional backpressure
+		end
+
+		$display("Test completed.");
+		$finish;
+	end
+
+endmodule : mvu_dsp58_tb

From 289749b0c4b0e72fef39d4f7011380571f4b6869 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 14:00:40 +0000
Subject: [PATCH 113/123] minor fix to param

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index 8614e9f811..08e8679214 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -35,7 +35,7 @@ module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam bit IS_MVU = 0;
+	localparam bit IS_MVU = 1;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
 	localparam int unsigned MW = 36;
 	localparam int unsigned MH = 4;

From b51837b3a366a85f15931d7bc8a1aef0dc82494b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 7 Feb 2024 09:38:59 +0000
Subject: [PATCH 114/123] added RTL-based MVAU and VVAU custom-ops

---
 src/finn/custom_op/fpgadataflow/rtl/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
index 914c033584..28e08aa445 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
@@ -34,6 +34,8 @@
     StreamingDataWidthConverter_rtl,
 )
 from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl
+from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MatrixVectorActivation_rtl
+from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VectorVectorActivation_rtl
 
 custom_op = dict()
 
@@ -43,3 +45,5 @@
 custom_op["FMPadding_rtl"] = FMPadding_rtl
 custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl
 custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl
+custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl
+custom_op["VectorVectorActivation_rtl"] = VectorVectorActivation_rtl

From 86465e0316e567b805b209e8eece1d8c87d9158d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 7 Feb 2024 09:40:11 +0000
Subject: [PATCH 115/123] [builder]: renamed specialize_to_rtl step to
 specialize_layers step, default standalone_thresholds set to False

---
 src/finn/builder/build_dataflow_config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 073bc9e12b..85b7d61ce5 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -116,6 +116,7 @@ class VerificationStepType(str, Enum):
     "step_tidy_up",
     "step_streamline",
     "step_convert_to_hls",
+    "step_specialize_layers",
     "step_create_dataflow_partition",
     "step_target_fps_parallelization",
     "step_apply_folding_config",
@@ -139,6 +140,7 @@ class VerificationStepType(str, Enum):
     "step_tidy_up",
     "step_streamline",
     "step_convert_to_hls",
+    "step_specialize_layers",
     "step_create_dataflow_partition",
     "step_target_fps_parallelization",
     "step_apply_folding_config",
@@ -234,7 +236,7 @@ class DataflowBuildConfig:
     #: activations in FINN) will be implemented as stand-alone HLS layers,
     #: instead of being part of MatrixVectorActivation layer. This gives larger
     #: flexibility, and makes it possible to have runtime-writable thresholds.
-    standalone_thresholds: Optional[bool] = True
+    standalone_thresholds: Optional[bool] = False
 
     #: (Optional) Whether optimizations that minimize the bit width of the
     #: weights and accumulator will be applied. Because this optimization relies

From c8b793c081ea0f72d491c48cef8681222f91b6f0 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 7 Feb 2024 09:40:45 +0000
Subject: [PATCH 116/123] [builder]: added first version of specialize_layers
 step

---
 src/finn/builder/build_dataflow_steps.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 2629efef11..b74dc7adc5 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -53,7 +53,7 @@
 from shutil import copy
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -473,12 +473,9 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
     return model
 
 
-def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
-    """Convert layers implemented in HLS to an equivalent specialized RTL
-    implementation if possible."""
-    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
-    for trn in specialize_to_rtl_transforms:
-        model = model.transform(trn)
+def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Convert HW custom-ops into custom-ops suitable for FPGA implementation with either an HLS or an RTL backend."""
+    model = model.transform(SpecializeLayers())
     return model
 
 
@@ -844,7 +841,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_apply_folding_config": step_apply_folding_config,
     "step_minimize_bit_width": step_minimize_bit_width,
     "step_generate_estimate_reports": step_generate_estimate_reports,
-    "step_specialize_to_rtl": step_specialize_to_rtl,
+    "step_specialize_layers": step_specialize_layers,
     "step_hls_codegen": step_hls_codegen,
     "step_hls_ipgen": step_hls_ipgen,
     "step_set_fifo_depths": step_set_fifo_depths,

From 79ff91137c9b753807803d76b60aea6f169ca59f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 14:41:26 +0000
Subject: [PATCH 117/123] pulled latest changes

---
 .../fpgadataflow/create_stitched_ip.py        |  37 ++-
 .../fpgadataflow/set_folding.py               |   4 +-
 .../test_fpgadataflow_mvau_rtl.py             | 123 +++++----
 .../test_fpgadataflow_vvau_rtl.py             | 234 ++++++++++++++++++
 4 files changed, 339 insertions(+), 59 deletions(-)
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py

diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 1c316e1285..f797e3d841 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -48,13 +48,12 @@ def is_external_input(model, node, i):
     # True only if input is unconnected and has no initializer
     # Only esception is second input of FC layers when mem_mode is external
     node_inst = getCustomOp(node)
-    op_type = node.op_type
     producer = model.find_producer(node.input[i])
     if producer is None:
         if model.get_initializer(node.input[i]) is None:
             return True
         else:
-            if op_type.startswith("MatrixVectorActivation"):
+            if node.op_type == "MatrixVectorActivation":
                 if node_inst.get_nodeattr("mem_mode") == "external":
                     return True
     return False
@@ -103,6 +102,7 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu
         # keep track of top-level interface names
         self.intf_names = {
             "clk": [],
+            "clk2x": [],
             "rst": [],
             "s_axis": [],
             "m_axis": [],
@@ -110,10 +110,19 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu
             "axilite": [],
         }
 
+    def _is_double_pumped(self, node):
+        try:
+            pumped_compute = getCustomOp(node).get_nodeattr("pumpedCompute")
+            return pumped_compute==1
+        except:
+            return False
+
     def connect_clk_rst(self, node):
         inst_name = node.name
         node_inst = getCustomOp(node)
         clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
+        if self._is_double_pumped(node):
+            clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0]
         reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
         # make clock and reset external, if they aren't already
         if not self.clock_reset_are_external:
@@ -128,6 +137,22 @@ def connect_clk_rst(self, node):
             self.clock_reset_are_external = True
             self.intf_names["clk"] = ["ap_clk"]
             self.intf_names["rst"] = ["ap_rst_n"]
+        # make clk2x external if it isn't already, then connect clk and reset
+        elif self._is_double_pumped(node) and not self.clock2x_is_external:
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name)
+            )
+            self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]")
+            self.clock2x_is_external = True
+            self.intf_names["clk2x"] = ["ap_clk2x"]
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
+            )
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
         # otherwise connect clock and reset
         else:
             self.connect_cmds.append(
@@ -138,6 +163,11 @@ def connect_clk_rst(self, node):
                 "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
                 % (inst_name, clock_intf_name)
             )
+            if self._is_double_pumped(node):
+                self.connect_cmds.append(
+                    "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]"
+                    % (inst_name, clock2x_intf_name)
+                )
 
     def connect_axi(self, node):
         inst_name = node.name
@@ -285,7 +315,7 @@ def apply(self, model):
         ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream")
         if self.signature:
             ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info")
-        if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA_hls"]:
+        if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]:
             warnings.warn(
                 """First node is not StreamingFIFO or IODMA.
                 You may experience incorrect stitched-IP rtlsim or hardware
@@ -377,6 +407,7 @@ def apply(self, model):
         fclk_hz = fclk_mhz * 1000000
         model.set_metadata_prop("clk_ns", str(self.clk_ns))
         tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz))
+        tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz))
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
         # create wrapper hdl (for rtlsim later on)
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 5555237ca3..871919f3f2 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -118,7 +118,7 @@ def apply(self, model):
         simd_ops = [
             "DownSampler",
             "FMPadding_Batch",
-            "FMPadding_Pixel",
+            "FMPadding_Batch_rtl",
             "ConvolutionInputGenerator",
             "ConvolutionInputGenerator1D",
             "ConvolutionInputGenerator_rtl",
@@ -131,7 +131,7 @@ def apply(self, model):
                 continue
             op_type = node.op_type
             node_inst = getCustomOp(node)
-            if op_type.startswith("MatrixVectorActivation"):
+            if op_type == "MatrixVectorActivation":
                 max_simd = node_inst.get_nodeattr("MW")
                 max_pe = node_inst.get_nodeattr("MH")
                 node_inst.set_nodeattr("PE", 1)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 1e9de44fb2..45b33b24e8 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -27,6 +27,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+import os
+import pickle
 
 import numpy as np
 import os
@@ -42,13 +44,7 @@
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
-
-
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from qonnx.custom_op.registry import getCustomOp
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
@@ -70,52 +66,58 @@ def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W):
 
     return model
 
-
 def prepare_inputs(input_tensor):
-    return {"global_in": input_tensor}
-
-
-# @pytest.mark.parametrize("mh", [36])
-# @pytest.mark.parametrize("mw", [256])
-@pytest.mark.parametrize("mh", [9])
-@pytest.mark.parametrize("mw", [36])
-# @pytest.mark.parametrize("pe", [1, 4, 9, 36])
-# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256])
-@pytest.mark.parametrize("pe", [1, 3, 9])
-@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36])
-@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
-@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]])
-# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"])
+    return {"ifm": input_tensor}
+
+@pytest.mark.parametrize("mh", [4])
+# @pytest.mark.parametrize("mw", [36])
+@pytest.mark.parametrize("mw", [18])
+# @pytest.mark.parametrize("pe", [1,2,4,8])
+@pytest.mark.parametrize("pe", [2])
+# @pytest.mark.parametrize("simd", [1,3,6,9,18,36])
+@pytest.mark.parametrize("simd", [6])
+#@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("idt", [DataType["UINT8"]])
+#@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
+@pytest.mark.parametrize("wdt", [DataType["INT8"]])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
 @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
-@pytest.mark.parametrize("clk_ns", [1.66, 4])
+@pytest.mark.parametrize("segmentlen", [1])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_mvau_rtl(
-    mh, mw, pe, simd, idt, wdt, part, clk_ns
-):
-    if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66:
-        pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test")
-
+def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+    # Synthesis constants
+    clk_ns = 5
     # Create test input vector (produced by SWG)
     ofm_shape = (5, 5)
     ofm_h, ofm_w = ofm_shape
     ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
     ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
-    model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
+    # np.save("weights.npy", W)
+    ##
+    # W = np.load("weights.npy")
+    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
 
     model.save(build_dir + "/matmul.onnx")
 
     # Create MatMul & obtain golden reference output
-    A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    # np.save("activations.npy", A)
+    ##
+    # A = np.load("activations.npy")
     input_dict = prepare_inputs(A)
 
     # Execute ONNX model
     output_matmul = oxe.execute_onnx(model, input_dict)["global_out"]
 
+    with open(build_dir + "/onnx_output.pkl", "wb") as f:
+        pickle.dump(output_matmul, f)
+
     with open(build_dir + "/onnx_output.pkl", "wb") as f:
         pickle.dump(output_matmul, f)
 
@@ -127,26 +129,41 @@ def test_fpgadataflow_mvau_rtl(
     folding_config = {
         "Defaults": {},
         "MatrixVectorActivation_0": {
-            "PE": pe,
-            "SIMD": simd,
-            "mem_mode": "decoupled",
-            "ram_style": "auto",
-            "resType": "dsp",
+            "PE" : pe,
+            "SIMD" : simd,
+            "mem_mode" : "decoupled",
+            "ram_style" : "auto",
+            "resType" : "dsp",
             "preferred_backend" : "rtl"
-        },
+        }
     }
     model = model.transform(ApplyConfig(folding_config))
-    model.save(build_dir + "/mvau_hls.onnx")
+    model.save(build_dir+"/mvau_hls.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_hls.vcd")
+    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+    
 
     # Apply convert-to-rtl step
     model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
     model = model.transform(GiveUniqueNodeNames())
-    model.save(build_dir + "/mvau_rtl.onnx")
+    for n in model.graph.node:
+        if n.op_type=="MatrixVectorActivation_rtl":
+            getCustomOp(n).set_nodeattr("pumpedCompute", 0)
+    model.save(build_dir+"/mvau_rtl.onnx")
 
     # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated
     for n in model.graph.node:
-        getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd")
-    
+        getCustomOp(n).set_nodeattr("rtlsim_so", "")
+        getCustomOp(n).set_nodeattr("code_gen_dir_ipgen", "")
+        getCustomOp(n).set_nodeattr("ipgen_path", "")
+        getCustomOp(n).set_nodeattr("ip_path", "")
+        getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_rtl.vcd")
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(PrepareIP(part, clk_ns))
     model = model.transform(HLSSynthIP())
@@ -156,19 +173,17 @@ def test_fpgadataflow_mvau_rtl(
     with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f:
         pickle.dump(output_mvau_rtl, f)
 
-    model.save(build_dir + "/mvau_rtl_sim.onnx")
-    assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!"
+    with open(build_dir + "/hls_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_hls, f)
 
-    model = model.transform(InsertAndSetFIFODepths(part, clk_ns))
-    model = model.transform(PrepareIP(part, clk_ns))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(CreateStitchedIP(part, clk_ns))
+    with open(build_dir + "/rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
 
-    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
-    model.set_metadata_prop("rtlsim_so", "")
-    model.set_metadata_prop("exec_mode", "rtlsim")
-    model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd")
-    model.save(build_dir + "/stitched_ip.onnx")
-    output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"]
+    # model = model.transform(PrepareIP(part, clk_ns))
+    # model = model.transform(HLSSynthIP())
+    # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True))
+    # model.save(build_dir+"/stitched_ip.onnx")
 
-    assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
\ No newline at end of file
+    #assert (output_mvau_hls == output_mvau_rtl).all()
+    assert (output_matmul['ofm'] == output_mvau_rtl).all()
+    # assert (output_mvau_hls.size > 0)
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
new file mode 100644
index 0000000000..25fad308ee
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
@@ -0,0 +1,234 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import os
+import pickle
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import (
+    ApplyConfig,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+
+# import qonnx.core.data_layout as DataLayout
+
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+
+def make_single_dw_conv_modelwrapper(conv_config, idt, wdt):
+    kernel_size, in_feature_dim, in_chn = conv_config
+    stride = 1
+    pad = 0
+
+    out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)
+    group = out_chn = in_chn
+
+    conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = group
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    conv_config["pads"] = [pad, pad, pad, pad]
+    conv_config["strides"] = [stride, stride]
+
+    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape)
+    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape)
+    weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = qonnx_make_model(
+        helper.make_graph(
+            name="conv_test",
+            inputs=[ifm],
+            outputs=[ofm],
+            value_info=weights,
+            nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("ifm", idt)
+    model.set_tensor_datatype("weights", wdt)
+    model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape))
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"global_in": input_tensor}
+
+
+@pytest.mark.parametrize("kernel_size", [3])
+@pytest.mark.parametrize("in_feature_dim", [5])
+@pytest.mark.parametrize("in_chn", [4])
+@pytest.mark.parametrize("idt", [DataType["INT8"]])
+# @pytest.mark.parametrize("idt", [DataType["UINT8"]])
+@pytest.mark.parametrize("wdt", [DataType["INT6"]])
+@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.parametrize("pe", [1, 2, 4])
+@pytest.mark.parametrize("simd", [1, 3, 9])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_vvau_rtl(
+    kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd
+):
+    # Create depthwise-separable convolution
+    conv_config = (kernel_size, in_feature_dim, in_chn)
+    model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt)
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir + "/dw_conv.onnx")
+
+    # Obtain golden reference output
+    golden_in = gen_finn_dt_tensor(
+        model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")
+    )
+    input_dict = prepare_inputs(golden_in)
+    golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/onnx_dws_conv.pkl", "wb") as f:
+        pickle.dump(golden_out, f)
+
+    # Convert to HLS custom-op first
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
+    model = model.transform(to_hls.InferVectorVectorActivation())
+    model = model.transform(MinimizeAccumulatorWidth())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir + "/hls_vvau.onnx")
+
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "ConvolutionInputGenerator_rtl_0": {"SIMD": 4, "parallel_window": 1},
+        "VectorVectorActivation_0": {
+            "PE": pe,
+            "SIMD": simd,
+            "mem_mode": "decoupled",
+            "ram_style": "auto",
+            "resType": "dsp",
+            "preferred_backend": "rtl",
+        },
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir + "/hls_vvau_folded.onnx")
+
+    # Obtain second reference from HLS-based VVAU layer
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/hls_vvau_folded_output.pkl", "wb") as f:
+        pickle.dump(conv_hls_out, f)
+
+    # Stitched-IP RTLsim
+    model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir))
+    model.save(build_dir + "/ip-stitched.onnx")
+    partition_model_path = getCustomOp(
+        model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    ).get_nodeattr("model")
+    partitioned_model = ModelWrapper(partition_model_path)
+    # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism
+    partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5))
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model.save(build_dir + "/partitioned_model.onnx")
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    partitioned_model.save(partition_model_path)
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/hls-vvu.vcd")
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    # transpose input since we're now simulating HW layers (NCHW --> NHWC)
+    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1))
+    stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/stitched_ip_output.pkl", "wb") as f:
+        pickle.dump(stitched_ip_out, f)
+
+    # Apply convert-to-rtl step
+    partitioned_model = partitioned_model.transform(to_rtl.InferRTLVectorVectorActivation())
+    partitioned_model = partitioned_model.transform(GiveUniqueNodeNames())
+    partitioned_model = partitioned_model.transform(GiveReadableTensorNames())
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    partitioned_model.save(build_dir + "/partition_rtl_vvau.onnx")
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/rtl-vvu.vcd")
+    # Reset rtlsim_so path to re-generate Pyverilator sim object
+    partitioned_model.set_metadata_prop("rtlsim_so", "")
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/rtl_vvau_output.pkl", "wb") as f:
+        pickle.dump(vvu_rtl_out, f)
+
+    golden_ret = golden_out["global_out"]
+    # transpose hardware-generated outputs NHWC -> NCHW to be comparable
+    vvu_rtl_ret = vvu_rtl_out["global_out"].transpose(0, 3, 1, 2)
+    hls_ret = stitched_ip_out["global_out"].transpose(0, 3, 1, 2)
+
+    assert (
+        vvu_rtl_ret == golden_ret
+    ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
+    assert (
+        vvu_rtl_ret == hls_ret
+    ).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!"

From 3f9e85ce68edc0835a06850d34ed7eca0a01c53c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 16:30:15 +0000
Subject: [PATCH 118/123] fixed broken merge

---
 src/finn/custom_op/fpgadataflow/__init__.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 70ea858b66..1f2c2740bb 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -41,12 +41,7 @@
 from finn.custom_op.fpgadataflow.labelselect import LabelSelect
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
-<<<<<<< HEAD
 from finn.custom_op.fpgadataflow.pool import Pool
-=======
-from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl
-from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
->>>>>>> origin/feature/mvu_vvu_dsp_pumping
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
 )
@@ -65,21 +60,10 @@
 # make sure new HLSCustomOp subclasses are imported here so that they get
 # registered and plug in correctly into the infrastructure
 custom_op["MatrixVectorActivation"] = MatrixVectorActivation
-<<<<<<< HEAD
-=======
-custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl
-custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
-custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
-custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
-custom_op["TLastMarker"] = TLastMarker
-custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
-custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl
->>>>>>> origin/feature/mvu_vvu_dsp_pumping
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["Thresholding"] = Thresholding
 custom_op["VectorVectorActivation"] = VectorVectorActivation
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
-
 custom_op["AddStreams"] = AddStreams
 custom_op["ChannelwiseOp"] = ChannelwiseOp
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator

From 1245763291cb536f31bb91b187cba1ea99014ad9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 16:31:20 +0000
Subject: [PATCH 119/123] [mvau hls]: added lut/dsp estimation functions,
 function for stitching the ip and small fix to node execution

---
 .../hls/matrixvectoractivation_hls.py         | 94 ++++++++++++++++++-
 1 file changed, 92 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
index 5206ee3a06..d6d122e41b 100644
--- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -33,6 +33,7 @@
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from pyverilator.util.axi_utils import toggle_clk, reset_rtlsim
 
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -54,6 +55,84 @@ def get_nodeattr_types(self):
         my_attrs.update(HLSBackend.get_nodeattr_types(self))
         return my_attrs
 
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+        """
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
+        MW = self.get_nodeattr("MW")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        # determine tdt with input and weight data types
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        # parameters from experiments in paper mentioned above
+        c0 = 300
+        c1 = 1.1
+        c2 = 0
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # adder tree
+        addertree_luts = (W + A) * (2 * Q - 1)
+        # accumulator
+        acc_datatype = self.get_accumulator_datatype()
+        # if accDataType is not set, then it will default to INT32, which would
+        # be a large overestimate in most (if not all) cases. In this scenario,
+        # we would use the minimum accumulator as determined by the data types
+        # bound, derived in https://arxiv.org/abs/2301.13376
+        alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed())
+        acc_bits = min(
+            acc_datatype.bitwidth(),
+            np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
+        )
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        tmem_style = self.get_nodeattr("ram_style_thresholds")
+        if (noact == 0) and (tmem_style == "distributed"):
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2**B - 1) * acc_bits
+
+        return int(
+            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
+        )
+
+    def dsp_estimation(self):
+        # multiplication
+        P = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        if res_type == "dsp":
+            mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+        else:
+            mult_dsp = 0
+        return int(mult_dsp)
+
     def get_template_param_values(self):
         """Returns the template parameter values according to input, output and weight
         data types."""
@@ -468,8 +547,8 @@ def execute_node(self, context, graph):
             sim = self.get_rtlsim()
             nbits = self.get_instream_width()
             inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
-            self.reset_rtlsim(sim)
-            self.toggle_clk(sim)
+            reset_rtlsim(sim)
+            toggle_clk(sim)
             if mem_mode in ["external", "decoupled"]:
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
@@ -501,3 +580,14 @@ def execute_node(self, context, graph):
                     mode
                 )
             )
+
+    def code_generation_ipi(self, cmd):
+        # instantiate the HLS IP
+        vlnv = self.get_nodeattr("ip_vlnv")
+        if self.get_nodeattr("mem_mode") == "decoupled":
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (vlnv, node_name, node_name)
+            )
+        else:
+            cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name))
\ No newline at end of file

From 9a4dd046b51a26c46fd9c25dcd064d7ae7c81826 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 16:32:06 +0000
Subject: [PATCH 120/123] [hwcustom op]: removed do_reset flag

---
 src/finn/custom_op/fpgadataflow/hwcustomop.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py
index 773938525b..f62cf1af8a 100644
--- a/src/finn/custom_op/fpgadataflow/hwcustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py
@@ -307,7 +307,6 @@ def rtlsim_multi_io(self, sim, io_dict):
             trace_file=trace_file,
             sname=sname,
             liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
-            do_reset=True,
         )
         self.set_nodeattr("cycles_rtlsim", total_cycle_count)
 

From 30f6ddf91ebcbee60bb0fbbf972e2597ce1229ec Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 8 Feb 2024 16:32:45 +0000
Subject: [PATCH 121/123] [mvau hw-op]: moved lut/dsp estimations to
 specialized ops, modified stitched-ip method

---
 .../fpgadataflow/matrixvectoractivation.py    | 89 +------------------
 1 file changed, 4 insertions(+), 85 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 145cf4f6e6..74aee63dc1 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -438,84 +438,6 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-    def lut_estimation(self):
-        """Calculates resource estimations for LUTs based on:
-        - FINN-R: An End-to-End Deep-Learning Framework for Fast
-        Exploration of Quantized Neural Networks
-        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
-        Y. Umuroglu, M. Leeser and K. Vissers
-        - 12. Sep 2018
-        """
-        # TODO add in/out FIFO contributions
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        MW = self.get_nodeattr("MW")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        # determine tdt with input and weight data types
-        idt = self.get_input_datatype()
-        A = idt.bitwidth()
-        # parameters from experiments in paper mentioned above
-        c0 = 300
-        c1 = 1.1
-        c2 = 0
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (mmode == "decoupled" and mstyle == "distributed") or (
-            mmode == "const" and self.calc_wmem() <= 128
-        ):
-            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
-
-        # multiplication
-        res_type = self.get_nodeattr("resType")
-        if res_type == "dsp":
-            mult_luts = 0
-        else:
-            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
-        # adder tree
-        addertree_luts = (W + A) * (2 * Q - 1)
-        # accumulator
-        acc_datatype = self.get_accumulator_datatype()
-        # if accDataType is not set, then it will default to INT32, which would
-        # be a large overestimate in most (if not all) cases. In this scenario,
-        # we would use the minimum accumulator as determined by the data types
-        # bound, derived in https://arxiv.org/abs/2301.13376
-        alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed())
-        acc_bits = min(
-            acc_datatype.bitwidth(),
-            np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
-        )
-        acc_luts = acc_bits
-        # thresholds and threshold comparators
-        thr_luts = 0
-        comp_luts = 0
-        noact = self.get_nodeattr("noActivation")
-        tmem_style = self.get_nodeattr("ram_style_thresholds")
-        if (noact == 0) and (tmem_style == "distributed"):
-            odt = self.get_output_datatype()
-            B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
-            comp_luts = (2**B - 1) * acc_bits
-
-        return int(
-            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
-        )
-
-    def dsp_estimation(self):
-        # multiplication
-        P = self.get_nodeattr("PE")
-        res_type = self.get_nodeattr("resType")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        idt = self.get_input_datatype()
-        A = idt.bitwidth()
-        if res_type == "dsp":
-            mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
-        else:
-            mult_dsp = 0
-        return int(mult_dsp)
-
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -955,12 +877,9 @@ def code_generation_ipi(self):
                 "create_bd_intf_pin -mode Slave "
                 "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # instantiate the hls ip
-            cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s"
-                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-            )
-
+            # Instantiate either the HLS or RTL IP depending on operator
+            self.code_generation_ipi(cmd)
+            
             # instantiate a streamer and connect it to the HLS IP
             strm_vlnv = "amd.com:finn:memstream:1.0"
             strm_inst = node_name + "_wstrm"
@@ -1031,7 +950,7 @@ def code_generation_ipi(self):
             cmd.append("save_bd_design")
         elif mem_mode == "const" or mem_mode == "external":
             # base class impl sufficient for const/external modes
-            return super().code_generation_ipi()
+            self.code_generation_ipi(cmd)
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd

From 12ad48c6fdf1166068719ac98ae4469ce71d49a5 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 13 Feb 2024 11:21:32 +0000
Subject: [PATCH 122/123] [hls mvau]: fixed cppsim bipolar activations, added
 call to util functions from pyverilator and method to wire in IP

---
 .../hls/matrixvectoractivation_hls.py         | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
index d6d122e41b..aa3631a240 100644
--- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -495,6 +495,7 @@ def execute_node(self, context, graph):
         mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
+        # TODO ensure codegen dir exists
         if mode == "cppsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
         elif mode == "rtlsim":
@@ -512,6 +513,7 @@ def execute_node(self, context, graph):
         for inputs in node.input:
             # it is assumed that the first input of the node is the data input
             # the second input are the weights
+            # the third input are the thresholds
             if in_ind == 0:
                 assert (
                     str(context[inputs].dtype) == "float32"
@@ -519,7 +521,12 @@ def execute_node(self, context, graph):
                 not float32 as expected."""
                 expected_inp_shape = self.get_folded_input_shape()
                 reshaped_input = context[inputs].reshape(expected_inp_shape)
-                export_idt = self.get_input_datatype()
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
                 # make copy before saving the array
                 reshaped_input = reshaped_input.copy()
                 np.save(
@@ -549,9 +556,13 @@ def execute_node(self, context, graph):
             inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
             reset_rtlsim(sim)
             toggle_clk(sim)
-            if mem_mode in ["external", "decoupled"]:
+            if mem_mode == "external" or mem_mode == "decoupled":
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
                 wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
                 num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
                 io_dict = {
@@ -568,6 +579,7 @@ def execute_node(self, context, graph):
             out_npy_path = "{}/output.npy".format(code_gen_dir)
             out_shape = self.get_folded_output_shape()
             rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
             # load and reshape output
             output = np.load(out_npy_path)
             oshape = self.get_normal_output_shape()
@@ -576,14 +588,15 @@ def execute_node(self, context, graph):
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to "rtlsim" """.format(
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
                     mode
                 )
             )
 
-    def code_generation_ipi(self, cmd):
+    def instantiate_ip(self, cmd):
         # instantiate the HLS IP
         vlnv = self.get_nodeattr("ip_vlnv")
+        node_name = self.onnx_node.name
         if self.get_nodeattr("mem_mode") == "decoupled":
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"

From 3202dc1331aea3cd1e71663d5087fe2d479b5ac3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 13 Feb 2024 11:22:49 +0000
Subject: [PATCH 123/123] [hw mvau]: fixed bug for executing 2D arrays,
 modified create-stitched-ip method and reverted default resType to LUT

---
 .../fpgadataflow/matrixvectoractivation.py       | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 74aee63dc1..28c0c24c09 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -63,7 +63,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
+            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
@@ -152,11 +152,13 @@ def execute_node(self, context, graph):
             odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR"
             out_scale = 2 if odt_is_bipolar else 1
             out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
-            # NHWC to NCHW for multithreshold node
-            result = result.transpose((0, 3, 1, 2))
+            if result.ndim == 4:
+                # NHWC to NCHW for multithreshold node
+                result = result.transpose((0, 3, 1, 2))
             result = multithreshold(result, mvau_thr, out_scale, out_bias)
-            # NCHW to NHWC
-            result = result.transpose((0, 2, 3, 1))
+            if result.ndim == 4:
+                # NCHW to NHWC
+                result = result.transpose((0, 2, 3, 1))
 
         context[node.output[0]] = result
 
@@ -878,7 +880,7 @@ def code_generation_ipi(self):
                 "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
             # Instantiate either the HLS or RTL IP depending on operator
-            self.code_generation_ipi(cmd)
+            self.instantiate_ip(cmd)
             
             # instantiate a streamer and connect it to the HLS IP
             strm_vlnv = "amd.com:finn:memstream:1.0"
@@ -950,7 +952,7 @@ def code_generation_ipi(self):
             cmd.append("save_bd_design")
         elif mem_mode == "const" or mem_mode == "external":
             # base class impl sufficient for const/external modes
-            self.code_generation_ipi(cmd)
+            self.instantiate_ip(cmd)
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd